xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_sol_dev.c (revision be672c8e21cc446e1091014ae0ad206f6b8c1d55)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2015 Pluribus Networks Inc.
15  * Copyright 2019 Joyent, Inc.
16  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17  * Copyright 2022 Oxide Computer Company
18  */
19 
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35 
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40 
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49 
50 #include <vm/seg_dev.h>
51 
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61 
/*
 * Locking details:
 *
 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
 * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
 * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
 */
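
/*
 * For example, vmm_drv_hold() below acquires both locks in the required
 * order (an illustrative sketch of the rule above, not a new interface):
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 *	...
 *	mutex_exit(&vmm_mtx);
 */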
71 
72 static kmutex_t		vmmdev_mtx;
73 static dev_info_t	*vmmdev_dip;
74 static hma_reg_t	*vmmdev_hma_reg;
75 static uint_t		vmmdev_hma_ref;
76 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
77 
78 static kmutex_t		vmm_mtx;
79 static list_t		vmm_list;
80 static list_t		vmm_destroy_list;
81 static id_space_t	*vmm_minors;
82 static void		*vmm_statep;
83 
84 /* temporary safety switch */
85 int		vmm_allow_state_writes;
86 
87 static const char *vmmdev_hvm_name = "bhyve";
88 
89 /* For sdev plugin (/dev) */
90 #define	VMM_SDEV_ROOT "/dev/vmm"
91 
92 /* From uts/intel/io/vmm/intel/vmx.c */
93 extern int vmx_x86_supported(const char **);
94 
95 /* Holds and hooks from drivers external to vmm */
96 struct vmm_hold {
97 	list_node_t	vmh_node;
98 	vmm_softc_t	*vmh_sc;
99 	boolean_t	vmh_release_req;
100 	uint_t		vmh_ioport_hook_cnt;
101 };
102 
103 struct vmm_lease {
104 	list_node_t		vml_node;
105 	struct vm		*vml_vm;
106 	vm_client_t		*vml_vmclient;
107 	boolean_t		vml_expired;
108 	boolean_t		vml_break_deferred;
109 	boolean_t		(*vml_expire_func)(void *);
110 	void			*vml_expire_arg;
111 	struct vmm_hold		*vml_hold;
112 };
113 
/* Options for vmm_destroy_locked */
typedef enum vmm_destroy_opts {
	VDO_DEFAULT		= 0,
	/*
	 * Request that the zone-specific data associated with this VM not be
	 * cleaned up as part of the destroy.  Skipping ZSD clean-up is
	 * necessary when the VM is being destroyed as part of zone
	 * destruction, since said ZSD is already being cleaned up.
	 */
	VDO_NO_CLEAN_ZSD	= (1 << 0),
	/*
	 * Skip any attempt to wait for vmm_drv consumers when attempting to
	 * purge them from the instance.  When performing an auto-destruct, it
	 * is not desirable to wait, since said consumer might exist in a
	 * "higher" file descriptor which has not yet been closed.
	 */
	VDO_NO_PURGE_WAIT	= (1 << 1),
} vmm_destroy_opts_t;
132 
133 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, boolean_t *);
134 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
135 static void vmm_lease_block(vmm_softc_t *);
136 static void vmm_lease_unblock(vmm_softc_t *);
137 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
138 static void vmm_kstat_init(vmm_softc_t *);
139 static void vmm_kstat_fini(vmm_softc_t *);
140 
141 /*
142  * The 'devmem' hack:
143  *
144  * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
145  * in the vm which appear with their own name related to the vm under /dev.
146  * Since this would be a hassle from an sdev perspective and would require a
147  * new cdev interface (or complicate the existing one), we choose to implement
148  * this in a different manner.  Direct access to the underlying vm memory
149  * segments is exposed by placing them in a range of offsets beyond the normal
150  * guest memory space.  Userspace can query the appropriate offset to mmap()
151  * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
152  */
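
/*
 * As an illustrative userspace sketch (not code from this file; 'vmfd' and
 * 'seg_len' are assumed to be the open instance fd and the segment length),
 * a consumer would do roughly:
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		mem = mmap(NULL, seg_len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		    vmfd, vdo.offset);
 *	}
 */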
153 
154 static vmm_devmem_entry_t *
155 vmmdev_devmem_find(vmm_softc_t *sc, int segid)
156 {
157 	vmm_devmem_entry_t *ent = NULL;
158 	list_t *dl = &sc->vmm_devmem_list;
159 
160 	for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
161 		if (ent->vde_segid == segid) {
162 			return (ent);
163 		}
164 	}
165 	return (NULL);
166 }
167 
168 static int
169 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
170 {
171 	int error;
172 	bool sysmem;
173 
174 	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
175 	    NULL);
176 	if (error || mseg->len == 0)
177 		return (error);
178 
179 	if (!sysmem) {
180 		vmm_devmem_entry_t *de;
181 
182 		de = vmmdev_devmem_find(sc, mseg->segid);
183 		if (de != NULL) {
184 			(void) strlcpy(mseg->name, de->vde_name,
185 			    sizeof (mseg->name));
186 		}
187 	} else {
188 		bzero(mseg->name, sizeof (mseg->name));
189 	}
190 
191 	return (error);
192 }
193 
194 static int
195 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
196 {
197 	off_t map_offset;
198 	vmm_devmem_entry_t *entry;
199 
200 	if (list_is_empty(&sc->vmm_devmem_list)) {
201 		map_offset = VM_DEVMEM_START;
202 	} else {
203 		entry = list_tail(&sc->vmm_devmem_list);
204 		map_offset = entry->vde_off + entry->vde_len;
205 		if (map_offset < entry->vde_off) {
206 			/* Do not tolerate overflow */
207 			return (ERANGE);
208 		}
209 		/*
210 		 * XXXJOY: We could choose to search the list for duplicate
211 		 * names and toss an error.  Since we're using the offset
212 		 * method for now, it does not make much of a difference.
213 		 */
214 	}
215 
216 	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
217 	entry->vde_segid = mseg->segid;
218 	entry->vde_len = mseg->len;
219 	entry->vde_off = map_offset;
220 	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
221 	list_insert_tail(&sc->vmm_devmem_list, entry);
222 
223 	return (0);
224 }
225 
226 static boolean_t
227 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
228     off_t *map_offp)
229 {
230 	list_t *dl = &sc->vmm_devmem_list;
231 	vmm_devmem_entry_t *de = NULL;
232 	const off_t map_end = off + len;
233 
234 	VERIFY(off >= VM_DEVMEM_START);
235 
236 	if (map_end < off) {
237 		/* No match on overflow */
238 		return (B_FALSE);
239 	}
240 
241 	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
242 		const off_t item_end = de->vde_off + de->vde_len;
243 
244 		if (de->vde_off <= off && item_end >= map_end) {
245 			*segidp = de->vde_segid;
246 			*map_offp = off - de->vde_off;
247 			return (B_TRUE);
248 		}
249 	}
250 	return (B_FALSE);
251 }
252 
253 static void
254 vmmdev_devmem_purge(vmm_softc_t *sc)
255 {
256 	vmm_devmem_entry_t *entry;
257 
258 	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
259 		kmem_free(entry, sizeof (*entry));
260 	}
261 }
262 
263 static int
264 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
265 {
266 	int error;
267 	bool sysmem = true;
268 
269 	if (VM_MEMSEG_NAME(mseg)) {
270 		sysmem = false;
271 	}
272 	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
273 
274 	if (error == 0) {
275 		/*
276 		 * Rather than create a whole fresh device from which userspace
277 		 * can mmap this segment, instead make it available at an
278 		 * offset above where the main guest memory resides.
279 		 */
280 		error = vmmdev_devmem_create(sc, mseg, mseg->name);
281 		if (error != 0) {
282 			vm_free_memseg(sc->vmm_vm, mseg->segid);
283 		}
284 	}
285 	return (error);
286 }
287 
288 /*
289  * Resource Locking and Exclusion
290  *
291  * Much of bhyve depends on key portions of VM state, such as the guest memory
292  * map, to remain unchanged while the guest is running.  As ported from
293  * FreeBSD, the initial strategy for this resource exclusion hinged on gating
294  * access to the instance vCPUs.  Threads acting on a single vCPU, like those
295  * performing the work of actually running the guest in VMX/SVM, would lock
296  * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
297  * state, all of the vCPUs would be first locked, ensuring that the
298  * operation(s) could complete without any other threads stumbling into
299  * intermediate states.
300  *
301  * This approach is largely effective for bhyve.  Common operations, such as
302  * running the vCPUs, steer clear of lock contention.  The model begins to
303  * break down for operations which do not occur in the context of a specific
304  * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
305  * thread in the bhyve process.  In order to properly protect those vCPU-less
306  * operations from encountering invalid states, additional locking is required.
307  * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
308  * It does mean that class of operations will be serialized on locking the
309  * specific vCPU and that instances sized at VM_MAXCPU will potentially see
310  * undue contention on the VM_MAXCPU-1 vCPU.
311  *
312  * In order to address the shortcomings of this model, the concept of a
313  * read/write lock has been added to bhyve.  Operations which change
314  * fundamental aspects of a VM (such as the memory map) must acquire the write
315  * lock, which also implies locking all of the vCPUs and waiting for all read
316  * lock holders to release.  While it increases the cost and waiting time for
317  * those few operations, it allows most hot-path operations on the VM (which
318  * depend on its configuration remaining stable) to occur with minimal locking.
319  *
320  * Consumers of the Driver API (see below) are a special case when it comes to
321  * this locking, since they may hold a read lock via the drv_lease mechanism
322  * for an extended period of time.  Rather than forcing those consumers to
323  * continuously poll for a write lock attempt, the lease system forces them to
324  * provide a release callback to trigger their clean-up (and potential later
325  * reacquisition) of the read lock.
326  */
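
/*
 * As a concrete illustration, drawn from vmmdev_do_ioctl() below: VM_RUN
 * locks only its target vCPU (vcpu_lock_one()), VM_ALLOC_MEMSEG changes the
 * memory map and so takes the write lock (vmm_write_lock()), and
 * VM_MMAP_GETNEXT merely reads that map and settles for vmm_read_lock().
 */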
327 
328 static void
329 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
330 {
331 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
332 
	/*
	 * Since this state transition uses from_idle=true, it should not
	 * fail, but rather block until it can succeed.
	 */
337 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
338 }
339 
340 static void
341 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
342 {
343 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
344 
345 	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
346 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
347 }
348 
349 static void
350 vmm_read_lock(vmm_softc_t *sc)
351 {
352 	rw_enter(&sc->vmm_rwlock, RW_READER);
353 }
354 
355 static void
356 vmm_read_unlock(vmm_softc_t *sc)
357 {
358 	rw_exit(&sc->vmm_rwlock);
359 }
360 
361 static void
362 vmm_write_lock(vmm_softc_t *sc)
363 {
364 	int maxcpus;
365 
366 	/* First lock all the vCPUs */
367 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
368 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
369 		vcpu_lock_one(sc, vcpu);
370 	}
371 
372 	/*
373 	 * Block vmm_drv leases from being acquired or held while the VM write
374 	 * lock is held.
375 	 */
376 	vmm_lease_block(sc);
377 
378 	rw_enter(&sc->vmm_rwlock, RW_WRITER);
379 	/*
380 	 * For now, the 'maxcpus' value for an instance is fixed at the
381 	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
382 	 * the future, allowing for dynamic vCPU resource sizing, acquisition
383 	 * of the write lock will need to be wary of such changes.
384 	 */
385 	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
386 }
387 
388 static void
389 vmm_write_unlock(vmm_softc_t *sc)
390 {
391 	int maxcpus;
392 
393 	/* Allow vmm_drv leases to be acquired once write lock is dropped */
394 	vmm_lease_unblock(sc);
395 
396 	/*
397 	 * The VM write lock _must_ be released from the same thread it was
398 	 * acquired in, unlike the read lock.
399 	 */
400 	VERIFY(rw_write_held(&sc->vmm_rwlock));
401 	rw_exit(&sc->vmm_rwlock);
402 
403 	/* Unlock all the vCPUs */
404 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
405 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
406 		vcpu_unlock_one(sc, vcpu);
407 	}
408 }
409 
410 static int
411 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
412     cred_t *credp, int *rvalp)
413 {
414 	int error = 0, vcpu = -1;
415 	void *datap = (void *)arg;
416 	enum vm_lock_type {
417 		LOCK_NONE = 0,
418 		LOCK_VCPU,
419 		LOCK_READ_HOLD,
420 		LOCK_WRITE_HOLD
421 	} lock_type = LOCK_NONE;
422 
423 	/* Acquire any exclusion resources needed for the operation. */
424 	switch (cmd) {
425 	case VM_RUN:
426 	case VM_GET_REGISTER:
427 	case VM_SET_REGISTER:
428 	case VM_GET_SEGMENT_DESCRIPTOR:
429 	case VM_SET_SEGMENT_DESCRIPTOR:
430 	case VM_GET_REGISTER_SET:
431 	case VM_SET_REGISTER_SET:
432 	case VM_INJECT_EXCEPTION:
433 	case VM_GET_CAPABILITY:
434 	case VM_SET_CAPABILITY:
435 	case VM_PPTDEV_MSI:
436 	case VM_PPTDEV_MSIX:
437 	case VM_SET_X2APIC_STATE:
438 	case VM_GLA2GPA:
439 	case VM_GLA2GPA_NOFAULT:
440 	case VM_ACTIVATE_CPU:
441 	case VM_SET_INTINFO:
442 	case VM_GET_INTINFO:
443 	case VM_RESTART_INSTRUCTION:
444 	case VM_SET_KERNEMU_DEV:
445 	case VM_GET_KERNEMU_DEV:
446 	case VM_RESET_CPU:
447 	case VM_GET_RUN_STATE:
448 	case VM_SET_RUN_STATE:
449 	case VM_GET_FPU:
450 	case VM_SET_FPU:
451 		/*
452 		 * Copy in the ID of the vCPU chosen for this operation.
453 		 * Since a nefarious caller could update their struct between
454 		 * this locking and when the rest of the ioctl data is copied
455 		 * in, it is _critical_ that this local 'vcpu' variable be used
456 		 * rather than the in-struct one when performing the ioctl.
457 		 */
458 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
459 			return (EFAULT);
460 		}
		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
			return (EINVAL);
		}
464 		vcpu_lock_one(sc, vcpu);
465 		lock_type = LOCK_VCPU;
466 		break;
467 
468 	case VM_REINIT:
469 	case VM_BIND_PPTDEV:
470 	case VM_UNBIND_PPTDEV:
471 	case VM_MAP_PPTDEV_MMIO:
472 	case VM_UNMAP_PPTDEV_MMIO:
473 	case VM_ALLOC_MEMSEG:
474 	case VM_MMAP_MEMSEG:
475 	case VM_MUNMAP_MEMSEG:
476 	case VM_WRLOCK_CYCLE:
477 	case VM_PMTMR_LOCATE:
478 		vmm_write_lock(sc);
479 		lock_type = LOCK_WRITE_HOLD;
480 		break;
481 
482 	case VM_GET_MEMSEG:
483 	case VM_MMAP_GETNEXT:
484 	case VM_LAPIC_IRQ:
485 	case VM_INJECT_NMI:
486 	case VM_IOAPIC_ASSERT_IRQ:
487 	case VM_IOAPIC_DEASSERT_IRQ:
488 	case VM_IOAPIC_PULSE_IRQ:
489 	case VM_LAPIC_MSI:
490 	case VM_LAPIC_LOCAL_IRQ:
491 	case VM_GET_X2APIC_STATE:
492 	case VM_RTC_READ:
493 	case VM_RTC_WRITE:
494 	case VM_RTC_SETTIME:
495 	case VM_RTC_GETTIME:
496 	case VM_PPTDEV_DISABLE_MSIX:
497 	case VM_DEVMEM_GETOFFSET:
498 	case VM_TRACK_DIRTY_PAGES:
499 		vmm_read_lock(sc);
500 		lock_type = LOCK_READ_HOLD;
501 		break;
502 
503 	case VM_DATA_READ:
504 	case VM_DATA_WRITE:
505 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
506 			return (EFAULT);
507 		}
508 		if (vcpu == -1) {
509 			/* Access data for VM-wide devices */
510 			vmm_write_lock(sc);
511 			lock_type = LOCK_WRITE_HOLD;
512 		} else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) {
513 			/* Access data associated with a specific vCPU */
514 			vcpu_lock_one(sc, vcpu);
515 			lock_type = LOCK_VCPU;
516 		} else {
517 			return (EINVAL);
518 		}
519 		break;
520 
521 	case VM_GET_GPA_PMAP:
522 	case VM_IOAPIC_PINCOUNT:
523 	case VM_SUSPEND:
524 	case VM_DESC_FPU_AREA:
525 	case VM_SET_AUTODESTRUCT:
526 	default:
527 		break;
528 	}
529 
530 	/* Execute the primary logic for the ioctl. */
531 	switch (cmd) {
532 	case VM_RUN: {
533 		struct vm_entry entry;
534 
535 		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
536 			error = EFAULT;
537 			break;
538 		}
539 
540 		if (!(curthread->t_schedflag & TS_VCPU))
541 			smt_mark_as_vcpu();
542 
543 		error = vm_run(sc->vmm_vm, vcpu, &entry);
544 
545 		/*
546 		 * Unexpected states in vm_run() are expressed through positive
547 		 * errno-oriented return values.  VM states which expect further
548 		 * processing in userspace (necessary context via exitinfo) are
549 		 * expressed through negative return values.  For the time being
550 		 * a return value of 0 is not expected from vm_run().
551 		 */
552 		ASSERT(error != 0);
553 		if (error < 0) {
554 			const struct vm_exit *vme;
555 			void *outp = entry.exit_data;
556 
557 			error = 0;
558 			vme = vm_exitinfo(sc->vmm_vm, vcpu);
559 			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
560 				error = EFAULT;
561 			}
562 		}
563 		break;
564 	}
565 	case VM_SUSPEND: {
566 		struct vm_suspend vmsuspend;
567 
568 		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
569 			error = EFAULT;
570 			break;
571 		}
572 		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
573 		break;
574 	}
575 	case VM_REINIT: {
576 		struct vm_reinit reinit;
577 
578 		if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
579 			error = EFAULT;
580 			break;
581 		}
582 		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
583 			/*
584 			 * The VM instance should be free of driver-attached
585 			 * hooks during the reinitialization process.
586 			 */
587 			break;
588 		}
589 		error = vm_reinit(sc->vmm_vm, reinit.flags);
590 		(void) vmm_drv_block_hook(sc, B_FALSE);
591 		break;
592 	}
593 	case VM_STAT_DESC: {
594 		struct vm_stat_desc statdesc;
595 
596 		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
597 			error = EFAULT;
598 			break;
599 		}
600 		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
601 		    sizeof (statdesc.desc));
602 		if (error == 0 &&
603 		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
604 			error = EFAULT;
605 			break;
606 		}
607 		break;
608 	}
609 	case VM_STATS_IOC: {
610 		struct vm_stats vmstats;
611 
612 		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
613 			error = EFAULT;
614 			break;
615 		}
616 		hrt2tv(gethrtime(), &vmstats.tv);
617 		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index,
618 		    nitems(vmstats.statbuf),
619 		    &vmstats.num_entries, vmstats.statbuf);
620 		if (error == 0 &&
621 		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
622 			error = EFAULT;
623 			break;
624 		}
625 		break;
626 	}
627 
628 	case VM_PPTDEV_MSI: {
629 		struct vm_pptdev_msi pptmsi;
630 
631 		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
632 			error = EFAULT;
633 			break;
634 		}
635 		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
636 		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
637 		break;
638 	}
639 	case VM_PPTDEV_MSIX: {
640 		struct vm_pptdev_msix pptmsix;
641 
642 		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
643 			error = EFAULT;
644 			break;
645 		}
646 		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
647 		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
648 		    pptmsix.vector_control);
649 		break;
650 	}
651 	case VM_PPTDEV_DISABLE_MSIX: {
652 		struct vm_pptdev pptdev;
653 
654 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
655 			error = EFAULT;
656 			break;
657 		}
658 		error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
659 		break;
660 	}
661 	case VM_MAP_PPTDEV_MMIO: {
662 		struct vm_pptdev_mmio pptmmio;
663 
664 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
665 			error = EFAULT;
666 			break;
667 		}
668 		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
669 		    pptmmio.len, pptmmio.hpa);
670 		break;
671 	}
672 	case VM_UNMAP_PPTDEV_MMIO: {
673 		struct vm_pptdev_mmio pptmmio;
674 
675 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
676 			error = EFAULT;
677 			break;
678 		}
679 		error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
680 		    pptmmio.len);
681 		break;
682 	}
683 	case VM_BIND_PPTDEV: {
684 		struct vm_pptdev pptdev;
685 
686 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
687 			error = EFAULT;
688 			break;
689 		}
690 		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
691 		break;
692 	}
693 	case VM_UNBIND_PPTDEV: {
694 		struct vm_pptdev pptdev;
695 
696 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
697 			error = EFAULT;
698 			break;
699 		}
700 		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
701 		break;
702 	}
703 	case VM_GET_PPTDEV_LIMITS: {
704 		struct vm_pptdev_limits pptlimits;
705 
706 		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
707 			error = EFAULT;
708 			break;
709 		}
710 		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
711 		    &pptlimits.msi_limit, &pptlimits.msix_limit);
712 		if (error == 0 &&
713 		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
714 			error = EFAULT;
715 			break;
716 		}
717 		break;
718 	}
719 	case VM_INJECT_EXCEPTION: {
720 		struct vm_exception vmexc;
721 		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
722 			error = EFAULT;
723 			break;
724 		}
725 		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
726 		    vmexc.error_code_valid != 0, vmexc.error_code,
727 		    vmexc.restart_instruction != 0);
728 		break;
729 	}
730 	case VM_INJECT_NMI: {
731 		struct vm_nmi vmnmi;
732 
733 		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
734 			error = EFAULT;
735 			break;
736 		}
737 		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
738 		break;
739 	}
740 	case VM_LAPIC_IRQ: {
741 		struct vm_lapic_irq vmirq;
742 
743 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
744 			error = EFAULT;
745 			break;
746 		}
747 		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
748 		break;
749 	}
750 	case VM_LAPIC_LOCAL_IRQ: {
751 		struct vm_lapic_irq vmirq;
752 
753 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
754 			error = EFAULT;
755 			break;
756 		}
757 		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
758 		    vmirq.vector);
759 		break;
760 	}
761 	case VM_LAPIC_MSI: {
762 		struct vm_lapic_msi vmmsi;
763 
764 		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
765 			error = EFAULT;
766 			break;
767 		}
768 		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
769 		break;
770 	}
771 
772 	case VM_IOAPIC_ASSERT_IRQ: {
773 		struct vm_ioapic_irq ioapic_irq;
774 
775 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
776 			error = EFAULT;
777 			break;
778 		}
779 		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
780 		break;
781 	}
782 	case VM_IOAPIC_DEASSERT_IRQ: {
783 		struct vm_ioapic_irq ioapic_irq;
784 
785 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
786 			error = EFAULT;
787 			break;
788 		}
789 		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
790 		break;
791 	}
792 	case VM_IOAPIC_PULSE_IRQ: {
793 		struct vm_ioapic_irq ioapic_irq;
794 
795 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
796 			error = EFAULT;
797 			break;
798 		}
799 		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
800 		break;
801 	}
802 	case VM_IOAPIC_PINCOUNT: {
803 		int pincount;
804 
805 		pincount = vioapic_pincount(sc->vmm_vm);
806 		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
807 			error = EFAULT;
808 			break;
809 		}
810 		break;
811 	}
812 	case VM_DESC_FPU_AREA: {
813 		struct vm_fpu_desc desc;
814 		void *buf = NULL;
815 
816 		if (ddi_copyin(datap, &desc, sizeof (desc), md)) {
817 			error = EFAULT;
818 			break;
819 		}
820 		if (desc.vfd_num_entries > 64) {
821 			error = EINVAL;
822 			break;
823 		}
824 		const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) *
825 		    desc.vfd_num_entries;
826 		if (buf_sz != 0) {
827 			buf = kmem_zalloc(buf_sz, KM_SLEEP);
828 		}
829 
830 		/*
831 		 * For now, we are depending on vm_fpu_desc_entry and
832 		 * hma_xsave_state_desc_t having the same format.
833 		 */
834 		CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
835 		    sizeof (hma_xsave_state_desc_t));
836 
837 		size_t req_size;
838 		const uint_t max_entries = hma_fpu_describe_xsave_state(
839 		    (hma_xsave_state_desc_t *)buf,
840 		    desc.vfd_num_entries,
841 		    &req_size);
842 
843 		desc.vfd_req_size = req_size;
844 		desc.vfd_num_entries = max_entries;
845 		if (buf_sz != 0) {
846 			if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
847 				error = EFAULT;
848 			}
849 			kmem_free(buf, buf_sz);
850 		}
851 
852 		if (error == 0) {
853 			if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
854 				error = EFAULT;
855 			}
856 		}
857 		break;
858 	}
859 	case VM_SET_AUTODESTRUCT: {
860 		/*
861 		 * Since this has to do with controlling the lifetime of the
862 		 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather
863 		 * than the vcpu-centric or rwlock exclusion mechanisms.
864 		 */
865 		mutex_enter(&vmm_mtx);
866 		sc->vmm_autodestruct = (arg != 0);
867 		mutex_exit(&vmm_mtx);
868 		break;
869 	}
870 
871 	case VM_ISA_ASSERT_IRQ: {
872 		struct vm_isa_irq isa_irq;
873 
874 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
875 			error = EFAULT;
876 			break;
877 		}
878 		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
879 		if (error == 0 && isa_irq.ioapic_irq != -1) {
880 			error = vioapic_assert_irq(sc->vmm_vm,
881 			    isa_irq.ioapic_irq);
882 		}
883 		break;
884 	}
885 	case VM_ISA_DEASSERT_IRQ: {
886 		struct vm_isa_irq isa_irq;
887 
888 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
889 			error = EFAULT;
890 			break;
891 		}
892 		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
893 		if (error == 0 && isa_irq.ioapic_irq != -1) {
894 			error = vioapic_deassert_irq(sc->vmm_vm,
895 			    isa_irq.ioapic_irq);
896 		}
897 		break;
898 	}
899 	case VM_ISA_PULSE_IRQ: {
900 		struct vm_isa_irq isa_irq;
901 
902 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
903 			error = EFAULT;
904 			break;
905 		}
906 		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
907 		if (error == 0 && isa_irq.ioapic_irq != -1) {
908 			error = vioapic_pulse_irq(sc->vmm_vm,
909 			    isa_irq.ioapic_irq);
910 		}
911 		break;
912 	}
913 	case VM_ISA_SET_IRQ_TRIGGER: {
914 		struct vm_isa_irq_trigger isa_irq_trigger;
915 
916 		if (ddi_copyin(datap, &isa_irq_trigger,
917 		    sizeof (isa_irq_trigger), md)) {
918 			error = EFAULT;
919 			break;
920 		}
921 		error = vatpic_set_irq_trigger(sc->vmm_vm,
922 		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
923 		break;
924 	}
925 
926 	case VM_MMAP_GETNEXT: {
927 		struct vm_memmap mm;
928 
929 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
930 			error = EFAULT;
931 			break;
932 		}
933 		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
934 		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
935 		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
936 			error = EFAULT;
937 			break;
938 		}
939 		break;
940 	}
941 	case VM_MMAP_MEMSEG: {
942 		struct vm_memmap mm;
943 
944 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
945 			error = EFAULT;
946 			break;
947 		}
948 		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
949 		    mm.len, mm.prot, mm.flags);
950 		break;
951 	}
952 	case VM_MUNMAP_MEMSEG: {
953 		struct vm_munmap mu;
954 
955 		if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
956 			error = EFAULT;
957 			break;
958 		}
959 		error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
960 		break;
961 	}
962 	case VM_ALLOC_MEMSEG: {
963 		struct vm_memseg vmseg;
964 
965 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
966 			error = EFAULT;
967 			break;
968 		}
969 		error = vmmdev_alloc_memseg(sc, &vmseg);
970 		break;
971 	}
972 	case VM_GET_MEMSEG: {
973 		struct vm_memseg vmseg;
974 
975 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
976 			error = EFAULT;
977 			break;
978 		}
979 		error = vmmdev_get_memseg(sc, &vmseg);
980 		if (error == 0 &&
981 		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
982 			error = EFAULT;
983 			break;
984 		}
985 		break;
986 	}
987 	case VM_GET_REGISTER: {
988 		struct vm_register vmreg;
989 
990 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
991 			error = EFAULT;
992 			break;
993 		}
994 		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
995 		    &vmreg.regval);
996 		if (error == 0 &&
997 		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
998 			error = EFAULT;
999 			break;
1000 		}
1001 		break;
1002 	}
1003 	case VM_SET_REGISTER: {
1004 		struct vm_register vmreg;
1005 
1006 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1007 			error = EFAULT;
1008 			break;
1009 		}
1010 		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
1011 		    vmreg.regval);
1012 		break;
1013 	}
1014 	case VM_SET_SEGMENT_DESCRIPTOR: {
1015 		struct vm_seg_desc vmsegd;
1016 
1017 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1018 			error = EFAULT;
1019 			break;
1020 		}
1021 		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1022 		    &vmsegd.desc);
1023 		break;
1024 	}
1025 	case VM_GET_SEGMENT_DESCRIPTOR: {
1026 		struct vm_seg_desc vmsegd;
1027 
1028 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1029 			error = EFAULT;
1030 			break;
1031 		}
1032 		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1033 		    &vmsegd.desc);
1034 		if (error == 0 &&
1035 		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
1036 			error = EFAULT;
1037 			break;
1038 		}
1039 		break;
1040 	}
1041 	case VM_GET_REGISTER_SET: {
1042 		struct vm_register_set vrs;
1043 		int regnums[VM_REG_LAST];
1044 		uint64_t regvals[VM_REG_LAST];
1045 
1046 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1047 			error = EFAULT;
1048 			break;
1049 		}
1050 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1051 			error = EINVAL;
1052 			break;
1053 		}
1054 		if (ddi_copyin(vrs.regnums, regnums,
1055 		    sizeof (int) * vrs.count, md)) {
1056 			error = EFAULT;
1057 			break;
1058 		}
1059 
1060 		error = 0;
1061 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1062 			if (regnums[i] < 0) {
1063 				error = EINVAL;
1064 				break;
1065 			}
1066 			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
1067 			    &regvals[i]);
1068 		}
1069 		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
1070 		    sizeof (uint64_t) * vrs.count, md)) {
1071 			error = EFAULT;
1072 		}
1073 		break;
1074 	}
1075 	case VM_SET_REGISTER_SET: {
1076 		struct vm_register_set vrs;
1077 		int regnums[VM_REG_LAST];
1078 		uint64_t regvals[VM_REG_LAST];
1079 
1080 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1081 			error = EFAULT;
1082 			break;
1083 		}
1084 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1085 			error = EINVAL;
1086 			break;
1087 		}
1088 		if (ddi_copyin(vrs.regnums, regnums,
1089 		    sizeof (int) * vrs.count, md)) {
1090 			error = EFAULT;
1091 			break;
1092 		}
1093 		if (ddi_copyin(vrs.regvals, regvals,
1094 		    sizeof (uint64_t) * vrs.count, md)) {
1095 			error = EFAULT;
1096 			break;
1097 		}
1098 
1099 		error = 0;
1100 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			/*
			 * Setting registers in a set is not atomic, since a
			 * failure in the middle of the set will cause a
			 * bail-out, leaving the register state inconsistent.
			 * Callers should be wary of this.
			 */
1107 			if (regnums[i] < 0) {
1108 				error = EINVAL;
1109 				break;
1110 			}
1111 			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1112 			    regvals[i]);
1113 		}
1114 		break;
1115 	}
1116 	case VM_RESET_CPU: {
1117 		struct vm_vcpu_reset vvr;
1118 
1119 		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1120 			error = EFAULT;
1121 			break;
1122 		}
1123 		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
			error = EINVAL;
			break;
		}
1126 
1127 		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1128 		break;
1129 	}
1130 	case VM_GET_RUN_STATE: {
1131 		struct vm_run_state vrs;
1132 
1133 		bzero(&vrs, sizeof (vrs));
1134 		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1135 		    &vrs.sipi_vector);
1136 		if (error == 0) {
1137 			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1138 				error = EFAULT;
1139 				break;
1140 			}
1141 		}
1142 		break;
1143 	}
1144 	case VM_SET_RUN_STATE: {
1145 		struct vm_run_state vrs;
1146 
1147 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1148 			error = EFAULT;
1149 			break;
1150 		}
1151 		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1152 		    vrs.sipi_vector);
1153 		break;
1154 	}
1155 	case VM_GET_FPU: {
1156 		struct vm_fpu_state req;
1157 		const size_t max_len = (PAGESIZE * 2);
1158 		void *kbuf;
1159 
1160 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1161 			error = EFAULT;
1162 			break;
1163 		}
1164 		if (req.len > max_len || req.len == 0) {
1165 			error = EINVAL;
1166 			break;
1167 		}
1168 		kbuf = kmem_zalloc(req.len, KM_SLEEP);
1169 		error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1170 		if (error == 0) {
1171 			if (ddi_copyout(kbuf, req.buf, req.len, md)) {
1172 				error = EFAULT;
1173 			}
1174 		}
1175 		kmem_free(kbuf, req.len);
1176 		break;
1177 	}
1178 	case VM_SET_FPU: {
1179 		struct vm_fpu_state req;
1180 		const size_t max_len = (PAGESIZE * 2);
1181 		void *kbuf;
1182 
1183 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1184 			error = EFAULT;
1185 			break;
1186 		}
1187 		if (req.len > max_len || req.len == 0) {
1188 			error = EINVAL;
1189 			break;
1190 		}
1191 		kbuf = kmem_alloc(req.len, KM_SLEEP);
1192 		if (ddi_copyin(req.buf, kbuf, req.len, md)) {
1193 			error = EFAULT;
1194 		} else {
1195 			error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1196 		}
1197 		kmem_free(kbuf, req.len);
1198 		break;
1199 	}
1200 
1201 	case VM_SET_KERNEMU_DEV:
1202 	case VM_GET_KERNEMU_DEV: {
1203 		struct vm_readwrite_kernemu_device kemu;
1204 		size_t size = 0;
1205 
1206 		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1207 			error = EFAULT;
1208 			break;
1209 		}
1210 
1211 		if (kemu.access_width > 3) {
1212 			error = EINVAL;
1213 			break;
1214 		}
1215 		size = (1 << kemu.access_width);
1216 		ASSERT(size >= 1 && size <= 8);
1217 
1218 		if (cmd == VM_SET_KERNEMU_DEV) {
1219 			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1220 			    kemu.gpa, kemu.value, size);
1221 		} else {
1222 			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1223 			    kemu.gpa, &kemu.value, size);
1224 		}
1225 
1226 		if (error == 0) {
1227 			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1228 				error = EFAULT;
1229 				break;
1230 			}
1231 		}
1232 		break;
1233 	}
1234 
1235 	case VM_GET_CAPABILITY: {
1236 		struct vm_capability vmcap;
1237 
1238 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1239 			error = EFAULT;
1240 			break;
1241 		}
1242 		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1243 		    &vmcap.capval);
1244 		if (error == 0 &&
1245 		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1246 			error = EFAULT;
1247 			break;
1248 		}
1249 		break;
1250 	}
1251 	case VM_SET_CAPABILITY: {
1252 		struct vm_capability vmcap;
1253 
1254 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1255 			error = EFAULT;
1256 			break;
1257 		}
1258 		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1259 		    vmcap.capval);
1260 		break;
1261 	}
1262 	case VM_SET_X2APIC_STATE: {
1263 		struct vm_x2apic x2apic;
1264 
1265 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1266 			error = EFAULT;
1267 			break;
1268 		}
1269 		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1270 		break;
1271 	}
1272 	case VM_GET_X2APIC_STATE: {
1273 		struct vm_x2apic x2apic;
1274 
1275 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1276 			error = EFAULT;
1277 			break;
1278 		}
1279 		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1280 		    &x2apic.state);
1281 		if (error == 0 &&
1282 		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1283 			error = EFAULT;
1284 			break;
1285 		}
1286 		break;
1287 	}
1288 	case VM_GET_GPA_PMAP: {
		/*
		 * Until there is a necessity to leak EPT/RVI PTE values to
		 * userspace, this will remain unimplemented.
		 */
1293 		error = EINVAL;
1294 		break;
1295 	}
1296 	case VM_GET_HPET_CAPABILITIES: {
1297 		struct vm_hpet_cap hpetcap;
1298 
1299 		error = vhpet_getcap(&hpetcap);
1300 		if (error == 0 &&
1301 		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1302 			error = EFAULT;
1303 			break;
1304 		}
1305 		break;
1306 	}
1307 	case VM_GLA2GPA: {
1308 		struct vm_gla2gpa gg;
1309 
1310 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1311 			error = EFAULT;
1312 			break;
1313 		}
1314 		gg.vcpuid = vcpu;
1315 		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1316 		    gg.prot, &gg.gpa, &gg.fault);
1317 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1318 			error = EFAULT;
1319 			break;
1320 		}
1321 		break;
1322 	}
1323 	case VM_GLA2GPA_NOFAULT: {
1324 		struct vm_gla2gpa gg;
1325 
1326 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1327 			error = EFAULT;
1328 			break;
1329 		}
1330 		gg.vcpuid = vcpu;
1331 		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1332 		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
1333 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1334 			error = EFAULT;
1335 			break;
1336 		}
1337 		break;
1338 	}
1339 
1340 	case VM_ACTIVATE_CPU:
1341 		error = vm_activate_cpu(sc->vmm_vm, vcpu);
1342 		break;
1343 
1344 	case VM_SUSPEND_CPU:
1345 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1346 			error = EFAULT;
1347 		} else {
1348 			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1349 		}
1350 		break;
1351 
1352 	case VM_RESUME_CPU:
1353 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1354 			error = EFAULT;
1355 		} else {
1356 			error = vm_resume_cpu(sc->vmm_vm, vcpu);
1357 		}
1358 		break;
1359 
1360 	case VM_GET_CPUS: {
1361 		struct vm_cpuset vm_cpuset;
1362 		cpuset_t tempset;
1363 		void *srcp = &tempset;
1364 		int size;
1365 
1366 		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1367 			error = EFAULT;
1368 			break;
1369 		}
1370 
1371 		/* Be more generous about sizing since our cpuset_t is large. */
1372 		size = vm_cpuset.cpusetsize;
1373 		if (size <= 0 || size > sizeof (cpuset_t)) {
1374 			error = ERANGE;
1375 		}
1376 		/*
1377 		 * If they want a ulong_t or less, make sure they receive the
1378 		 * low bits with all the useful information.
1379 		 */
1380 		if (size <= sizeof (tempset.cpub[0])) {
1381 			srcp = &tempset.cpub[0];
1382 		}
1383 
1384 		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1385 			tempset = vm_active_cpus(sc->vmm_vm);
1386 		} else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1387 			tempset = vm_suspended_cpus(sc->vmm_vm);
1388 		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1389 			tempset = vm_debug_cpus(sc->vmm_vm);
1390 		} else {
1391 			error = EINVAL;
1392 		}
1393 
1394 		ASSERT(size > 0 && size <= sizeof (tempset));
1395 		if (error == 0 &&
1396 		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1397 			error = EFAULT;
1398 			break;
1399 		}
1400 		break;
1401 	}
1402 	case VM_SET_INTINFO: {
1403 		struct vm_intinfo vmii;
1404 
1405 		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1406 			error = EFAULT;
1407 			break;
1408 		}
1409 		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1410 		break;
1411 	}
1412 	case VM_GET_INTINFO: {
1413 		struct vm_intinfo vmii;
1414 
1415 		vmii.vcpuid = vcpu;
1416 		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1417 		    &vmii.info2);
1418 		if (error == 0 &&
1419 		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1420 			error = EFAULT;
1421 			break;
1422 		}
1423 		break;
1424 	}
1425 	case VM_RTC_WRITE: {
1426 		struct vm_rtc_data rtcdata;
1427 
1428 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1429 			error = EFAULT;
1430 			break;
1431 		}
1432 		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1433 		    rtcdata.value);
1434 		break;
1435 	}
1436 	case VM_RTC_READ: {
1437 		struct vm_rtc_data rtcdata;
1438 
1439 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1440 			error = EFAULT;
1441 			break;
1442 		}
1443 		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1444 		    &rtcdata.value);
1445 		if (error == 0 &&
1446 		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1447 			error = EFAULT;
1448 			break;
1449 		}
1450 		break;
1451 	}
1452 	case VM_RTC_SETTIME: {
1453 		struct vm_rtc_time rtctime;
1454 
1455 		if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1456 			error = EFAULT;
1457 			break;
1458 		}
1459 		error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1460 		break;
1461 	}
1462 	case VM_RTC_GETTIME: {
1463 		struct vm_rtc_time rtctime;
1464 
1465 		rtctime.secs = vrtc_get_time(sc->vmm_vm);
1466 		if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1467 			error = EFAULT;
1468 			break;
1469 		}
1470 		break;
1471 	}
1472 
1473 	case VM_PMTMR_LOCATE: {
1474 		uint16_t port = arg;
1475 		error = vpmtmr_set_location(sc->vmm_vm, port);
1476 		break;
1477 	}
1478 
1479 	case VM_RESTART_INSTRUCTION:
1480 		error = vm_restart_instruction(sc->vmm_vm, vcpu);
1481 		break;
1482 
1483 	case VM_SET_TOPOLOGY: {
1484 		struct vm_cpu_topology topo;
1485 
1486 		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1487 			error = EFAULT;
1488 			break;
1489 		}
1490 		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1491 		    topo.threads, topo.maxcpus);
1492 		break;
1493 	}
1494 	case VM_GET_TOPOLOGY: {
1495 		struct vm_cpu_topology topo;
1496 
1497 		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1498 		    &topo.threads, &topo.maxcpus);
1499 		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1500 			error = EFAULT;
1501 			break;
1502 		}
1503 		break;
1504 	}
1505 	case VM_DEVMEM_GETOFFSET: {
1506 		struct vm_devmem_offset vdo;
1507 		vmm_devmem_entry_t *de;
1508 
1509 		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1510 			error = EFAULT;
1511 			break;
1512 		}
1513 
1514 		de = vmmdev_devmem_find(sc, vdo.segid);
1515 		if (de != NULL) {
1516 			vdo.offset = de->vde_off;
1517 			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1518 				error = EFAULT;
1519 			}
1520 		} else {
1521 			error = ENOENT;
1522 		}
1523 		break;
1524 	}
1525 	case VM_TRACK_DIRTY_PAGES: {
1526 		const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
1527 		struct vmm_dirty_tracker tracker;
1528 		uint8_t *bitmap;
1529 		size_t len;
1530 
1531 		if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
1532 			error = EFAULT;
1533 			break;
1534 		}
1535 		if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
1536 			error = EINVAL;
1537 			break;
1538 		}
1539 		if (tracker.vdt_len == 0) {
1540 			break;
1541 		}
1542 		if ((tracker.vdt_len & PAGEOFFSET) != 0) {
1543 			error = EINVAL;
1544 			break;
1545 		}
1546 		if (tracker.vdt_len > max_track_region_len) {
1547 			error = EINVAL;
1548 			break;
1549 		}
1550 		len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8;
1551 		bitmap = kmem_zalloc(len, KM_SLEEP);
1552 		vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa,
1553 		    tracker.vdt_len, bitmap);
1554 		if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) {
1555 			error = EFAULT;
1556 		}
1557 		kmem_free(bitmap, len);
1558 
1559 		break;
1560 	}
1561 	case VM_WRLOCK_CYCLE: {
1562 		/*
1563 		 * Present a test mechanism to acquire/release the write lock
1564 		 * on the VM without any other effects.
1565 		 */
1566 		break;
1567 	}
1568 	case VM_DATA_READ: {
1569 		struct vm_data_xfer vdx;
1570 
1571 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1572 			error = EFAULT;
1573 			break;
1574 		}
1575 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1576 			error = EINVAL;
1577 			break;
1578 		}
1579 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1580 			error = EFBIG;
1581 			break;
1582 		}
1583 
1584 		const size_t len = vdx.vdx_len;
1585 		void *buf = NULL;
1586 		if (len != 0) {
1587 			buf = kmem_alloc(len, KM_SLEEP);
1588 			if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) != 0 &&
1589 			    ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
1590 				kmem_free(buf, len);
1591 				error = EFAULT;
1592 				break;
1593 			} else {
1594 				bzero(buf, len);
1595 			}
1596 		}
1597 
1598 		vdx.vdx_result_len = 0;
1599 		vmm_data_req_t req = {
1600 			.vdr_class = vdx.vdx_class,
1601 			.vdr_version = vdx.vdx_version,
1602 			.vdr_flags = vdx.vdx_flags,
1603 			.vdr_len = len,
1604 			.vdr_data = buf,
1605 			.vdr_result_len = &vdx.vdx_result_len,
1606 		};
1607 		error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req);
1608 
1609 		if (error == 0 && buf != NULL) {
1610 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1611 				error = EFAULT;
1612 			}
1613 		}
1614 
1615 		/*
1616 		 * Copy out the transfer request so that the value of
1617 		 * vdx_result_len can be made available, regardless of any
1618 		 * error(s) which may have occurred.
1619 		 */
1620 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1621 			error = (error != 0) ? error : EFAULT;
1622 		}
1623 
1624 		if (buf != NULL) {
1625 			kmem_free(buf, len);
1626 		}
1627 		break;
1628 	}
1629 	case VM_DATA_WRITE: {
1630 		struct vm_data_xfer vdx;
1631 
1632 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1633 			error = EFAULT;
1634 			break;
1635 		}
1636 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1637 			error = EINVAL;
1638 			break;
1639 		}
1640 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1641 			error = EFBIG;
1642 			break;
1643 		}
1644 
1645 		const size_t len = vdx.vdx_len;
1646 		void *buf = NULL;
1647 		if (len != 0) {
1648 			buf = kmem_alloc(len, KM_SLEEP);
1649 			if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
1650 				kmem_free(buf, len);
1651 				error = EFAULT;
1652 				break;
1653 			}
1654 		}
1655 
1656 		vdx.vdx_result_len = 0;
1657 		vmm_data_req_t req = {
1658 			.vdr_class = vdx.vdx_class,
1659 			.vdr_version = vdx.vdx_version,
1660 			.vdr_flags = vdx.vdx_flags,
1661 			.vdr_len = len,
1662 			.vdr_data = buf,
1663 			.vdr_result_len = &vdx.vdx_result_len,
1664 		};
1665 		if (vmm_allow_state_writes == 0) {
1666 			/* XXX: Play it safe for now */
1667 			error = EPERM;
1668 		} else {
1669 			error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid,
1670 			    &req);
1671 		}
1672 
1673 		if (error == 0 && buf != NULL &&
1674 		    (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
1675 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1676 				error = EFAULT;
1677 			}
1678 		}
1679 
1680 		/*
1681 		 * Copy out the transfer request so that the value of
1682 		 * vdx_result_len can be made available, regardless of any
1683 		 * error(s) which may have occurred.
1684 		 */
1685 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1686 			error = (error != 0) ? error : EFAULT;
1687 		}
1688 
1689 		if (buf != NULL) {
1690 			kmem_free(buf, len);
1691 		}
1692 		break;
1693 	}
1694 
1695 	default:
1696 		error = ENOTTY;
1697 		break;
1698 	}
1699 
1700 	/* Release exclusion resources */
1701 	switch (lock_type) {
1702 	case LOCK_NONE:
1703 		break;
1704 	case LOCK_VCPU:
1705 		vcpu_unlock_one(sc, vcpu);
1706 		break;
1707 	case LOCK_READ_HOLD:
1708 		vmm_read_unlock(sc);
1709 		break;
1710 	case LOCK_WRITE_HOLD:
1711 		vmm_write_unlock(sc);
1712 		break;
1713 	default:
1714 		panic("unexpected lock type");
1715 		break;
1716 	}
1717 
1718 	return (error);
1719 }
1720 
1721 static vmm_softc_t *
1722 vmm_lookup(const char *name)
1723 {
1724 	list_t *vml = &vmm_list;
1725 	vmm_softc_t *sc;
1726 
1727 	ASSERT(MUTEX_HELD(&vmm_mtx));
1728 
1729 	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1730 		if (strcmp(sc->vmm_name, name) == 0) {
1731 			break;
1732 		}
1733 	}
1734 
1735 	return (sc);
1736 }
1737 
1738 /*
1739  * Acquire an HMA registration if not already held.
1740  */
1741 static boolean_t
1742 vmm_hma_acquire(void)
1743 {
1744 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1745 
1746 	mutex_enter(&vmmdev_mtx);
1747 
1748 	if (vmmdev_hma_reg == NULL) {
1749 		VERIFY3U(vmmdev_hma_ref, ==, 0);
1750 		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1751 		if (vmmdev_hma_reg == NULL) {
1752 			cmn_err(CE_WARN, "%s HMA registration failed.",
1753 			    vmmdev_hvm_name);
1754 			mutex_exit(&vmmdev_mtx);
1755 			return (B_FALSE);
1756 		}
1757 	}
1758 
1759 	vmmdev_hma_ref++;
1760 
1761 	mutex_exit(&vmmdev_mtx);
1762 
1763 	return (B_TRUE);
1764 }
1765 
1766 /*
1767  * Release the HMA registration if held and there are no remaining VMs.
1768  */
1769 static void
1770 vmm_hma_release(void)
1771 {
1772 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1773 
1774 	mutex_enter(&vmmdev_mtx);
1775 
1776 	VERIFY3U(vmmdev_hma_ref, !=, 0);
1777 
1778 	vmmdev_hma_ref--;
1779 
1780 	if (vmmdev_hma_ref == 0) {
1781 		VERIFY(vmmdev_hma_reg != NULL);
1782 		hma_unregister(vmmdev_hma_reg);
1783 		vmmdev_hma_reg = NULL;
1784 	}
1785 	mutex_exit(&vmmdev_mtx);
1786 }
1787 
1788 static int
1789 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
1790 {
1791 	vmm_softc_t	*sc = NULL;
1792 	minor_t		minor;
1793 	int		error = ENOMEM;
1794 	size_t		len;
1795 	const char	*name = req->name;
1796 
1797 	len = strnlen(name, VM_MAX_NAMELEN);
1798 	if (len == 0) {
1799 		return (EINVAL);
1800 	}
1801 	if (len >= VM_MAX_NAMELEN) {
1802 		return (ENAMETOOLONG);
1803 	}
1804 	if (strchr(name, '/') != NULL) {
1805 		return (EINVAL);
1806 	}
1807 
1808 	if (!vmm_hma_acquire())
1809 		return (ENXIO);
1810 
1811 	mutex_enter(&vmm_mtx);
1812 
1813 	/* Look for duplicate names */
1814 	if (vmm_lookup(name) != NULL) {
1815 		mutex_exit(&vmm_mtx);
1816 		vmm_hma_release();
1817 		return (EEXIST);
1818 	}
1819 
1820 	/* Allow only one instance per non-global zone. */
1821 	if (!INGLOBALZONE(curproc)) {
1822 		for (sc = list_head(&vmm_list); sc != NULL;
1823 		    sc = list_next(&vmm_list, sc)) {
1824 			if (sc->vmm_zone == curzone) {
1825 				mutex_exit(&vmm_mtx);
1826 				vmm_hma_release();
1827 				return (EINVAL);
1828 			}
1829 		}
1830 	}
1831 
1832 	minor = id_alloc(vmm_minors);
1833 	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1834 		goto fail;
1835 	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1836 		ddi_soft_state_free(vmm_statep, minor);
1837 		goto fail;
1838 	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1839 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
1840 		goto fail;
1841 	}
1842 
1843 	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
1844 		goto fail;
1845 	}
1846 
1847 	error = vm_create(req->flags, &sc->vmm_vm);
1848 	if (error == 0) {
		/* Complete VM initialization and report success. */
1850 		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1851 		sc->vmm_minor = minor;
1852 		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1853 		    offsetof(vmm_devmem_entry_t, vde_node));
1854 
1855 		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1856 		    offsetof(vmm_hold_t, vmh_node));
1857 		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1858 
1859 		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1860 		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1861 		    offsetof(vmm_lease_t, vml_node));
1862 		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1863 		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1864 
1865 		sc->vmm_zone = crgetzone(cr);
1866 		zone_hold(sc->vmm_zone);
1867 		vmm_zsd_add_vm(sc);
1868 		vmm_kstat_init(sc);
1869 
1870 		list_insert_tail(&vmm_list, sc);
1871 		mutex_exit(&vmm_mtx);
1872 		return (0);
1873 	}
1874 
1875 	vmm_kstat_fini(sc);
1876 	ddi_remove_minor_node(vmmdev_dip, name);
1877 fail:
1878 	id_free(vmm_minors, minor);
1879 	if (sc != NULL) {
1880 		ddi_soft_state_free(vmm_statep, minor);
1881 	}
1882 	mutex_exit(&vmm_mtx);
1883 	vmm_hma_release();
1884 
1885 	return (error);
1886 }
1887 
1888 /*
1889  * Bhyve 'Driver' Interface
1890  *
1891  * While many devices are emulated in the bhyve userspace process, there are
1892  * others with performance constraints which require that they run mostly or
1893  * entirely in-kernel.  For those not integrated directly into bhyve, an API is
1894  * needed so they can query/manipulate the portions of VM state needed to
1895  * fulfill their purpose.
1896  *
1897  * This includes:
1898  * - Translating guest-physical addresses to host-virtual pointers
1899  * - Injecting MSIs
1900  * - Hooking IO port addresses
1901  *
1902  * The vmm_drv interface exists to provide that functionality to its consumers.
1903  * (At this time, 'viona' is the only user)
1904  */
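
/*
 * A minimal consumer sketch (illustrative only; error handling and lease
 * management are elided):
 *
 *	vmm_hold_t *hold;
 *
 *	if (vmm_drv_hold(fp, cr, &hold) == 0) {
 *		... query or manipulate VM state, e.g. via a lease obtained
 *		    with vmm_drv_lease_sign() ...
 *		vmm_drv_rele(hold);
 *	}
 */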
1905 int
1906 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1907 {
1908 	vnode_t *vp = fp->f_vnode;
1909 	const dev_t dev = vp->v_rdev;
1910 	vmm_softc_t *sc;
1911 	vmm_hold_t *hold;
1912 	int err = 0;
1913 
1914 	if (vp->v_type != VCHR) {
1915 		return (ENXIO);
1916 	}
1917 	const major_t major = getmajor(dev);
1918 	const minor_t minor = getminor(dev);
1919 
1920 	mutex_enter(&vmmdev_mtx);
1921 	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1922 		mutex_exit(&vmmdev_mtx);
1923 		return (ENOENT);
1924 	}
1925 	mutex_enter(&vmm_mtx);
1926 	mutex_exit(&vmmdev_mtx);
1927 
1928 	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1929 		err = ENOENT;
1930 		goto out;
1931 	}
1932 	/* XXXJOY: check cred permissions against instance */
1933 
1934 	if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1935 		err = EBUSY;
1936 		goto out;
1937 	}
1938 
1939 	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1940 	hold->vmh_sc = sc;
1941 	hold->vmh_release_req = B_FALSE;
1942 
1943 	list_insert_tail(&sc->vmm_holds, hold);
1944 	sc->vmm_flags |= VMM_HELD;
1945 	*holdp = hold;
1946 
1947 out:
1948 	mutex_exit(&vmm_mtx);
1949 	return (err);
1950 }
1951 
1952 void
1953 vmm_drv_rele(vmm_hold_t *hold)
1954 {
1955 	vmm_softc_t *sc;
1956 	boolean_t hma_release = B_FALSE;
1957 
1958 	ASSERT(hold != NULL);
1959 	ASSERT(hold->vmh_sc != NULL);
1960 	VERIFY(hold->vmh_ioport_hook_cnt == 0);
1961 
1962 	mutex_enter(&vmm_mtx);
1963 	sc = hold->vmh_sc;
1964 	list_remove(&sc->vmm_holds, hold);
1965 	if (list_is_empty(&sc->vmm_holds)) {
1966 		sc->vmm_flags &= ~VMM_HELD;
1967 		cv_broadcast(&sc->vmm_cv);
1968 
1969 		/*
1970 		 * If pending hold(s) had prevented an auto-destruct of the
1971 		 * instance when it was closed, finish that clean-up now.
1972 		 */
1973 		if (sc->vmm_autodestruct && !sc->vmm_is_open) {
1974 			int err = vmm_destroy_locked(sc,
1975 			    VDO_NO_PURGE_WAIT, &hma_release);
1976 
1977 			VERIFY0(err);
1978 			VERIFY(hma_release);
1979 		}
1980 	}
1981 	mutex_exit(&vmm_mtx);
1982 	kmem_free(hold, sizeof (*hold));
1983 
1984 	if (hma_release) {
1985 		vmm_hma_release();
1986 	}
1987 }
1988 
1989 boolean_t
1990 vmm_drv_release_reqd(vmm_hold_t *hold)
1991 {
1992 	ASSERT(hold != NULL);
1993 
1994 	return (hold->vmh_release_req);
1995 }
1996 
1997 vmm_lease_t *
1998 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1999 {
2000 	vmm_softc_t *sc = hold->vmh_sc;
2001 	vmm_lease_t *lease;
2002 
2003 	ASSERT3P(expiref, !=, NULL);
2004 
2005 	if (hold->vmh_release_req) {
2006 		return (NULL);
2007 	}
2008 
2009 	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
2010 	list_link_init(&lease->vml_node);
2011 	lease->vml_expire_func = expiref;
2012 	lease->vml_expire_arg = arg;
2013 	lease->vml_expired = B_FALSE;
2014 	lease->vml_break_deferred = B_FALSE;
2015 	lease->vml_hold = hold;
2016 	/* cache the VM pointer for one less pointer chase */
2017 	lease->vml_vm = sc->vmm_vm;
2018 	lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
2019 
2020 	mutex_enter(&sc->vmm_lease_lock);
2021 	while (sc->vmm_lease_blocker != 0) {
2022 		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2023 	}
2024 	list_insert_tail(&sc->vmm_lease_list, lease);
2025 	vmm_read_lock(sc);
2026 	mutex_exit(&sc->vmm_lease_lock);
2027 
2028 	return (lease);
2029 }
2030 
2031 static void
2032 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
2033 {
2034 	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
2035 
2036 	list_remove(&sc->vmm_lease_list, lease);
2037 	vmm_read_unlock(sc);
2038 	vmc_destroy(lease->vml_vmclient);
2039 	kmem_free(lease, sizeof (*lease));
2040 }
2041 
2042 static void
2043 vmm_lease_block(vmm_softc_t *sc)
2044 {
2045 	mutex_enter(&sc->vmm_lease_lock);
2046 	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
2047 	sc->vmm_lease_blocker++;
2048 	if (sc->vmm_lease_blocker == 1) {
2049 		list_t *list = &sc->vmm_lease_list;
2050 		vmm_lease_t *lease = list_head(list);
2051 
2052 		while (lease != NULL) {
2053 			void *arg = lease->vml_expire_arg;
2054 			boolean_t (*expiref)(void *) = lease->vml_expire_func;
2055 			boolean_t sync_break = B_FALSE;
2056 
2057 			/*
2058 			 * Since the lease expiration notification may
2059 			 * need to take locks which would deadlock with
2060 			 * vmm_lease_lock, drop it across the call.
2061 			 *
2062 			 * We are the only one allowed to manipulate
2063 			 * vmm_lease_list right now, so it is safe to
2064 			 * continue iterating through it after
2065 			 * reacquiring the lock.
2066 			 */
2067 			lease->vml_expired = B_TRUE;
2068 			mutex_exit(&sc->vmm_lease_lock);
2069 			sync_break = expiref(arg);
2070 			mutex_enter(&sc->vmm_lease_lock);
2071 
2072 			if (sync_break) {
2073 				vmm_lease_t *next;
2074 
2075 				/*
2076 				 * Leases which are broken synchronously here
2077 				 * result in vmm_read_unlock() calls from a
2078 				 * different thread than the corresponding
2079 				 * vmm_read_lock().  This is acceptable, given
2080 				 * that the rwlock underpinning the whole
2081 				 * mechanism tolerates the behavior.  This
2082 				 * flexibility is _only_ afforded to VM read
2083 				 * lock (RW_READER) holders.
2084 				 */
2085 				next = list_next(list, lease);
2086 				vmm_lease_break_locked(sc, lease);
2087 				lease = next;
2088 			} else {
2089 				lease = list_next(list, lease);
2090 			}
2091 		}
2092 
2093 		/* Process leases which were not broken synchronously. */
2094 		while (!list_is_empty(list)) {
2095 			/*
2096 			 * Although the nested loops are quadratic, the number
2097 			 * of leases is small.
2098 			 */
2099 			lease = list_head(list);
2100 			while (lease != NULL) {
2101 				vmm_lease_t *next = list_next(list, lease);
2102 				if (lease->vml_break_deferred) {
2103 					vmm_lease_break_locked(sc, lease);
2104 				}
2105 				lease = next;
2106 			}
2107 			if (list_is_empty(list)) {
2108 				break;
2109 			}
2110 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2111 		}
2112 		/* Wake anyone else waiting for the lease list to be empty. */
2113 		cv_broadcast(&sc->vmm_lease_cv);
2114 	} else {
2115 		list_t *list = &sc->vmm_lease_list;
2116 
2117 		/*
2118 		 * Some other thread beat us to the duty of lease cleanup.
2119 		 * Wait until that is complete.
2120 		 */
2121 		while (!list_is_empty(list)) {
2122 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2123 		}
2124 	}
2125 	mutex_exit(&sc->vmm_lease_lock);
2126 }
2127 
2128 static void
2129 vmm_lease_unblock(vmm_softc_t *sc)
2130 {
2131 	mutex_enter(&sc->vmm_lease_lock);
2132 	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
2133 	sc->vmm_lease_blocker--;
2134 	if (sc->vmm_lease_blocker == 0) {
2135 		cv_broadcast(&sc->vmm_lease_cv);
2136 	}
2137 	mutex_exit(&sc->vmm_lease_lock);
2138 }
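/*
 * Illustrative note: vmm_lease_block() and vmm_lease_unblock() are paired
 * around operations which must not run concurrently with lessee activity:
 *
 *	vmm_lease_block(sc);
 *	// ... act while no leases exist and new ones cannot be signed ...
 *	vmm_lease_unblock(sc);
 *
 * This is the pattern used by vmm_drv_purge() below.
 */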
2139 
2140 void
2141 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
2142 {
2143 	vmm_softc_t *sc = hold->vmh_sc;
2144 
2145 	VERIFY3P(hold, ==, lease->vml_hold);
2146 	VERIFY(!lease->vml_break_deferred);
2147 
2148 	mutex_enter(&sc->vmm_lease_lock);
2149 	if (sc->vmm_lease_blocker == 0) {
2150 		vmm_lease_break_locked(sc, lease);
2151 	} else {
2152 		/*
2153 		 * Defer the lease-breaking to whichever thread is currently
2154 		 * cleaning up all leases as part of a vmm_lease_block() call.
2155 		 */
2156 		lease->vml_break_deferred = B_TRUE;
2157 		cv_broadcast(&sc->vmm_lease_cv);
2158 	}
2159 	mutex_exit(&sc->vmm_lease_lock);
2160 }
2161 
2162 boolean_t
2163 vmm_drv_lease_expired(vmm_lease_t *lease)
2164 {
2165 	return (lease->vml_expired);
2166 }
2167 
2168 vmm_page_t *
2169 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
2170 {
2171 	ASSERT(lease != NULL);
2172 	ASSERT0(gpa & PAGEOFFSET);
2173 
2174 	return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
2175 }
2176 
2177 void
2178 vmm_drv_page_release(vmm_page_t *vmmp)
2179 {
2180 	(void) vmp_release((vm_page_t *)vmmp);
2181 }
2182 
2183 void
2184 vmm_drv_page_release_chain(vmm_page_t *vmmp)
2185 {
2186 	(void) vmp_release_chain((vm_page_t *)vmmp);
2187 }
2188 
2189 const void *
2190 vmm_drv_page_readable(const vmm_page_t *vmmp)
2191 {
2192 	return (vmp_get_readable((const vm_page_t *)vmmp));
2193 }
2194 
2195 void *
2196 vmm_drv_page_writable(const vmm_page_t *vmmp)
2197 {
2198 	return (vmp_get_writable((const vm_page_t *)vmmp));
2199 }
2200 
2201 void
2202 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
2203 {
2204 	vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
2205 }
2206 
2207 vmm_page_t *
2208 vmm_drv_page_next(const vmm_page_t *vmmp)
2209 {
2210 	return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
2211 }
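/*
 * Illustrative sketch (not part of the driver): holding a short run of guest
 * pages through an active lease, reading them, and then releasing them as a
 * chain.  The 'lease' variable, GPA, and page count are assumed/invented.
 *
 *	vmm_page_t *head = NULL, *vmp;
 *	const uintptr_t gpa = 0x100000;	// page-aligned guest-physical address
 *
 *	for (uint_t i = 0; i < 4; i++) {
 *		vmp = vmm_drv_page_hold(lease, gpa + i * PAGESIZE, PROT_READ);
 *		if (vmp == NULL)
 *			break;
 *		vmm_drv_page_chain(vmp, head);
 *		head = vmp;
 *		const void *data = vmm_drv_page_readable(vmp);
 *		// ... consume 'data' ...
 *	}
 *	if (head != NULL)
 *		vmm_drv_page_release_chain(head);
 */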
2212 
2213 int
2214 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
2215 {
2216 	ASSERT(lease != NULL);
2217 
2218 	return (lapic_intr_msi(lease->vml_vm, addr, msg));
2219 }
2220 
2221 int
2222 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
2223     void *arg, void **cookie)
2224 {
2225 	vmm_softc_t *sc;
2226 	int err;
2227 
2228 	ASSERT(hold != NULL);
2229 	ASSERT(cookie != NULL);
2230 
2231 	sc = hold->vmh_sc;
2232 	mutex_enter(&vmm_mtx);
2233 	/* Confirm that hook installation is not blocked */
2234 	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2235 		mutex_exit(&vmm_mtx);
2236 		return (EBUSY);
2237 	}
2238 	/*
2239 	 * Optimistically record an installed hook which will prevent a block
2240 	 * from being asserted while the mutex is dropped.
2241 	 */
2242 	hold->vmh_ioport_hook_cnt++;
2243 	mutex_exit(&vmm_mtx);
2244 
2245 	vmm_write_lock(sc);
2246 	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
2247 	    arg, cookie);
2248 	vmm_write_unlock(sc);
2249 
2250 	if (err != 0) {
2251 		mutex_enter(&vmm_mtx);
2252 		/* Walk back optimism about the hook installation */
2253 		hold->vmh_ioport_hook_cnt--;
2254 		mutex_exit(&vmm_mtx);
2255 	}
2256 	return (err);
2257 }
2258 
2259 void
2260 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
2261 {
2262 	vmm_softc_t *sc;
2263 
2264 	ASSERT(hold != NULL);
2265 	ASSERT(cookie != NULL);
2266 	ASSERT(hold->vmh_ioport_hook_cnt != 0);
2267 
2268 	sc = hold->vmh_sc;
2269 	vmm_write_lock(sc);
2270 	vm_ioport_unhook(sc->vmm_vm, cookie);
2271 	vmm_write_unlock(sc);
2272 
2273 	mutex_enter(&vmm_mtx);
2274 	hold->vmh_ioport_hook_cnt--;
2275 	mutex_exit(&vmm_mtx);
2276 }
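/*
 * Illustrative sketch (not part of the driver): installing and later removing
 * an I/O port hook under a hold.  'example_iop_handler' stands in for a
 * consumer-supplied function of type vmm_drv_iop_cb_t, and port 0x2f8 is an
 * arbitrary choice.
 *
 *	void *cookie;
 *
 *	if (vmm_drv_ioport_hook(hold, 0x2f8, example_iop_handler, arg,
 *	    &cookie) == 0) {
 *		// ... guest accesses to the port now reach the handler ...
 *		vmm_drv_ioport_unhook(hold, &cookie);
 *	}
 */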
2277 
2278 static int
2279 vmm_drv_purge(vmm_softc_t *sc, boolean_t no_wait)
2280 {
2281 	ASSERT(MUTEX_HELD(&vmm_mtx));
2282 
2283 	if ((sc->vmm_flags & VMM_HELD) != 0) {
2284 		vmm_hold_t *hold;
2285 
2286 		sc->vmm_flags |= VMM_CLEANUP;
2287 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2288 		    hold = list_next(&sc->vmm_holds, hold)) {
2289 			hold->vmh_release_req = B_TRUE;
2290 		}
2291 
2292 		/*
2293 		 * Require that all leases on the instance be broken, now that
2294 		 * all associated holds have been marked as needing release.
2295 		 *
2296 		 * Dropping vmm_mtx is not strictly necessary, but if any of the
2297 		 * lessees are slow to respond, it would be nice to leave it
2298 		 * available for other parties.
2299 		 */
2300 		mutex_exit(&vmm_mtx);
2301 		vmm_lease_block(sc);
2302 		vmm_lease_unblock(sc);
2303 		mutex_enter(&vmm_mtx);
2304 
2305 		/*
2306 		 * With all of the leases broken, we can proceed in an orderly
2307 		 * fashion to waiting for any lingering holds to be dropped.
2308 		 */
2309 		while ((sc->vmm_flags & VMM_HELD) != 0) {
2310 			/*
2311 			 * Some holds remain, so wait (if acceptable) for them
2312 			 * to be cleaned up.
2313 			 */
2314 			if (no_wait ||
2315 			    cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
2316 				sc->vmm_flags &= ~VMM_CLEANUP;
2317 				return (EINTR);
2318 			}
2319 		}
2320 		sc->vmm_flags &= ~VMM_CLEANUP;
2321 	}
2322 
2323 	VERIFY(list_is_empty(&sc->vmm_holds));
2324 	sc->vmm_flags |= VMM_PURGED;
2325 	return (0);
2326 }
2327 
2328 static int
2329 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
2330 {
2331 	int err = 0;
2332 
2333 	mutex_enter(&vmm_mtx);
2334 	if (!enable_block) {
2335 		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
2336 
2337 		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
2338 		goto done;
2339 	}
2340 
2341 	/* If any holds have hooks installed, the block is a failure */
2342 	if (!list_is_empty(&sc->vmm_holds)) {
2343 		vmm_hold_t *hold;
2344 
2345 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2346 		    hold = list_next(&sc->vmm_holds, hold)) {
2347 			if (hold->vmh_ioport_hook_cnt != 0) {
2348 				err = EBUSY;
2349 				goto done;
2350 			}
2351 		}
2352 	}
2353 	sc->vmm_flags |= VMM_BLOCK_HOOK;
2354 
2355 done:
2356 	mutex_exit(&vmm_mtx);
2357 	return (err);
2358 }
2359 
2360 static int
2361 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
2362     boolean_t *hma_release)
2363 {
2364 	dev_info_t	*pdip = ddi_get_parent(vmmdev_dip);
2365 	minor_t		minor;
2366 
2367 	ASSERT(MUTEX_HELD(&vmm_mtx));
2368 
2369 	*hma_release = B_FALSE;
2370 
2371 	if (vmm_drv_purge(sc, (opts & VDO_NO_PURGE_WAIT) != 0) != 0) {
2372 		return (EINTR);
2373 	}
2374 
2375 	if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
2376 		vmm_zsd_rem_vm(sc);
2377 	}
2378 
2379 	/* Clean up devmem entries */
2380 	vmmdev_devmem_purge(sc);
2381 
2382 	list_remove(&vmm_list, sc);
2383 	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2384 	minor = sc->vmm_minor;
2385 	zone_rele(sc->vmm_zone);
2386 	if (sc->vmm_is_open) {
2387 		list_insert_tail(&vmm_destroy_list, sc);
2388 		sc->vmm_flags |= VMM_DESTROY;
2389 	} else {
2390 		vmm_kstat_fini(sc);
2391 		vm_destroy(sc->vmm_vm);
2392 		ddi_soft_state_free(vmm_statep, minor);
2393 		id_free(vmm_minors, minor);
2394 		*hma_release = B_TRUE;
2395 	}
2396 	(void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
2397 
2398 	return (0);
2399 }
2400 
2401 int
2402 vmm_zone_vm_destroy(vmm_softc_t *sc)
2403 {
2404 	boolean_t	hma_release = B_FALSE;
2405 	int		err;
2406 
2407 	mutex_enter(&vmm_mtx);
2408 	err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
2409 	mutex_exit(&vmm_mtx);
2410 
2411 	if (hma_release)
2412 		vmm_hma_release();
2413 
2414 	return (err);
2415 }
2416 
2417 /* ARGSUSED */
2418 static int
2419 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2420 {
2421 	boolean_t	hma_release = B_FALSE;
2422 	vmm_softc_t	*sc;
2423 	int		err;
2424 
2425 	if (crgetuid(cr) != 0)
2426 		return (EPERM);
2427 
2428 	mutex_enter(&vmm_mtx);
2429 
2430 	if ((sc = vmm_lookup(req->name)) == NULL) {
2431 		mutex_exit(&vmm_mtx);
2432 		return (ENOENT);
2433 	}
2434 	/*
2435 	 * We don't check this in vmm_lookup() since that function is also used
2436 	 * for validation during create, and vmm names currently must be unique.
2437 	 */
2438 	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2439 		mutex_exit(&vmm_mtx);
2440 		return (EPERM);
2441 	}
2442 	err = vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release);
2443 
2444 	mutex_exit(&vmm_mtx);
2445 
2446 	if (hma_release)
2447 		vmm_hma_release();
2448 
2449 	return (err);
2450 }
2451 
2452 #define	VCPU_NAME_BUFLEN	32
2453 
2454 static int
2455 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2456 {
2457 	zoneid_t zid = crgetzoneid(cr);
2458 	int instance = minor;
2459 	kstat_t *ksp;
2460 
2461 	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2462 
2463 	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2464 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2465 	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2466 
2467 	if (ksp == NULL) {
2468 		return (-1);
2469 	}
2470 	sc->vmm_kstat_vm = ksp;
2471 
2472 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2473 		char namebuf[VCPU_NAME_BUFLEN];
2474 
2475 		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2476 
2477 		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2478 		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2479 		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2480 		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2481 		    0, zid);
2482 		if (ksp == NULL) {
2483 			goto fail;
2484 		}
2485 
2486 		sc->vmm_kstat_vcpu[i] = ksp;
2487 	}
2488 
2489 	/*
2490 	 * If this instance is associated with a non-global zone, make its
2491 	 * kstats visible from the GZ.
2492 	 */
2493 	if (zid != GLOBAL_ZONEID) {
2494 		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2495 		for (uint_t i = 0; i < VM_MAXCPU; i++) {
2496 			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2497 		}
2498 	}
2499 
2500 	return (0);
2501 
2502 fail:
2503 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2504 		if (sc->vmm_kstat_vcpu[i] != NULL) {
2505 			kstat_delete(sc->vmm_kstat_vcpu[i]);
2506 			sc->vmm_kstat_vcpu[i] = NULL;
2507 		} else {
2508 			break;
2509 		}
2510 	}
2511 	kstat_delete(sc->vmm_kstat_vm);
2512 	sc->vmm_kstat_vm = NULL;
2513 	return (-1);
2514 }
2515 
2516 static void
2517 vmm_kstat_init(vmm_softc_t *sc)
2518 {
2519 	kstat_t *ksp;
2520 
2521 	ASSERT3P(sc->vmm_vm, !=, NULL);
2522 	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2523 
2524 	ksp = sc->vmm_kstat_vm;
2525 	vmm_kstats_t *vk = ksp->ks_data;
2526 	ksp->ks_private = sc->vmm_vm;
2527 	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2528 	kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2529 
2530 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2531 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2532 
2533 		ksp = sc->vmm_kstat_vcpu[i];
2534 		vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2535 
2536 		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2537 		vvk->vvk_vcpu.value.ui32 = i;
2538 		kstat_named_init(&vvk->vvk_time_init, "time_init",
2539 		    KSTAT_DATA_UINT64);
2540 		kstat_named_init(&vvk->vvk_time_run, "time_run",
2541 		    KSTAT_DATA_UINT64);
2542 		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2543 		    KSTAT_DATA_UINT64);
2544 		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2545 		    KSTAT_DATA_UINT64);
2546 		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2547 		    KSTAT_DATA_UINT64);
2548 		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2549 		    KSTAT_DATA_UINT64);
2550 		ksp->ks_private = sc->vmm_vm;
2551 		ksp->ks_update = vmm_kstat_update_vcpu;
2552 	}
2553 
2554 	kstat_install(sc->vmm_kstat_vm);
2555 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2556 		kstat_install(sc->vmm_kstat_vcpu[i]);
2557 	}
2558 }
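/*
 * Observability note (illustrative): the kstats installed above surface
 * per-VM and per-vCPU data to userland.  Assuming VMM_MODULE_NAME is "vmm"
 * and the instance minor number is 5, a command along the lines of:
 *
 *	kstat -m vmm -i 5 -n vcpu0
 *
 * would show the time_* counters initialized here.
 */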
2559 
2560 static void
2561 vmm_kstat_fini(vmm_softc_t *sc)
2562 {
2563 	ASSERT(sc->vmm_kstat_vm != NULL);
2564 
2565 	kstat_delete(sc->vmm_kstat_vm);
2566 	sc->vmm_kstat_vm = NULL;
2567 
2568 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2569 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2570 
2571 		kstat_delete(sc->vmm_kstat_vcpu[i]);
2572 		sc->vmm_kstat_vcpu[i] = NULL;
2573 	}
2574 }
2575 
2576 static int
2577 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2578 {
2579 	minor_t		minor;
2580 	vmm_softc_t	*sc;
2581 
2582 	/*
2583 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2584 	 * verified to be safe.
2585 	 */
2586 	if (curproc->p_model != DATAMODEL_LP64) {
2587 		return (EFBIG);
2588 	}
2589 
2590 	minor = getminor(*devp);
2591 	if (minor == VMM_CTL_MINOR) {
2592 		/*
2593 		 * Master control device must be opened exclusively.
2594 		 */
2595 		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2596 			return (EINVAL);
2597 		}
2598 
2599 		return (0);
2600 	}
2601 
2602 	mutex_enter(&vmm_mtx);
2603 	sc = ddi_get_soft_state(vmm_statep, minor);
2604 	if (sc == NULL) {
2605 		mutex_exit(&vmm_mtx);
2606 		return (ENXIO);
2607 	}
2608 
2609 	sc->vmm_is_open = B_TRUE;
2610 	mutex_exit(&vmm_mtx);
2611 
2612 	return (0);
2613 }
2614 
2615 static int
2616 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2617 {
2618 	minor_t		minor;
2619 	vmm_softc_t	*sc;
2620 	boolean_t	hma_release = B_FALSE;
2621 
2622 	minor = getminor(dev);
2623 	if (minor == VMM_CTL_MINOR)
2624 		return (0);
2625 
2626 	mutex_enter(&vmm_mtx);
2627 	sc = ddi_get_soft_state(vmm_statep, minor);
2628 	if (sc == NULL) {
2629 		mutex_exit(&vmm_mtx);
2630 		return (ENXIO);
2631 	}
2632 
2633 	VERIFY(sc->vmm_is_open);
2634 	sc->vmm_is_open = B_FALSE;
2635 
2636 	/*
2637 	 * If this VM was destroyed while the vmm device was open, then
2638 	 * clean it up now that it is closed.
2639 	 */
2640 	if (sc->vmm_flags & VMM_DESTROY) {
2641 		list_remove(&vmm_destroy_list, sc);
2642 		vmm_kstat_fini(sc);
2643 		vm_destroy(sc->vmm_vm);
2644 		ddi_soft_state_free(vmm_statep, minor);
2645 		id_free(vmm_minors, minor);
2646 		hma_release = B_TRUE;
2647 	} else if (sc->vmm_autodestruct) {
2648 		/*
2649 		 * Attempt auto-destruct on instance if requested.
2650 		 *
2651 		 * Do not wait for existing holds to be purged from the
2652 		 * instance, since there is no guarantee that will happen in a
2653 		 * timely manner.  Auto-destruction will resume when the last
2654 		 * hold is released. (See: vmm_drv_rele)
2655 		 */
2656 		(void) vmm_destroy_locked(sc, VDO_NO_PURGE_WAIT, &hma_release);
2657 	}
2658 	mutex_exit(&vmm_mtx);
2659 
2660 	if (hma_release)
2661 		vmm_hma_release();
2662 
2663 	return (0);
2664 }
2665 
2666 static int
2667 vmm_is_supported(intptr_t arg)
2668 {
2669 	int r;
2670 	const char *msg;
2671 
2672 	if (vmm_is_intel()) {
2673 		r = vmx_x86_supported(&msg);
2674 	} else if (vmm_is_svm()) {
2675 		/*
2676 		 * HMA already ensured that the features necessary for SVM
2677 		 * operation were present and online during vmm_attach().
2678 		 */
2679 		r = 0;
2680 	} else {
2681 		r = ENXIO;
2682 		msg = "Unsupported CPU vendor";
2683 	}
2684 
2685 	if (r != 0 && arg != (intptr_t)NULL) {
2686 		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2687 			return (EFAULT);
2688 	}
2689 	return (r);
2690 }
2691 
2692 static int
2693 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
2694 {
2695 	void *argp = (void *)arg;
2696 
2697 	switch (cmd) {
2698 	case VMM_CREATE_VM: {
2699 		struct vm_create_req req;
2700 
2701 		if ((md & FWRITE) == 0) {
2702 			return (EPERM);
2703 		}
2704 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2705 			return (EFAULT);
2706 		}
2707 		return (vmmdev_do_vm_create(&req, cr));
2708 	}
2709 	case VMM_DESTROY_VM: {
2710 		struct vm_destroy_req req;
2711 
2712 		if ((md & FWRITE) == 0) {
2713 			return (EPERM);
2714 		}
2715 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2716 			return (EFAULT);
2717 		}
2718 		return (vmmdev_do_vm_destroy(&req, cr));
2719 	}
2720 	case VMM_VM_SUPPORTED:
2721 		return (vmm_is_supported(arg));
2722 	case VMM_INTERFACE_VERSION:
2723 		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
2724 		return (0);
2725 	case VMM_CHECK_IOMMU:
2726 		if (!vmm_check_iommu()) {
2727 			return (ENXIO);
2728 		}
2729 		return (0);
2730 	case VMM_RESV_QUERY:
2731 	case VMM_RESV_ADD:
2732 	case VMM_RESV_REMOVE:
2733 		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
2734 	default:
2735 		break;
2736 	}
2737 	/* No other actions are legal on ctl device */
2738 	/* No other actions are legal on the ctl device */
2739 }
2740 
2741 static int
2742 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2743     int *rvalp)
2744 {
2745 	vmm_softc_t	*sc;
2746 	minor_t		minor;
2747 
2748 	/*
2749 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2750 	 * verified to be safe.
2751 	 */
2752 	if (curproc->p_model != DATAMODEL_LP64) {
2753 		return (EFBIG);
2754 	}
2755 
2756 	/* The structs in bhyve ioctls assume a 64-bit datamodel */
2757 	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2758 		return (ENOTSUP);
2759 	}
2760 
2761 	minor = getminor(dev);
2762 
2763 	if (minor == VMM_CTL_MINOR) {
2764 		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
2765 	}
2766 
2767 	sc = ddi_get_soft_state(vmm_statep, minor);
2768 	ASSERT(sc);
2769 
2770 	if (sc->vmm_flags & VMM_DESTROY)
2771 		return (ENXIO);
2772 
2773 	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2774 }
2775 
2776 static int
2777 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2778     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2779 {
2780 	vmm_softc_t *sc;
2781 	const minor_t minor = getminor(dev);
2782 	int err;
2783 
2784 	if (minor == VMM_CTL_MINOR) {
2785 		return (ENODEV);
2786 	}
2787 	if (off < 0 || (off + len) <= 0) {
2788 		return (EINVAL);
2789 	}
2790 	if ((prot & PROT_USER) == 0) {
2791 		return (EACCES);
2792 	}
2793 
2794 	sc = ddi_get_soft_state(vmm_statep, minor);
2795 	ASSERT(sc);
2796 
2797 	if (sc->vmm_flags & VMM_DESTROY)
2798 		return (ENXIO);
2799 
2800 	/* Grab read lock on the VM to prevent any changes to the memory map */
2801 	vmm_read_lock(sc);
2802 
2803 	if (off >= VM_DEVMEM_START) {
2804 		int segid;
2805 		off_t segoff;
2806 
2807 		/* Mapping a devmem "device" */
2808 		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
2809 			err = ENODEV;
2810 		} else {
2811 			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
2812 			    addrp, prot, maxprot, flags);
2813 		}
2814 	} else {
2815 		/* Mapping a part of the guest physical space */
2816 		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
2817 		    maxprot, flags);
2818 	}
2819 
2820 	vmm_read_unlock(sc);
2821 	return (err);
2822 }
2823 
2824 static sdev_plugin_validate_t
2825 vmm_sdev_validate(sdev_ctx_t ctx)
2826 {
2827 	const char *name = sdev_ctx_name(ctx);
2828 	vmm_softc_t *sc;
2829 	sdev_plugin_validate_t ret;
2830 	minor_t minor;
2831 
2832 	if (sdev_ctx_vtype(ctx) != VCHR)
2833 		return (SDEV_VTOR_INVALID);
2834 
2835 	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2836 
2837 	mutex_enter(&vmm_mtx);
2838 	if ((sc = vmm_lookup(name)) == NULL)
2839 		ret = SDEV_VTOR_INVALID;
2840 	else if (sc->vmm_minor != minor)
2841 		ret = SDEV_VTOR_STALE;
2842 	else
2843 		ret = SDEV_VTOR_VALID;
2844 	mutex_exit(&vmm_mtx);
2845 
2846 	return (ret);
2847 }
2848 
2849 static int
2850 vmm_sdev_filldir(sdev_ctx_t ctx)
2851 {
2852 	vmm_softc_t *sc;
2853 	int ret;
2854 
2855 	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2856 		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2857 		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2858 		return (EINVAL);
2859 	}
2860 
2861 	mutex_enter(&vmm_mtx);
2862 	ASSERT(vmmdev_dip != NULL);
2863 	for (sc = list_head(&vmm_list); sc != NULL;
2864 	    sc = list_next(&vmm_list, sc)) {
2865 		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2866 			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2867 			    S_IFCHR | 0600,
2868 			    makedevice(ddi_driver_major(vmmdev_dip),
2869 			    sc->vmm_minor));
2870 		} else {
2871 			continue;
2872 		}
2873 		if (ret != 0 && ret != EEXIST)
2874 			goto out;
2875 	}
2876 
2877 	ret = 0;
2878 
2879 out:
2880 	mutex_exit(&vmm_mtx);
2881 	return (ret);
2882 }
2883 
2884 /* ARGSUSED */
2885 static void
2886 vmm_sdev_inactive(sdev_ctx_t ctx)
2887 {
2888 }
2889 
2890 static sdev_plugin_ops_t vmm_sdev_ops = {
2891 	.spo_version = SDEV_PLUGIN_VERSION,
2892 	.spo_flags = SDEV_PLUGIN_SUBDIR,
2893 	.spo_validate = vmm_sdev_validate,
2894 	.spo_filldir = vmm_sdev_filldir,
2895 	.spo_inactive = vmm_sdev_inactive
2896 };
2897 
2898 /* ARGSUSED */
2899 static int
2900 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2901 {
2902 	int error;
2903 
2904 	switch (cmd) {
2905 	case DDI_INFO_DEVT2DEVINFO:
2906 		*result = (void *)vmmdev_dip;
2907 		error = DDI_SUCCESS;
2908 		break;
2909 	case DDI_INFO_DEVT2INSTANCE:
2910 		*result = (void *)0;
2911 		error = DDI_SUCCESS;
2912 		break;
2913 	default:
2914 		error = DDI_FAILURE;
2915 		break;
2916 	}
2917 	return (error);
2918 }
2919 
2920 static int
2921 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2922 {
2923 	sdev_plugin_hdl_t sph;
2924 	hma_reg_t *reg = NULL;
2925 	boolean_t vmm_loaded = B_FALSE;
2926 
2927 	if (cmd != DDI_ATTACH) {
2928 		return (DDI_FAILURE);
2929 	}
2930 
2931 	mutex_enter(&vmmdev_mtx);
2932 	/* Ensure we are not already attached. */
2933 	if (vmmdev_dip != NULL) {
2934 		mutex_exit(&vmmdev_mtx);
2935 		return (DDI_FAILURE);
2936 	}
2937 
2938 	vmm_sol_glue_init();
2939 
2940 	/*
2941 	 * Perform a temporary HMA registration to determine if the system is
2942 	 * capable of hosting bhyve VMs.
2943 	 */
2944 	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2945 		goto fail;
2946 	} else if (vmm_mod_load() != 0) {
2947 		goto fail;
2948 	}
2949 	vmm_loaded = B_TRUE;
2950 	hma_unregister(reg);
2951 	reg = NULL;
2952 
2953 	/* Create control node.  Other nodes will be created on demand. */
2954 	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2955 	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2956 		goto fail;
2957 	}
2958 
2959 	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
2960 	if (sph == (sdev_plugin_hdl_t)NULL) {
2961 		ddi_remove_minor_node(dip, NULL);
2962 		goto fail;
2963 	}
2964 
2965 	ddi_report_dev(dip);
2966 	vmmdev_sdev_hdl = sph;
2967 	vmmdev_dip = dip;
2968 	mutex_exit(&vmmdev_mtx);
2969 	return (DDI_SUCCESS);
2970 
2971 fail:
2972 	if (vmm_loaded) {
2973 		VERIFY0(vmm_mod_unload());
2974 	}
2975 	if (reg != NULL) {
2976 		hma_unregister(reg);
2977 	}
2978 	vmm_sol_glue_cleanup();
2979 	mutex_exit(&vmmdev_mtx);
2980 	return (DDI_FAILURE);
2981 }
2982 
2983 static int
2984 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2985 {
2986 	if (cmd != DDI_DETACH) {
2987 		return (DDI_FAILURE);
2988 	}
2989 
2990 	/*
2991 	 * Ensure that all resources have been cleaned up.
2992 	 *
2993 	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
2994 	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
2995 	 * devinfo locked as iommu_cleanup() tries to recursively lock each
2996 	 * devinfo, including our own, while holding vmmdev_mtx.
2997 	 */
2998 	if (mutex_tryenter(&vmmdev_mtx) == 0)
2999 		return (DDI_FAILURE);
3000 
3001 	mutex_enter(&vmm_mtx);
3002 	if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
3003 		mutex_exit(&vmm_mtx);
3004 		mutex_exit(&vmmdev_mtx);
3005 		return (DDI_FAILURE);
3006 	}
3007 	mutex_exit(&vmm_mtx);
3008 
3009 	if (!vmmr_is_empty()) {
3010 		mutex_exit(&vmmdev_mtx);
3011 		return (DDI_FAILURE);
3012 	}
3013 
3014 	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
3015 	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
3016 		mutex_exit(&vmmdev_mtx);
3017 		return (DDI_FAILURE);
3018 	}
3019 	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
3020 
3021 	/* Remove the control node. */
3022 	ddi_remove_minor_node(dip, "ctl");
3023 	vmmdev_dip = NULL;
3024 
3025 	VERIFY0(vmm_mod_unload());
3026 	VERIFY3U(vmmdev_hma_reg, ==, NULL);
3027 	vmm_sol_glue_cleanup();
3028 
3029 	mutex_exit(&vmmdev_mtx);
3030 
3031 	return (DDI_SUCCESS);
3032 }
3033 
3034 static struct cb_ops vmm_cb_ops = {
3035 	vmm_open,
3036 	vmm_close,
3037 	nodev,		/* strategy */
3038 	nodev,		/* print */
3039 	nodev,		/* dump */
3040 	nodev,		/* read */
3041 	nodev,		/* write */
3042 	vmm_ioctl,
3043 	nodev,		/* devmap */
3044 	nodev,		/* mmap */
3045 	vmm_segmap,
3046 	nochpoll,	/* poll */
3047 	ddi_prop_op,
3048 	NULL,
3049 	D_NEW | D_MP | D_DEVMAP
3050 };
3051 
3052 static struct dev_ops vmm_ops = {
3053 	DEVO_REV,
3054 	0,
3055 	vmm_info,
3056 	nulldev,	/* identify */
3057 	nulldev,	/* probe */
3058 	vmm_attach,
3059 	vmm_detach,
3060 	nodev,		/* reset */
3061 	&vmm_cb_ops,
3062 	(struct bus_ops *)NULL
3063 };
3064 
3065 static struct modldrv modldrv = {
3066 	&mod_driverops,
3067 	"bhyve vmm",
3068 	&vmm_ops
3069 };
3070 
3071 static struct modlinkage modlinkage = {
3072 	MODREV_1,
3073 	&modldrv,
3074 	NULL
3075 };
3076 
3077 int
3078 _init(void)
3079 {
3080 	int	error;
3081 
3082 	sysinit();
3083 
3084 	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
3085 	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
3086 	list_create(&vmm_list, sizeof (vmm_softc_t),
3087 	    offsetof(vmm_softc_t, vmm_node));
3088 	list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
3089 	    offsetof(vmm_softc_t, vmm_node));
3090 	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
3091 
3092 	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
3093 	if (error) {
3094 		return (error);
3095 	}
3096 
3097 	vmm_zsd_init();
3098 	vmmr_init();
3099 
3100 	error = mod_install(&modlinkage);
3101 	if (error) {
3102 		ddi_soft_state_fini(&vmm_statep);
3103 		vmm_zsd_fini();
3104 		vmmr_fini();
3105 	}
3106 
3107 	return (error);
3108 }
3109 
3110 int
3111 _fini(void)
3112 {
3113 	int	error;
3114 
3115 	error = mod_remove(&modlinkage);
3116 	if (error) {
3117 		return (error);
3118 	}
3119 
3120 	vmm_zsd_fini();
3121 	vmmr_fini();
3122 
3123 	ddi_soft_state_fini(&vmm_statep);
3124 
3125 	return (0);
3126 }
3127 
3128 int
3129 _info(struct modinfo *modinfop)
3130 {
3131 	return (mod_info(&modlinkage, modinfop));
3132 }
3133