xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_sol_dev.c (revision badf94ff3599fab15963f6c532929e9bc411757a)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2015 Pluribus Networks Inc.
15  * Copyright 2019 Joyent, Inc.
16  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17  * Copyright 2021 Oxide Computer Company
18  */
19 
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35 
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40 
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49 
50 #include <vm/seg_dev.h>
51 
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61 
62 /*
63  * Locking details:
64  *
65  * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is
66  * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
67  * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
68  * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
69  * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
70  */
71 
/*
 * Driver-wide state (vmmdev_*).  Per the locking notes above, this data,
 * including the HMA and sdev registrations, is protected by vmmdev_mtx.
 */
static kmutex_t		vmmdev_mtx;
static dev_info_t	*vmmdev_dip;
static hma_reg_t	*vmmdev_hma_reg;
static uint_t		vmmdev_hma_ref;
static sdev_plugin_hdl_t vmmdev_sdev_hdl;

/*
 * Instance tracking state (vmm_*).  Per the locking notes above, the list of
 * vmm_softc_t instances and its related data are protected by vmm_mtx, which
 * must be taken after vmmdev_mtx when both are required.
 */
static kmutex_t		vmm_mtx;
static list_t		vmm_list;
static list_t		vmm_destroy_list;
static id_space_t	*vmm_minors;
static void		*vmm_statep;

/* NOTE(review): presumably the name presented when registering with HMA. */
static const char *vmmdev_hvm_name = "bhyve";
85 
86 /* For sdev plugin (/dev) */
87 #define	VMM_SDEV_ROOT "/dev/vmm"
88 
89 /* From uts/intel/io/vmm/intel/vmx.c */
90 extern int vmx_x86_supported(const char **);
91 
92 /* Holds and hooks from drivers external to vmm */
/*
 * A hold placed on a VM instance by a driver external to vmm.
 *
 * NOTE(review): the code manipulating these fields lies outside this part of
 * the file; the per-field notes below are inferred from names and types and
 * should be confirmed against the vmm_drv_* functions.
 */
struct vmm_hold {
	/* Linkage for a list_t of holds */
	list_node_t	vmh_node;
	/* Instance this hold refers to */
	vmm_softc_t	*vmh_sc;
	/* Presumably set when the holder has been asked to release */
	boolean_t	vmh_release_req;
	/* Presumably the number of ioport hooks installed via this hold */
	uint_t		vmh_ioport_hook_cnt;
};
99 
/*
 * A lease taken out by a Driver API consumer.  As described in the "Resource
 * Locking and Exclusion" notes below, such consumers may hold the VM read
 * lock for extended periods through the lease mechanism, and must provide a
 * callback through which they can be asked to give it up.
 *
 * NOTE(review): the code manipulating these fields lies outside this part of
 * the file; the per-field notes below are inferred from names and types and
 * should be confirmed against the vmm_drv_lease_* functions.
 */
struct vmm_lease {
	/* Linkage for a list_t of leases */
	list_node_t		vml_node;
	/* VM which the lease covers */
	struct vm		*vml_vm;
	/* Presumably the vm_client_t used for guest memory access */
	vm_client_t		*vml_vmclient;
	/* Presumably set once the lease has been expired */
	boolean_t		vml_expired;
	/* Presumably set when breaking the lease was deferred by the holder */
	boolean_t		vml_break_deferred;
	/* Expiry callback and its argument, as supplied by the consumer */
	boolean_t		(*vml_expire_func)(void *);
	void			*vml_expire_arg;
	/* Hold with which this lease is associated */
	struct vmm_hold		*vml_hold;
};
110 
111 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
112 static void vmm_lease_block(vmm_softc_t *);
113 static void vmm_lease_unblock(vmm_softc_t *);
114 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
115 static void vmm_kstat_init(vmm_softc_t *);
116 static void vmm_kstat_fini(vmm_softc_t *);
117 
118 /*
119  * The 'devmem' hack:
120  *
121  * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
122  * in the vm which appear with their own name related to the vm under /dev.
123  * Since this would be a hassle from an sdev perspective and would require a
124  * new cdev interface (or complicate the existing one), we choose to implement
125  * this in a different manner.  Direct access to the underlying vm memory
126  * segments is exposed by placing them in a range of offsets beyond the normal
127  * guest memory space.  Userspace can query the appropriate offset to mmap()
128  * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
129  */
130 
131 static vmm_devmem_entry_t *
132 vmmdev_devmem_find(vmm_softc_t *sc, int segid)
133 {
134 	vmm_devmem_entry_t *ent = NULL;
135 	list_t *dl = &sc->vmm_devmem_list;
136 
137 	for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
138 		if (ent->vde_segid == segid) {
139 			return (ent);
140 		}
141 	}
142 	return (NULL);
143 }
144 
145 static int
146 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
147 {
148 	int error;
149 	bool sysmem;
150 
151 	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
152 	    NULL);
153 	if (error || mseg->len == 0)
154 		return (error);
155 
156 	if (!sysmem) {
157 		vmm_devmem_entry_t *de;
158 
159 		de = vmmdev_devmem_find(sc, mseg->segid);
160 		if (de != NULL) {
161 			(void) strlcpy(mseg->name, de->vde_name,
162 			    sizeof (mseg->name));
163 		}
164 	} else {
165 		bzero(mseg->name, sizeof (mseg->name));
166 	}
167 
168 	return (error);
169 }
170 
171 static int
172 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
173 {
174 	off_t map_offset;
175 	vmm_devmem_entry_t *entry;
176 
177 	if (list_is_empty(&sc->vmm_devmem_list)) {
178 		map_offset = VM_DEVMEM_START;
179 	} else {
180 		entry = list_tail(&sc->vmm_devmem_list);
181 		map_offset = entry->vde_off + entry->vde_len;
182 		if (map_offset < entry->vde_off) {
183 			/* Do not tolerate overflow */
184 			return (ERANGE);
185 		}
186 		/*
187 		 * XXXJOY: We could choose to search the list for duplicate
188 		 * names and toss an error.  Since we're using the offset
189 		 * method for now, it does not make much of a difference.
190 		 */
191 	}
192 
193 	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
194 	entry->vde_segid = mseg->segid;
195 	entry->vde_len = mseg->len;
196 	entry->vde_off = map_offset;
197 	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
198 	list_insert_tail(&sc->vmm_devmem_list, entry);
199 
200 	return (0);
201 }
202 
203 static boolean_t
204 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
205     off_t *map_offp)
206 {
207 	list_t *dl = &sc->vmm_devmem_list;
208 	vmm_devmem_entry_t *de = NULL;
209 	const off_t map_end = off + len;
210 
211 	VERIFY(off >= VM_DEVMEM_START);
212 
213 	if (map_end < off) {
214 		/* No match on overflow */
215 		return (B_FALSE);
216 	}
217 
218 	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
219 		const off_t item_end = de->vde_off + de->vde_len;
220 
221 		if (de->vde_off <= off && item_end >= map_end) {
222 			*segidp = de->vde_segid;
223 			*map_offp = off - de->vde_off;
224 			return (B_TRUE);
225 		}
226 	}
227 	return (B_FALSE);
228 }
229 
230 static void
231 vmmdev_devmem_purge(vmm_softc_t *sc)
232 {
233 	vmm_devmem_entry_t *entry;
234 
235 	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
236 		kmem_free(entry, sizeof (*entry));
237 	}
238 }
239 
240 static int
241 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
242 {
243 	int error;
244 	bool sysmem = true;
245 
246 	if (VM_MEMSEG_NAME(mseg)) {
247 		sysmem = false;
248 	}
249 	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
250 
251 	if (error == 0) {
252 		/*
253 		 * Rather than create a whole fresh device from which userspace
254 		 * can mmap this segment, instead make it available at an
255 		 * offset above where the main guest memory resides.
256 		 */
257 		error = vmmdev_devmem_create(sc, mseg, mseg->name);
258 		if (error != 0) {
259 			vm_free_memseg(sc->vmm_vm, mseg->segid);
260 		}
261 	}
262 	return (error);
263 }
264 
265 /*
266  * Resource Locking and Exclusion
267  *
268  * Much of bhyve depends on key portions of VM state, such as the guest memory
269  * map, to remain unchanged while the guest is running.  As ported from
270  * FreeBSD, the initial strategy for this resource exclusion hinged on gating
271  * access to the instance vCPUs.  Threads acting on a single vCPU, like those
272  * performing the work of actually running the guest in VMX/SVM, would lock
273  * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
274  * state, all of the vCPUs would be first locked, ensuring that the
275  * operation(s) could complete without any other threads stumbling into
276  * intermediate states.
277  *
278  * This approach is largely effective for bhyve.  Common operations, such as
279  * running the vCPUs, steer clear of lock contention.  The model begins to
280  * break down for operations which do not occur in the context of a specific
281  * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
282  * thread in the bhyve process.  In order to properly protect those vCPU-less
283  * operations from encountering invalid states, additional locking is required.
284  * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
285  * It does mean that class of operations will be serialized on locking the
286  * specific vCPU and that instances sized at VM_MAXCPU will potentially see
287  * undue contention on the VM_MAXCPU-1 vCPU.
288  *
289  * In order to address the shortcomings of this model, the concept of a
290  * read/write lock has been added to bhyve.  Operations which change
291  * fundamental aspects of a VM (such as the memory map) must acquire the write
292  * lock, which also implies locking all of the vCPUs and waiting for all read
293  * lock holders to release.  While it increases the cost and waiting time for
294  * those few operations, it allows most hot-path operations on the VM (which
295  * depend on its configuration remaining stable) to occur with minimal locking.
296  *
297  * Consumers of the Driver API (see below) are a special case when it comes to
298  * this locking, since they may hold a read lock via the drv_lease mechanism
299  * for an extended period of time.  Rather than forcing those consumers to
300  * continuously poll for a write lock attempt, the lease system forces them to
301  * provide a release callback to trigger their clean-up (and potential later
302  * reacquisition) of the read lock.
303  */
304 
/*
 * Lock a single vCPU of the instance by transitioning it to VCPU_FROZEN.
 *
 * 'vcpu' is asserted to lie in the range [0, VM_MAXCPU).  The transition is
 * verified to succeed.  vcpu_unlock_one() is the inverse operation.
 */
static void
vcpu_lock_one(vmm_softc_t *sc, int vcpu)
{
	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

	/*
	 * Since this state transition is utilizing from_idle=true, it should
	 * not fail, but rather block until it can be successful.
	 */
	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
}
316 
/*
 * Unlock a single vCPU previously locked with vcpu_lock_one() by moving it
 * from VCPU_FROZEN back to VCPU_IDLE.
 *
 * 'vcpu' is asserted to lie in the range [0, VM_MAXCPU).
 */
static void
vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
{
	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

	/* The vCPU must still be in the state which locking left it in. */
	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
	/* Unlike the locking transition, this one uses from_idle=false. */
	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
}
325 
/*
 * Acquire the VM read lock: enter the instance rwlock as a reader.
 * Released with vmm_read_unlock().
 */
static void
vmm_read_lock(vmm_softc_t *sc)
{
	rw_enter(&sc->vmm_rwlock, RW_READER);
}
331 
/*
 * Release the VM read lock taken by vmm_read_lock().
 */
static void
vmm_read_unlock(vmm_softc_t *sc)
{
	rw_exit(&sc->vmm_rwlock);
}
337 
338 static void
339 vmm_write_lock(vmm_softc_t *sc)
340 {
341 	int maxcpus;
342 
343 	/* First lock all the vCPUs */
344 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
345 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
346 		vcpu_lock_one(sc, vcpu);
347 	}
348 
349 	/*
350 	 * Block vmm_drv leases from being acquired or held while the VM write
351 	 * lock is held.
352 	 */
353 	vmm_lease_block(sc);
354 
355 	rw_enter(&sc->vmm_rwlock, RW_WRITER);
356 	/*
357 	 * For now, the 'maxcpus' value for an instance is fixed at the
358 	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
359 	 * the future, allowing for dynamic vCPU resource sizing, acquisition
360 	 * of the write lock will need to be wary of such changes.
361 	 */
362 	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
363 }
364 
365 static void
366 vmm_write_unlock(vmm_softc_t *sc)
367 {
368 	int maxcpus;
369 
370 	/* Allow vmm_drv leases to be acquired once write lock is dropped */
371 	vmm_lease_unblock(sc);
372 
373 	/*
374 	 * The VM write lock _must_ be released from the same thread it was
375 	 * acquired in, unlike the read lock.
376 	 */
377 	VERIFY(rw_write_held(&sc->vmm_rwlock));
378 	rw_exit(&sc->vmm_rwlock);
379 
380 	/* Unlock all the vCPUs */
381 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
382 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
383 		vcpu_unlock_one(sc, vcpu);
384 	}
385 }
386 
387 static int
388 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
389     cred_t *credp, int *rvalp)
390 {
391 	int error = 0, vcpu = -1;
392 	void *datap = (void *)arg;
393 	enum vm_lock_type {
394 		LOCK_NONE = 0,
395 		LOCK_VCPU,
396 		LOCK_READ_HOLD,
397 		LOCK_WRITE_HOLD
398 	} lock_type = LOCK_NONE;
399 
400 	/* Acquire any exclusion resources needed for the operation. */
401 	switch (cmd) {
402 	case VM_RUN:
403 	case VM_GET_REGISTER:
404 	case VM_SET_REGISTER:
405 	case VM_GET_SEGMENT_DESCRIPTOR:
406 	case VM_SET_SEGMENT_DESCRIPTOR:
407 	case VM_GET_REGISTER_SET:
408 	case VM_SET_REGISTER_SET:
409 	case VM_INJECT_EXCEPTION:
410 	case VM_GET_CAPABILITY:
411 	case VM_SET_CAPABILITY:
412 	case VM_PPTDEV_MSI:
413 	case VM_PPTDEV_MSIX:
414 	case VM_SET_X2APIC_STATE:
415 	case VM_GLA2GPA:
416 	case VM_GLA2GPA_NOFAULT:
417 	case VM_ACTIVATE_CPU:
418 	case VM_SET_INTINFO:
419 	case VM_GET_INTINFO:
420 	case VM_RESTART_INSTRUCTION:
421 	case VM_SET_KERNEMU_DEV:
422 	case VM_GET_KERNEMU_DEV:
423 	case VM_RESET_CPU:
424 	case VM_GET_RUN_STATE:
425 	case VM_SET_RUN_STATE:
426 	case VM_GET_FPU:
427 	case VM_SET_FPU:
428 		/*
429 		 * Copy in the ID of the vCPU chosen for this operation.
430 		 * Since a nefarious caller could update their struct between
431 		 * this locking and when the rest of the ioctl data is copied
432 		 * in, it is _critical_ that this local 'vcpu' variable be used
433 		 * rather than the in-struct one when performing the ioctl.
434 		 */
435 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
436 			return (EFAULT);
437 		}
438 		if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) {
439 			return (EINVAL);
440 		}
441 		vcpu_lock_one(sc, vcpu);
442 		lock_type = LOCK_VCPU;
443 		break;
444 
445 	case VM_REINIT:
446 	case VM_BIND_PPTDEV:
447 	case VM_UNBIND_PPTDEV:
448 	case VM_MAP_PPTDEV_MMIO:
449 	case VM_UNMAP_PPTDEV_MMIO:
450 	case VM_ALLOC_MEMSEG:
451 	case VM_MMAP_MEMSEG:
452 	case VM_MUNMAP_MEMSEG:
453 	case VM_WRLOCK_CYCLE:
454 	case VM_PMTMR_LOCATE:
455 		vmm_write_lock(sc);
456 		lock_type = LOCK_WRITE_HOLD;
457 		break;
458 
459 	case VM_GET_MEMSEG:
460 	case VM_MMAP_GETNEXT:
461 	case VM_LAPIC_IRQ:
462 	case VM_INJECT_NMI:
463 	case VM_IOAPIC_ASSERT_IRQ:
464 	case VM_IOAPIC_DEASSERT_IRQ:
465 	case VM_IOAPIC_PULSE_IRQ:
466 	case VM_LAPIC_MSI:
467 	case VM_LAPIC_LOCAL_IRQ:
468 	case VM_GET_X2APIC_STATE:
469 	case VM_RTC_READ:
470 	case VM_RTC_WRITE:
471 	case VM_RTC_SETTIME:
472 	case VM_RTC_GETTIME:
473 	case VM_PPTDEV_DISABLE_MSIX:
474 	case VM_DEVMEM_GETOFFSET:
475 	case VM_TRACK_DIRTY_PAGES:
476 		vmm_read_lock(sc);
477 		lock_type = LOCK_READ_HOLD;
478 		break;
479 
480 	case VM_GET_GPA_PMAP:
481 	case VM_IOAPIC_PINCOUNT:
482 	case VM_SUSPEND:
483 	case VM_DESC_FPU_AREA:
484 	default:
485 		break;
486 	}
487 
488 	/* Execute the primary logic for the ioctl. */
489 	switch (cmd) {
490 	case VM_RUN: {
491 		struct vm_entry entry;
492 
493 		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
494 			error = EFAULT;
495 			break;
496 		}
497 
498 		if (!(curthread->t_schedflag & TS_VCPU))
499 			smt_mark_as_vcpu();
500 
501 		error = vm_run(sc->vmm_vm, vcpu, &entry);
502 
503 		/*
504 		 * Unexpected states in vm_run() are expressed through positive
505 		 * errno-oriented return values.  VM states which expect further
506 		 * processing in userspace (necessary context via exitinfo) are
507 		 * expressed through negative return values.  For the time being
508 		 * a return value of 0 is not expected from vm_run().
509 		 */
510 		ASSERT(error != 0);
511 		if (error < 0) {
512 			const struct vm_exit *vme;
513 			void *outp = entry.exit_data;
514 
515 			error = 0;
516 			vme = vm_exitinfo(sc->vmm_vm, vcpu);
517 			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
518 				error = EFAULT;
519 			}
520 		}
521 		break;
522 	}
523 	case VM_SUSPEND: {
524 		struct vm_suspend vmsuspend;
525 
526 		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
527 			error = EFAULT;
528 			break;
529 		}
530 		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
531 		break;
532 	}
533 	case VM_REINIT: {
534 		struct vm_reinit reinit;
535 
536 		if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
537 			error = EFAULT;
538 			break;
539 		}
540 		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
541 			/*
542 			 * The VM instance should be free of driver-attached
543 			 * hooks during the reinitialization process.
544 			 */
545 			break;
546 		}
547 		error = vm_reinit(sc->vmm_vm, reinit.flags);
548 		(void) vmm_drv_block_hook(sc, B_FALSE);
549 		break;
550 	}
551 	case VM_STAT_DESC: {
552 		struct vm_stat_desc statdesc;
553 
554 		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
555 			error = EFAULT;
556 			break;
557 		}
558 		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
559 		    sizeof (statdesc.desc));
560 		if (error == 0 &&
561 		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
562 			error = EFAULT;
563 			break;
564 		}
565 		break;
566 	}
567 	case VM_STATS_IOC: {
568 		struct vm_stats vmstats;
569 
570 		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
571 		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
572 			error = EFAULT;
573 			break;
574 		}
575 		hrt2tv(gethrtime(), &vmstats.tv);
576 		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
577 		    &vmstats.num_entries, vmstats.statbuf);
578 		if (error == 0 &&
579 		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
580 			error = EFAULT;
581 			break;
582 		}
583 		break;
584 	}
585 
586 	case VM_PPTDEV_MSI: {
587 		struct vm_pptdev_msi pptmsi;
588 
589 		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
590 			error = EFAULT;
591 			break;
592 		}
593 		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
594 		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
595 		break;
596 	}
597 	case VM_PPTDEV_MSIX: {
598 		struct vm_pptdev_msix pptmsix;
599 
600 		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
601 			error = EFAULT;
602 			break;
603 		}
604 		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
605 		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
606 		    pptmsix.vector_control);
607 		break;
608 	}
609 	case VM_PPTDEV_DISABLE_MSIX: {
610 		struct vm_pptdev pptdev;
611 
612 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
613 			error = EFAULT;
614 			break;
615 		}
616 		error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
617 		break;
618 	}
619 	case VM_MAP_PPTDEV_MMIO: {
620 		struct vm_pptdev_mmio pptmmio;
621 
622 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
623 			error = EFAULT;
624 			break;
625 		}
626 		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
627 		    pptmmio.len, pptmmio.hpa);
628 		break;
629 	}
630 	case VM_UNMAP_PPTDEV_MMIO: {
631 		struct vm_pptdev_mmio pptmmio;
632 
633 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
634 			error = EFAULT;
635 			break;
636 		}
637 		error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
638 		    pptmmio.len);
639 		break;
640 	}
641 	case VM_BIND_PPTDEV: {
642 		struct vm_pptdev pptdev;
643 
644 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
645 			error = EFAULT;
646 			break;
647 		}
648 		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
649 		break;
650 	}
651 	case VM_UNBIND_PPTDEV: {
652 		struct vm_pptdev pptdev;
653 
654 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
655 			error = EFAULT;
656 			break;
657 		}
658 		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
659 		break;
660 	}
661 	case VM_GET_PPTDEV_LIMITS: {
662 		struct vm_pptdev_limits pptlimits;
663 
664 		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
665 			error = EFAULT;
666 			break;
667 		}
668 		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
669 		    &pptlimits.msi_limit, &pptlimits.msix_limit);
670 		if (error == 0 &&
671 		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
672 			error = EFAULT;
673 			break;
674 		}
675 		break;
676 	}
677 	case VM_INJECT_EXCEPTION: {
678 		struct vm_exception vmexc;
679 		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
680 			error = EFAULT;
681 			break;
682 		}
683 		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
684 		    vmexc.error_code_valid, vmexc.error_code,
685 		    vmexc.restart_instruction);
686 		break;
687 	}
688 	case VM_INJECT_NMI: {
689 		struct vm_nmi vmnmi;
690 
691 		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
692 			error = EFAULT;
693 			break;
694 		}
695 		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
696 		break;
697 	}
698 	case VM_LAPIC_IRQ: {
699 		struct vm_lapic_irq vmirq;
700 
701 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
702 			error = EFAULT;
703 			break;
704 		}
705 		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
706 		break;
707 	}
708 	case VM_LAPIC_LOCAL_IRQ: {
709 		struct vm_lapic_irq vmirq;
710 
711 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
712 			error = EFAULT;
713 			break;
714 		}
715 		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
716 		    vmirq.vector);
717 		break;
718 	}
719 	case VM_LAPIC_MSI: {
720 		struct vm_lapic_msi vmmsi;
721 
722 		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
723 			error = EFAULT;
724 			break;
725 		}
726 		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
727 		break;
728 	}
729 
730 	case VM_IOAPIC_ASSERT_IRQ: {
731 		struct vm_ioapic_irq ioapic_irq;
732 
733 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
734 			error = EFAULT;
735 			break;
736 		}
737 		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
738 		break;
739 	}
740 	case VM_IOAPIC_DEASSERT_IRQ: {
741 		struct vm_ioapic_irq ioapic_irq;
742 
743 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
744 			error = EFAULT;
745 			break;
746 		}
747 		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
748 		break;
749 	}
750 	case VM_IOAPIC_PULSE_IRQ: {
751 		struct vm_ioapic_irq ioapic_irq;
752 
753 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
754 			error = EFAULT;
755 			break;
756 		}
757 		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
758 		break;
759 	}
760 	case VM_IOAPIC_PINCOUNT: {
761 		int pincount;
762 
763 		pincount = vioapic_pincount(sc->vmm_vm);
764 		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
765 			error = EFAULT;
766 			break;
767 		}
768 		break;
769 	}
770 	case VM_DESC_FPU_AREA: {
771 		struct vm_fpu_desc desc;
772 		void *buf = NULL;
773 
774 		if (ddi_copyin(datap, &desc, sizeof (desc), md)) {
775 			error = EFAULT;
776 			break;
777 		}
778 		if (desc.vfd_num_entries > 64) {
779 			error = EINVAL;
780 			break;
781 		}
782 		const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) *
783 		    desc.vfd_num_entries;
784 		if (buf_sz != 0) {
785 			buf = kmem_zalloc(buf_sz, KM_SLEEP);
786 		}
787 
788 		/*
789 		 * For now, we are depending on vm_fpu_desc_entry and
790 		 * hma_xsave_state_desc_t having the same format.
791 		 */
792 		CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
793 		    sizeof (hma_xsave_state_desc_t));
794 
795 		size_t req_size;
796 		const uint_t max_entries = hma_fpu_describe_xsave_state(
797 		    (hma_xsave_state_desc_t *)buf,
798 		    desc.vfd_num_entries,
799 		    &req_size);
800 
801 		desc.vfd_req_size = req_size;
802 		desc.vfd_num_entries = max_entries;
803 		if (buf_sz != 0) {
804 			if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
805 				error = EFAULT;
806 			}
807 			kmem_free(buf, buf_sz);
808 		}
809 
810 		if (error == 0) {
811 			if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
812 				error = EFAULT;
813 			}
814 		}
815 		break;
816 	}
817 
818 	case VM_ISA_ASSERT_IRQ: {
819 		struct vm_isa_irq isa_irq;
820 
821 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
822 			error = EFAULT;
823 			break;
824 		}
825 		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
826 		if (error == 0 && isa_irq.ioapic_irq != -1) {
827 			error = vioapic_assert_irq(sc->vmm_vm,
828 			    isa_irq.ioapic_irq);
829 		}
830 		break;
831 	}
832 	case VM_ISA_DEASSERT_IRQ: {
833 		struct vm_isa_irq isa_irq;
834 
835 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
836 			error = EFAULT;
837 			break;
838 		}
839 		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
840 		if (error == 0 && isa_irq.ioapic_irq != -1) {
841 			error = vioapic_deassert_irq(sc->vmm_vm,
842 			    isa_irq.ioapic_irq);
843 		}
844 		break;
845 	}
846 	case VM_ISA_PULSE_IRQ: {
847 		struct vm_isa_irq isa_irq;
848 
849 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
850 			error = EFAULT;
851 			break;
852 		}
853 		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
854 		if (error == 0 && isa_irq.ioapic_irq != -1) {
855 			error = vioapic_pulse_irq(sc->vmm_vm,
856 			    isa_irq.ioapic_irq);
857 		}
858 		break;
859 	}
860 	case VM_ISA_SET_IRQ_TRIGGER: {
861 		struct vm_isa_irq_trigger isa_irq_trigger;
862 
863 		if (ddi_copyin(datap, &isa_irq_trigger,
864 		    sizeof (isa_irq_trigger), md)) {
865 			error = EFAULT;
866 			break;
867 		}
868 		error = vatpic_set_irq_trigger(sc->vmm_vm,
869 		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
870 		break;
871 	}
872 
873 	case VM_MMAP_GETNEXT: {
874 		struct vm_memmap mm;
875 
876 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
877 			error = EFAULT;
878 			break;
879 		}
880 		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
881 		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
882 		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
883 			error = EFAULT;
884 			break;
885 		}
886 		break;
887 	}
888 	case VM_MMAP_MEMSEG: {
889 		struct vm_memmap mm;
890 
891 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
892 			error = EFAULT;
893 			break;
894 		}
895 		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
896 		    mm.len, mm.prot, mm.flags);
897 		break;
898 	}
899 	case VM_MUNMAP_MEMSEG: {
900 		struct vm_munmap mu;
901 
902 		if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
903 			error = EFAULT;
904 			break;
905 		}
906 		error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
907 		break;
908 	}
909 	case VM_ALLOC_MEMSEG: {
910 		struct vm_memseg vmseg;
911 
912 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
913 			error = EFAULT;
914 			break;
915 		}
916 		error = vmmdev_alloc_memseg(sc, &vmseg);
917 		break;
918 	}
919 	case VM_GET_MEMSEG: {
920 		struct vm_memseg vmseg;
921 
922 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
923 			error = EFAULT;
924 			break;
925 		}
926 		error = vmmdev_get_memseg(sc, &vmseg);
927 		if (error == 0 &&
928 		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
929 			error = EFAULT;
930 			break;
931 		}
932 		break;
933 	}
934 	case VM_GET_REGISTER: {
935 		struct vm_register vmreg;
936 
937 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
938 			error = EFAULT;
939 			break;
940 		}
941 		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
942 		    &vmreg.regval);
943 		if (error == 0 &&
944 		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
945 			error = EFAULT;
946 			break;
947 		}
948 		break;
949 	}
950 	case VM_SET_REGISTER: {
951 		struct vm_register vmreg;
952 
953 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
954 			error = EFAULT;
955 			break;
956 		}
957 		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
958 		    vmreg.regval);
959 		break;
960 	}
961 	case VM_SET_SEGMENT_DESCRIPTOR: {
962 		struct vm_seg_desc vmsegd;
963 
964 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
965 			error = EFAULT;
966 			break;
967 		}
968 		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
969 		    &vmsegd.desc);
970 		break;
971 	}
972 	case VM_GET_SEGMENT_DESCRIPTOR: {
973 		struct vm_seg_desc vmsegd;
974 
975 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
976 			error = EFAULT;
977 			break;
978 		}
979 		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
980 		    &vmsegd.desc);
981 		if (error == 0 &&
982 		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
983 			error = EFAULT;
984 			break;
985 		}
986 		break;
987 	}
988 	case VM_GET_REGISTER_SET: {
989 		struct vm_register_set vrs;
990 		int regnums[VM_REG_LAST];
991 		uint64_t regvals[VM_REG_LAST];
992 
993 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
994 			error = EFAULT;
995 			break;
996 		}
997 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
998 			error = EINVAL;
999 			break;
1000 		}
1001 		if (ddi_copyin(vrs.regnums, regnums,
1002 		    sizeof (int) * vrs.count, md)) {
1003 			error = EFAULT;
1004 			break;
1005 		}
1006 
1007 		error = 0;
1008 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1009 			if (regnums[i] < 0) {
1010 				error = EINVAL;
1011 				break;
1012 			}
1013 			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
1014 			    &regvals[i]);
1015 		}
1016 		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
1017 		    sizeof (uint64_t) * vrs.count, md)) {
1018 			error = EFAULT;
1019 		}
1020 		break;
1021 	}
1022 	case VM_SET_REGISTER_SET: {
1023 		struct vm_register_set vrs;
1024 		int regnums[VM_REG_LAST];
1025 		uint64_t regvals[VM_REG_LAST];
1026 
1027 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1028 			error = EFAULT;
1029 			break;
1030 		}
1031 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1032 			error = EINVAL;
1033 			break;
1034 		}
1035 		if (ddi_copyin(vrs.regnums, regnums,
1036 		    sizeof (int) * vrs.count, md)) {
1037 			error = EFAULT;
1038 			break;
1039 		}
1040 		if (ddi_copyin(vrs.regvals, regvals,
1041 		    sizeof (uint64_t) * vrs.count, md)) {
1042 			error = EFAULT;
1043 			break;
1044 		}
1045 
1046 		error = 0;
1047 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1048 			/*
1049 			 * Setting registers in a set is not atomic, since a
1050 			 * failure in the middle of the set will cause a
1051 			 * bail-out and inconsistent register state.  Callers
1052 			 * should be wary of this.
1053 			 */
1054 			if (regnums[i] < 0) {
1055 				error = EINVAL;
1056 				break;
1057 			}
1058 			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1059 			    regvals[i]);
1060 		}
1061 		break;
1062 	}
1063 	case VM_RESET_CPU: {
1064 		struct vm_vcpu_reset vvr;
1065 
1066 		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1067 			error = EFAULT;
1068 			break;
1069 		}
1070 		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1071 			error = EINVAL;
1072 		}
1073 
1074 		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1075 		break;
1076 	}
1077 	case VM_GET_RUN_STATE: {
1078 		struct vm_run_state vrs;
1079 
1080 		bzero(&vrs, sizeof (vrs));
1081 		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1082 		    &vrs.sipi_vector);
1083 		if (error == 0) {
1084 			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1085 				error = EFAULT;
1086 				break;
1087 			}
1088 		}
1089 		break;
1090 	}
1091 	case VM_SET_RUN_STATE: {
1092 		struct vm_run_state vrs;
1093 
1094 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1095 			error = EFAULT;
1096 			break;
1097 		}
1098 		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1099 		    vrs.sipi_vector);
1100 		break;
1101 	}
1102 	case VM_GET_FPU: {
1103 		struct vm_fpu_state req;
1104 		const size_t max_len = (PAGESIZE * 2);
1105 		void *kbuf;
1106 
1107 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1108 			error = EFAULT;
1109 			break;
1110 		}
1111 		if (req.len > max_len || req.len == 0) {
1112 			error = EINVAL;
1113 			break;
1114 		}
1115 		kbuf = kmem_zalloc(req.len, KM_SLEEP);
1116 		error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1117 		if (error == 0) {
1118 			if (ddi_copyout(kbuf, req.buf, req.len, md)) {
1119 				error = EFAULT;
1120 			}
1121 		}
1122 		kmem_free(kbuf, req.len);
1123 		break;
1124 	}
1125 	case VM_SET_FPU: {
1126 		struct vm_fpu_state req;
1127 		const size_t max_len = (PAGESIZE * 2);
1128 		void *kbuf;
1129 
1130 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1131 			error = EFAULT;
1132 			break;
1133 		}
1134 		if (req.len > max_len || req.len == 0) {
1135 			error = EINVAL;
1136 			break;
1137 		}
1138 		kbuf = kmem_alloc(req.len, KM_SLEEP);
1139 		if (ddi_copyin(req.buf, kbuf, req.len, md)) {
1140 			error = EFAULT;
1141 		} else {
1142 			error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1143 		}
1144 		kmem_free(kbuf, req.len);
1145 		break;
1146 	}
1147 
1148 	case VM_SET_KERNEMU_DEV:
1149 	case VM_GET_KERNEMU_DEV: {
1150 		struct vm_readwrite_kernemu_device kemu;
1151 		size_t size = 0;
1152 
1153 		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1154 			error = EFAULT;
1155 			break;
1156 		}
1157 
1158 		if (kemu.access_width > 3) {
1159 			error = EINVAL;
1160 			break;
1161 		}
1162 		size = (1 << kemu.access_width);
1163 		ASSERT(size >= 1 && size <= 8);
1164 
1165 		if (cmd == VM_SET_KERNEMU_DEV) {
1166 			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1167 			    kemu.gpa, kemu.value, size);
1168 		} else {
1169 			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1170 			    kemu.gpa, &kemu.value, size);
1171 		}
1172 
1173 		if (error == 0) {
1174 			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1175 				error = EFAULT;
1176 				break;
1177 			}
1178 		}
1179 		break;
1180 	}
1181 
1182 	case VM_GET_CAPABILITY: {
1183 		struct vm_capability vmcap;
1184 
1185 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1186 			error = EFAULT;
1187 			break;
1188 		}
1189 		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1190 		    &vmcap.capval);
1191 		if (error == 0 &&
1192 		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1193 			error = EFAULT;
1194 			break;
1195 		}
1196 		break;
1197 	}
1198 	case VM_SET_CAPABILITY: {
1199 		struct vm_capability vmcap;
1200 
1201 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1202 			error = EFAULT;
1203 			break;
1204 		}
1205 		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1206 		    vmcap.capval);
1207 		break;
1208 	}
1209 	case VM_SET_X2APIC_STATE: {
1210 		struct vm_x2apic x2apic;
1211 
1212 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1213 			error = EFAULT;
1214 			break;
1215 		}
1216 		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1217 		break;
1218 	}
1219 	case VM_GET_X2APIC_STATE: {
1220 		struct vm_x2apic x2apic;
1221 
1222 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1223 			error = EFAULT;
1224 			break;
1225 		}
1226 		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1227 		    &x2apic.state);
1228 		if (error == 0 &&
1229 		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1230 			error = EFAULT;
1231 			break;
1232 		}
1233 		break;
1234 	}
1235 	case VM_GET_GPA_PMAP: {
1236 		/*
1237 		 * Until there is a necessity to leak EPT/RVI PTE values to
1238 		 * userspace, this will remain unimplemented
1239 		 */
1240 		error = EINVAL;
1241 		break;
1242 	}
1243 	case VM_GET_HPET_CAPABILITIES: {
1244 		struct vm_hpet_cap hpetcap;
1245 
1246 		error = vhpet_getcap(&hpetcap);
1247 		if (error == 0 &&
1248 		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1249 			error = EFAULT;
1250 			break;
1251 		}
1252 		break;
1253 	}
1254 	case VM_GLA2GPA: {
1255 		struct vm_gla2gpa gg;
1256 
1257 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1258 			error = EFAULT;
1259 			break;
1260 		}
1261 		gg.vcpuid = vcpu;
1262 		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1263 		    gg.prot, &gg.gpa, &gg.fault);
1264 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1265 			error = EFAULT;
1266 			break;
1267 		}
1268 		break;
1269 	}
1270 	case VM_GLA2GPA_NOFAULT: {
1271 		struct vm_gla2gpa gg;
1272 
1273 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1274 			error = EFAULT;
1275 			break;
1276 		}
1277 		gg.vcpuid = vcpu;
1278 		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1279 		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
1280 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1281 			error = EFAULT;
1282 			break;
1283 		}
1284 		break;
1285 	}
1286 
1287 	case VM_ACTIVATE_CPU:
1288 		error = vm_activate_cpu(sc->vmm_vm, vcpu);
1289 		break;
1290 
1291 	case VM_SUSPEND_CPU:
1292 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1293 			error = EFAULT;
1294 		} else {
1295 			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1296 		}
1297 		break;
1298 
1299 	case VM_RESUME_CPU:
1300 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1301 			error = EFAULT;
1302 		} else {
1303 			error = vm_resume_cpu(sc->vmm_vm, vcpu);
1304 		}
1305 		break;
1306 
1307 	case VM_GET_CPUS: {
1308 		struct vm_cpuset vm_cpuset;
1309 		cpuset_t tempset;
1310 		void *srcp = &tempset;
1311 		int size;
1312 
1313 		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1314 			error = EFAULT;
1315 			break;
1316 		}
1317 
1318 		/* Be more generous about sizing since our cpuset_t is large. */
1319 		size = vm_cpuset.cpusetsize;
1320 		if (size <= 0 || size > sizeof (cpuset_t)) {
1321 			error = ERANGE;
1322 		}
1323 		/*
1324 		 * If they want a ulong_t or less, make sure they receive the
1325 		 * low bits with all the useful information.
1326 		 */
1327 		if (size <= sizeof (tempset.cpub[0])) {
1328 			srcp = &tempset.cpub[0];
1329 		}
1330 
1331 		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1332 			tempset = vm_active_cpus(sc->vmm_vm);
1333 		} else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1334 			tempset = vm_suspended_cpus(sc->vmm_vm);
1335 		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1336 			tempset = vm_debug_cpus(sc->vmm_vm);
1337 		} else {
1338 			error = EINVAL;
1339 		}
1340 
1341 		ASSERT(size > 0 && size <= sizeof (tempset));
1342 		if (error == 0 &&
1343 		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1344 			error = EFAULT;
1345 			break;
1346 		}
1347 		break;
1348 	}
1349 	case VM_SET_INTINFO: {
1350 		struct vm_intinfo vmii;
1351 
1352 		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1353 			error = EFAULT;
1354 			break;
1355 		}
1356 		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1357 		break;
1358 	}
1359 	case VM_GET_INTINFO: {
1360 		struct vm_intinfo vmii;
1361 
1362 		vmii.vcpuid = vcpu;
1363 		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1364 		    &vmii.info2);
1365 		if (error == 0 &&
1366 		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1367 			error = EFAULT;
1368 			break;
1369 		}
1370 		break;
1371 	}
1372 	case VM_RTC_WRITE: {
1373 		struct vm_rtc_data rtcdata;
1374 
1375 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1376 			error = EFAULT;
1377 			break;
1378 		}
1379 		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1380 		    rtcdata.value);
1381 		break;
1382 	}
1383 	case VM_RTC_READ: {
1384 		struct vm_rtc_data rtcdata;
1385 
1386 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1387 			error = EFAULT;
1388 			break;
1389 		}
1390 		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1391 		    &rtcdata.value);
1392 		if (error == 0 &&
1393 		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1394 			error = EFAULT;
1395 			break;
1396 		}
1397 		break;
1398 	}
1399 	case VM_RTC_SETTIME: {
1400 		struct vm_rtc_time rtctime;
1401 
1402 		if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1403 			error = EFAULT;
1404 			break;
1405 		}
1406 		error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1407 		break;
1408 	}
1409 	case VM_RTC_GETTIME: {
1410 		struct vm_rtc_time rtctime;
1411 
1412 		rtctime.secs = vrtc_get_time(sc->vmm_vm);
1413 		if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1414 			error = EFAULT;
1415 			break;
1416 		}
1417 		break;
1418 	}
1419 
1420 	case VM_PMTMR_LOCATE: {
1421 		uint16_t port = arg;
1422 		error = vpmtmr_set_location(sc->vmm_vm, port);
1423 		break;
1424 	}
1425 
1426 	case VM_RESTART_INSTRUCTION:
1427 		error = vm_restart_instruction(sc->vmm_vm, vcpu);
1428 		break;
1429 
1430 	case VM_SET_TOPOLOGY: {
1431 		struct vm_cpu_topology topo;
1432 
1433 		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1434 			error = EFAULT;
1435 			break;
1436 		}
1437 		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1438 		    topo.threads, topo.maxcpus);
1439 		break;
1440 	}
1441 	case VM_GET_TOPOLOGY: {
1442 		struct vm_cpu_topology topo;
1443 
1444 		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1445 		    &topo.threads, &topo.maxcpus);
1446 		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1447 			error = EFAULT;
1448 			break;
1449 		}
1450 		break;
1451 	}
1452 	case VM_DEVMEM_GETOFFSET: {
1453 		struct vm_devmem_offset vdo;
1454 		vmm_devmem_entry_t *de;
1455 
1456 		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1457 			error = EFAULT;
1458 			break;
1459 		}
1460 
1461 		de = vmmdev_devmem_find(sc, vdo.segid);
1462 		if (de != NULL) {
1463 			vdo.offset = de->vde_off;
1464 			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1465 				error = EFAULT;
1466 			}
1467 		} else {
1468 			error = ENOENT;
1469 		}
1470 		break;
1471 	}
1472 	case VM_TRACK_DIRTY_PAGES: {
1473 		const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
1474 		struct vmm_dirty_tracker tracker;
1475 		uint8_t *bitmap;
1476 		size_t len;
1477 
1478 		if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
1479 			error = EFAULT;
1480 			break;
1481 		}
1482 		if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
1483 			error = EINVAL;
1484 			break;
1485 		}
1486 		if (tracker.vdt_len == 0) {
1487 			break;
1488 		}
1489 		if ((tracker.vdt_len & PAGEOFFSET) != 0) {
1490 			error = EINVAL;
1491 			break;
1492 		}
1493 		if (tracker.vdt_len > max_track_region_len) {
1494 			error = EINVAL;
1495 			break;
1496 		}
1497 		len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8;
1498 		bitmap = kmem_zalloc(len, KM_SLEEP);
1499 		vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa,
1500 		    tracker.vdt_len, bitmap);
1501 		if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) {
1502 			error = EFAULT;
1503 		}
1504 		kmem_free(bitmap, len);
1505 
1506 		break;
1507 	}
1508 	case VM_WRLOCK_CYCLE: {
1509 		/*
1510 		 * Present a test mechanism to acquire/release the write lock
1511 		 * on the VM without any other effects.
1512 		 */
1513 		break;
1514 	}
1515 
1516 	default:
1517 		error = ENOTTY;
1518 		break;
1519 	}
1520 
1521 	/* Release exclusion resources */
1522 	switch (lock_type) {
1523 	case LOCK_NONE:
1524 		break;
1525 	case LOCK_VCPU:
1526 		vcpu_unlock_one(sc, vcpu);
1527 		break;
1528 	case LOCK_READ_HOLD:
1529 		vmm_read_unlock(sc);
1530 		break;
1531 	case LOCK_WRITE_HOLD:
1532 		vmm_write_unlock(sc);
1533 		break;
1534 	default:
1535 		panic("unexpected lock type");
1536 		break;
1537 	}
1538 
1539 	return (error);
1540 }
1541 
1542 static vmm_softc_t *
1543 vmm_lookup(const char *name)
1544 {
1545 	list_t *vml = &vmm_list;
1546 	vmm_softc_t *sc;
1547 
1548 	ASSERT(MUTEX_HELD(&vmm_mtx));
1549 
1550 	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1551 		if (strcmp(sc->vmm_name, name) == 0) {
1552 			break;
1553 		}
1554 	}
1555 
1556 	return (sc);
1557 }
1558 
1559 /*
1560  * Acquire an HMA registration if not already held.
1561  */
1562 static boolean_t
1563 vmm_hma_acquire(void)
1564 {
1565 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1566 
1567 	mutex_enter(&vmmdev_mtx);
1568 
1569 	if (vmmdev_hma_reg == NULL) {
1570 		VERIFY3U(vmmdev_hma_ref, ==, 0);
1571 		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1572 		if (vmmdev_hma_reg == NULL) {
1573 			cmn_err(CE_WARN, "%s HMA registration failed.",
1574 			    vmmdev_hvm_name);
1575 			mutex_exit(&vmmdev_mtx);
1576 			return (B_FALSE);
1577 		}
1578 	}
1579 
1580 	vmmdev_hma_ref++;
1581 
1582 	mutex_exit(&vmmdev_mtx);
1583 
1584 	return (B_TRUE);
1585 }
1586 
1587 /*
1588  * Release the HMA registration if held and there are no remaining VMs.
1589  */
1590 static void
1591 vmm_hma_release(void)
1592 {
1593 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1594 
1595 	mutex_enter(&vmmdev_mtx);
1596 
1597 	VERIFY3U(vmmdev_hma_ref, !=, 0);
1598 
1599 	vmmdev_hma_ref--;
1600 
1601 	if (vmmdev_hma_ref == 0) {
1602 		VERIFY(vmmdev_hma_reg != NULL);
1603 		hma_unregister(vmmdev_hma_reg);
1604 		vmmdev_hma_reg = NULL;
1605 	}
1606 	mutex_exit(&vmmdev_mtx);
1607 }
1608 
1609 static int
1610 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
1611 {
1612 	vmm_softc_t	*sc = NULL;
1613 	minor_t		minor;
1614 	int		error = ENOMEM;
1615 	size_t		len;
1616 	const char	*name = req->name;
1617 
1618 	len = strnlen(name, VM_MAX_NAMELEN);
1619 	if (len == 0) {
1620 		return (EINVAL);
1621 	}
1622 	if (len >= VM_MAX_NAMELEN) {
1623 		return (ENAMETOOLONG);
1624 	}
1625 	if (strchr(name, '/') != NULL) {
1626 		return (EINVAL);
1627 	}
1628 
1629 	if (!vmm_hma_acquire())
1630 		return (ENXIO);
1631 
1632 	mutex_enter(&vmm_mtx);
1633 
1634 	/* Look for duplicate names */
1635 	if (vmm_lookup(name) != NULL) {
1636 		mutex_exit(&vmm_mtx);
1637 		vmm_hma_release();
1638 		return (EEXIST);
1639 	}
1640 
1641 	/* Allow only one instance per non-global zone. */
1642 	if (!INGLOBALZONE(curproc)) {
1643 		for (sc = list_head(&vmm_list); sc != NULL;
1644 		    sc = list_next(&vmm_list, sc)) {
1645 			if (sc->vmm_zone == curzone) {
1646 				mutex_exit(&vmm_mtx);
1647 				vmm_hma_release();
1648 				return (EINVAL);
1649 			}
1650 		}
1651 	}
1652 
1653 	minor = id_alloc(vmm_minors);
1654 	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1655 		goto fail;
1656 	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1657 		ddi_soft_state_free(vmm_statep, minor);
1658 		goto fail;
1659 	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1660 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
1661 		goto fail;
1662 	}
1663 
1664 	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
1665 		goto fail;
1666 	}
1667 
1668 	error = vm_create(req->name, req->flags, &sc->vmm_vm);
1669 	if (error == 0) {
1670 		/* Complete VM intialization and report success. */
1671 		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1672 		sc->vmm_minor = minor;
1673 		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1674 		    offsetof(vmm_devmem_entry_t, vde_node));
1675 
1676 		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1677 		    offsetof(vmm_hold_t, vmh_node));
1678 		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1679 
1680 		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1681 		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1682 		    offsetof(vmm_lease_t, vml_node));
1683 		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1684 		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1685 
1686 		sc->vmm_zone = crgetzone(cr);
1687 		zone_hold(sc->vmm_zone);
1688 		vmm_zsd_add_vm(sc);
1689 		vmm_kstat_init(sc);
1690 
1691 		list_insert_tail(&vmm_list, sc);
1692 		mutex_exit(&vmm_mtx);
1693 		return (0);
1694 	}
1695 
1696 	vmm_kstat_fini(sc);
1697 	ddi_remove_minor_node(vmmdev_dip, name);
1698 fail:
1699 	id_free(vmm_minors, minor);
1700 	if (sc != NULL) {
1701 		ddi_soft_state_free(vmm_statep, minor);
1702 	}
1703 	mutex_exit(&vmm_mtx);
1704 	vmm_hma_release();
1705 
1706 	return (error);
1707 }
1708 
1709 /*
1710  * Bhyve 'Driver' Interface
1711  *
1712  * While many devices are emulated in the bhyve userspace process, there are
1713  * others with performance constraints which require that they run mostly or
1714  * entirely in-kernel.  For those not integrated directly into bhyve, an API is
1715  * needed so they can query/manipulate the portions of VM state needed to
1716  * fulfill their purpose.
1717  *
1718  * This includes:
1719  * - Translating guest-physical addresses to host-virtual pointers
1720  * - Injecting MSIs
1721  * - Hooking IO port addresses
1722  *
1723  * The vmm_drv interface exists to provide that functionality to its consumers.
1724  * (At this time, 'viona' is the only user)
1725  */
1726 int
1727 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1728 {
1729 	vnode_t *vp = fp->f_vnode;
1730 	const dev_t dev = vp->v_rdev;
1731 	vmm_softc_t *sc;
1732 	vmm_hold_t *hold;
1733 	int err = 0;
1734 
1735 	if (vp->v_type != VCHR) {
1736 		return (ENXIO);
1737 	}
1738 	const major_t major = getmajor(dev);
1739 	const minor_t minor = getminor(dev);
1740 
1741 	mutex_enter(&vmmdev_mtx);
1742 	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1743 		mutex_exit(&vmmdev_mtx);
1744 		return (ENOENT);
1745 	}
1746 	mutex_enter(&vmm_mtx);
1747 	mutex_exit(&vmmdev_mtx);
1748 
1749 	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1750 		err = ENOENT;
1751 		goto out;
1752 	}
1753 	/* XXXJOY: check cred permissions against instance */
1754 
1755 	if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1756 		err = EBUSY;
1757 		goto out;
1758 	}
1759 
1760 	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1761 	hold->vmh_sc = sc;
1762 	hold->vmh_release_req = B_FALSE;
1763 
1764 	list_insert_tail(&sc->vmm_holds, hold);
1765 	sc->vmm_flags |= VMM_HELD;
1766 	*holdp = hold;
1767 
1768 out:
1769 	mutex_exit(&vmm_mtx);
1770 	return (err);
1771 }
1772 
1773 void
1774 vmm_drv_rele(vmm_hold_t *hold)
1775 {
1776 	vmm_softc_t *sc;
1777 
1778 	ASSERT(hold != NULL);
1779 	ASSERT(hold->vmh_sc != NULL);
1780 	VERIFY(hold->vmh_ioport_hook_cnt == 0);
1781 
1782 	mutex_enter(&vmm_mtx);
1783 	sc = hold->vmh_sc;
1784 	list_remove(&sc->vmm_holds, hold);
1785 	if (list_is_empty(&sc->vmm_holds)) {
1786 		sc->vmm_flags &= ~VMM_HELD;
1787 		cv_broadcast(&sc->vmm_cv);
1788 	}
1789 	mutex_exit(&vmm_mtx);
1790 	kmem_free(hold, sizeof (*hold));
1791 }
1792 
1793 boolean_t
1794 vmm_drv_release_reqd(vmm_hold_t *hold)
1795 {
1796 	ASSERT(hold != NULL);
1797 
1798 	return (hold->vmh_release_req);
1799 }
1800 
1801 vmm_lease_t *
1802 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1803 {
1804 	vmm_softc_t *sc = hold->vmh_sc;
1805 	vmm_lease_t *lease;
1806 
1807 	ASSERT3P(expiref, !=, NULL);
1808 
1809 	if (hold->vmh_release_req) {
1810 		return (NULL);
1811 	}
1812 
1813 	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1814 	list_link_init(&lease->vml_node);
1815 	lease->vml_expire_func = expiref;
1816 	lease->vml_expire_arg = arg;
1817 	lease->vml_expired = B_FALSE;
1818 	lease->vml_break_deferred = B_FALSE;
1819 	lease->vml_hold = hold;
1820 	/* cache the VM pointer for one less pointer chase */
1821 	lease->vml_vm = sc->vmm_vm;
1822 	lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
1823 
1824 	mutex_enter(&sc->vmm_lease_lock);
1825 	while (sc->vmm_lease_blocker != 0) {
1826 		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1827 	}
1828 	list_insert_tail(&sc->vmm_lease_list, lease);
1829 	vmm_read_lock(sc);
1830 	mutex_exit(&sc->vmm_lease_lock);
1831 
1832 	return (lease);
1833 }
1834 
1835 static void
1836 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1837 {
1838 	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1839 
1840 	list_remove(&sc->vmm_lease_list, lease);
1841 	vmm_read_unlock(sc);
1842 	vmc_destroy(lease->vml_vmclient);
1843 	kmem_free(lease, sizeof (*lease));
1844 }
1845 
1846 static void
1847 vmm_lease_block(vmm_softc_t *sc)
1848 {
1849 	mutex_enter(&sc->vmm_lease_lock);
1850 	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
1851 	sc->vmm_lease_blocker++;
1852 	if (sc->vmm_lease_blocker == 1) {
1853 		list_t *list = &sc->vmm_lease_list;
1854 		vmm_lease_t *lease = list_head(list);
1855 
1856 		while (lease != NULL) {
1857 			void *arg = lease->vml_expire_arg;
1858 			boolean_t (*expiref)(void *) = lease->vml_expire_func;
1859 			boolean_t sync_break = B_FALSE;
1860 
1861 			/*
1862 			 * Since the lease expiration notification may
1863 			 * need to take locks which would deadlock with
1864 			 * vmm_lease_lock, drop it across the call.
1865 			 *
1866 			 * We are the only one allowed to manipulate
1867 			 * vmm_lease_list right now, so it is safe to
1868 			 * continue iterating through it after
1869 			 * reacquiring the lock.
1870 			 */
1871 			lease->vml_expired = B_TRUE;
1872 			mutex_exit(&sc->vmm_lease_lock);
1873 			sync_break = expiref(arg);
1874 			mutex_enter(&sc->vmm_lease_lock);
1875 
1876 			if (sync_break) {
1877 				vmm_lease_t *next;
1878 
1879 				/*
1880 				 * These leases which are synchronously broken
1881 				 * result in vmm_read_unlock() calls from a
1882 				 * different thread than the corresponding
1883 				 * vmm_read_lock().  This is acceptable, given
1884 				 * that the rwlock underpinning the whole
1885 				 * mechanism tolerates the behavior.  This
1886 				 * flexibility is _only_ afforded to VM read
1887 				 * lock (RW_READER) holders.
1888 				 */
1889 				next = list_next(list, lease);
1890 				vmm_lease_break_locked(sc, lease);
1891 				lease = next;
1892 			} else {
1893 				lease = list_next(list, lease);
1894 			}
1895 		}
1896 
1897 		/* Process leases which were not broken synchronously. */
1898 		while (!list_is_empty(list)) {
1899 			/*
1900 			 * Although the nested loops are quadratic, the number
1901 			 * of leases is small.
1902 			 */
1903 			lease = list_head(list);
1904 			while (lease != NULL) {
1905 				vmm_lease_t *next = list_next(list, lease);
1906 				if (lease->vml_break_deferred) {
1907 					vmm_lease_break_locked(sc, lease);
1908 				}
1909 				lease = next;
1910 			}
1911 			if (list_is_empty(list)) {
1912 				break;
1913 			}
1914 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1915 		}
1916 		/* Wake anyone else waiting for the lease list to be empty  */
1917 		cv_broadcast(&sc->vmm_lease_cv);
1918 	} else {
1919 		list_t *list = &sc->vmm_lease_list;
1920 
1921 		/*
1922 		 * Some other thread beat us to the duty of lease cleanup.
1923 		 * Wait until that is complete.
1924 		 */
1925 		while (!list_is_empty(list)) {
1926 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1927 		}
1928 	}
1929 	mutex_exit(&sc->vmm_lease_lock);
1930 }
1931 
1932 static void
1933 vmm_lease_unblock(vmm_softc_t *sc)
1934 {
1935 	mutex_enter(&sc->vmm_lease_lock);
1936 	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
1937 	sc->vmm_lease_blocker--;
1938 	if (sc->vmm_lease_blocker == 0) {
1939 		cv_broadcast(&sc->vmm_lease_cv);
1940 	}
1941 	mutex_exit(&sc->vmm_lease_lock);
1942 }
1943 
1944 void
1945 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
1946 {
1947 	vmm_softc_t *sc = hold->vmh_sc;
1948 
1949 	VERIFY3P(hold, ==, lease->vml_hold);
1950 	VERIFY(!lease->vml_break_deferred);
1951 
1952 	mutex_enter(&sc->vmm_lease_lock);
1953 	if (sc->vmm_lease_blocker == 0) {
1954 		vmm_lease_break_locked(sc, lease);
1955 	} else {
1956 		/*
1957 		 * Defer the lease-breaking to whichever thread is currently
1958 		 * cleaning up all leases as part of a vmm_lease_block() call.
1959 		 */
1960 		lease->vml_break_deferred = B_TRUE;
1961 		cv_broadcast(&sc->vmm_lease_cv);
1962 	}
1963 	mutex_exit(&sc->vmm_lease_lock);
1964 }
1965 
1966 boolean_t
1967 vmm_drv_lease_expired(vmm_lease_t *lease)
1968 {
1969 	return (lease->vml_expired);
1970 }
1971 
1972 vmm_page_t *
1973 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
1974 {
1975 	ASSERT(lease != NULL);
1976 	ASSERT0(gpa & PAGEOFFSET);
1977 
1978 	return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
1979 }
1980 
1981 void
1982 vmm_drv_page_release(vmm_page_t *vmmp)
1983 {
1984 	(void) vmp_release((vm_page_t *)vmmp);
1985 }
1986 
1987 void
1988 vmm_drv_page_release_chain(vmm_page_t *vmmp)
1989 {
1990 	(void) vmp_release_chain((vm_page_t *)vmmp);
1991 }
1992 
1993 const void *
1994 vmm_drv_page_readable(const vmm_page_t *vmmp)
1995 {
1996 	return (vmp_get_readable((const vm_page_t *)vmmp));
1997 }
1998 
1999 void *
2000 vmm_drv_page_writable(const vmm_page_t *vmmp)
2001 {
2002 	return (vmp_get_writable((const vm_page_t *)vmmp));
2003 }
2004 
2005 void
2006 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
2007 {
2008 	vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
2009 }
2010 
2011 vmm_page_t *
2012 vmm_drv_page_next(const vmm_page_t *vmmp)
2013 {
2014 	return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
2015 }
2016 
2017 int
2018 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
2019 {
2020 	ASSERT(lease != NULL);
2021 
2022 	return (lapic_intr_msi(lease->vml_vm, addr, msg));
2023 }
2024 
2025 int
2026 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
2027     void *arg, void **cookie)
2028 {
2029 	vmm_softc_t *sc;
2030 	int err;
2031 
2032 	ASSERT(hold != NULL);
2033 	ASSERT(cookie != NULL);
2034 
2035 	sc = hold->vmh_sc;
2036 	mutex_enter(&vmm_mtx);
2037 	/* Confirm that hook installation is not blocked */
2038 	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2039 		mutex_exit(&vmm_mtx);
2040 		return (EBUSY);
2041 	}
2042 	/*
2043 	 * Optimistically record an installed hook which will prevent a block
2044 	 * from being asserted while the mutex is dropped.
2045 	 */
2046 	hold->vmh_ioport_hook_cnt++;
2047 	mutex_exit(&vmm_mtx);
2048 
2049 	vmm_write_lock(sc);
2050 	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
2051 	    arg, cookie);
2052 	vmm_write_unlock(sc);
2053 
2054 	if (err != 0) {
2055 		mutex_enter(&vmm_mtx);
2056 		/* Walk back optimism about the hook installation */
2057 		hold->vmh_ioport_hook_cnt--;
2058 		mutex_exit(&vmm_mtx);
2059 	}
2060 	return (err);
2061 }
2062 
2063 void
2064 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
2065 {
2066 	vmm_softc_t *sc;
2067 
2068 	ASSERT(hold != NULL);
2069 	ASSERT(cookie != NULL);
2070 	ASSERT(hold->vmh_ioport_hook_cnt != 0);
2071 
2072 	sc = hold->vmh_sc;
2073 	vmm_write_lock(sc);
2074 	vm_ioport_unhook(sc->vmm_vm, cookie);
2075 	vmm_write_unlock(sc);
2076 
2077 	mutex_enter(&vmm_mtx);
2078 	hold->vmh_ioport_hook_cnt--;
2079 	mutex_exit(&vmm_mtx);
2080 }
2081 
2082 static int
2083 vmm_drv_purge(vmm_softc_t *sc)
2084 {
2085 	ASSERT(MUTEX_HELD(&vmm_mtx));
2086 
2087 	if ((sc->vmm_flags & VMM_HELD) != 0) {
2088 		vmm_hold_t *hold;
2089 
2090 		sc->vmm_flags |= VMM_CLEANUP;
2091 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2092 		    hold = list_next(&sc->vmm_holds, hold)) {
2093 			hold->vmh_release_req = B_TRUE;
2094 		}
2095 		while ((sc->vmm_flags & VMM_HELD) != 0) {
2096 			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
2097 				return (EINTR);
2098 			}
2099 		}
2100 		sc->vmm_flags &= ~VMM_CLEANUP;
2101 	}
2102 
2103 	VERIFY(list_is_empty(&sc->vmm_holds));
2104 	sc->vmm_flags |= VMM_PURGED;
2105 	return (0);
2106 }
2107 
2108 static int
2109 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
2110 {
2111 	int err = 0;
2112 
2113 	mutex_enter(&vmm_mtx);
2114 	if (!enable_block) {
2115 		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
2116 
2117 		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
2118 		goto done;
2119 	}
2120 
2121 	/* If any holds have hooks installed, the block is a failure */
2122 	if (!list_is_empty(&sc->vmm_holds)) {
2123 		vmm_hold_t *hold;
2124 
2125 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2126 		    hold = list_next(&sc->vmm_holds, hold)) {
2127 			if (hold->vmh_ioport_hook_cnt != 0) {
2128 				err = EBUSY;
2129 				goto done;
2130 			}
2131 		}
2132 	}
2133 	sc->vmm_flags |= VMM_BLOCK_HOOK;
2134 
2135 done:
2136 	mutex_exit(&vmm_mtx);
2137 	return (err);
2138 }
2139 
2140 static int
2141 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
2142     boolean_t *hma_release)
2143 {
2144 	dev_info_t	*pdip = ddi_get_parent(vmmdev_dip);
2145 	minor_t		minor;
2146 
2147 	ASSERT(MUTEX_HELD(&vmm_mtx));
2148 
2149 	*hma_release = B_FALSE;
2150 
2151 	if (vmm_drv_purge(sc) != 0) {
2152 		return (EINTR);
2153 	}
2154 
2155 	if (clean_zsd) {
2156 		vmm_zsd_rem_vm(sc);
2157 	}
2158 
2159 	/* Clean up devmem entries */
2160 	vmmdev_devmem_purge(sc);
2161 
2162 	list_remove(&vmm_list, sc);
2163 	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2164 	minor = sc->vmm_minor;
2165 	zone_rele(sc->vmm_zone);
2166 	if (sc->vmm_is_open) {
2167 		list_insert_tail(&vmm_destroy_list, sc);
2168 		sc->vmm_flags |= VMM_DESTROY;
2169 	} else {
2170 		vmm_kstat_fini(sc);
2171 		vm_destroy(sc->vmm_vm);
2172 		ddi_soft_state_free(vmm_statep, minor);
2173 		id_free(vmm_minors, minor);
2174 		*hma_release = B_TRUE;
2175 	}
2176 	(void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
2177 
2178 	return (0);
2179 }
2180 
2181 int
2182 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
2183 {
2184 	boolean_t	hma_release = B_FALSE;
2185 	int		err;
2186 
2187 	mutex_enter(&vmm_mtx);
2188 	err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
2189 	mutex_exit(&vmm_mtx);
2190 
2191 	if (hma_release)
2192 		vmm_hma_release();
2193 
2194 	return (err);
2195 }
2196 
2197 /* ARGSUSED */
2198 static int
2199 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2200 {
2201 	boolean_t	hma_release = B_FALSE;
2202 	vmm_softc_t	*sc;
2203 	int		err;
2204 
2205 	if (crgetuid(cr) != 0)
2206 		return (EPERM);
2207 
2208 	mutex_enter(&vmm_mtx);
2209 
2210 	if ((sc = vmm_lookup(req->name)) == NULL) {
2211 		mutex_exit(&vmm_mtx);
2212 		return (ENOENT);
2213 	}
2214 	/*
2215 	 * We don't check this in vmm_lookup() since that function is also used
2216 	 * for validation during create and currently vmm names must be unique.
2217 	 */
2218 	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2219 		mutex_exit(&vmm_mtx);
2220 		return (EPERM);
2221 	}
2222 	err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
2223 
2224 	mutex_exit(&vmm_mtx);
2225 
2226 	if (hma_release)
2227 		vmm_hma_release();
2228 
2229 	return (err);
2230 }
2231 
2232 #define	VCPU_NAME_BUFLEN	32
2233 
2234 static int
2235 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2236 {
2237 	zoneid_t zid = crgetzoneid(cr);
2238 	int instance = minor;
2239 	kstat_t *ksp;
2240 
2241 	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2242 
2243 	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2244 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2245 	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2246 
2247 	if (ksp == NULL) {
2248 		return (-1);
2249 	}
2250 	sc->vmm_kstat_vm = ksp;
2251 
2252 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2253 		char namebuf[VCPU_NAME_BUFLEN];
2254 
2255 		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2256 
2257 		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2258 		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2259 		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2260 		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2261 		    0, zid);
2262 		if (ksp == NULL) {
2263 			goto fail;
2264 		}
2265 
2266 		sc->vmm_kstat_vcpu[i] = ksp;
2267 	}
2268 
2269 	/*
2270 	 * If this instance is associated with a non-global zone, make its
2271 	 * kstats visible from the GZ.
2272 	 */
2273 	if (zid != GLOBAL_ZONEID) {
2274 		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2275 		for (uint_t i = 0; i < VM_MAXCPU; i++) {
2276 			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2277 		}
2278 	}
2279 
2280 	return (0);
2281 
2282 fail:
2283 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2284 		if (sc->vmm_kstat_vcpu[i] != NULL) {
2285 			kstat_delete(sc->vmm_kstat_vcpu[i]);
2286 			sc->vmm_kstat_vcpu[i] = NULL;
2287 		} else {
2288 			break;
2289 		}
2290 	}
2291 	kstat_delete(sc->vmm_kstat_vm);
2292 	sc->vmm_kstat_vm = NULL;
2293 	return (-1);
2294 }
2295 
2296 static void
2297 vmm_kstat_init(vmm_softc_t *sc)
2298 {
2299 	kstat_t *ksp;
2300 
2301 	ASSERT3P(sc->vmm_vm, !=, NULL);
2302 	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2303 
2304 	ksp = sc->vmm_kstat_vm;
2305 	vmm_kstats_t *vk = ksp->ks_data;
2306 	ksp->ks_private = sc->vmm_vm;
2307 	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2308 	kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2309 
2310 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2311 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2312 
2313 		ksp = sc->vmm_kstat_vcpu[i];
2314 		vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2315 
2316 		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2317 		vvk->vvk_vcpu.value.ui32 = i;
2318 		kstat_named_init(&vvk->vvk_time_init, "time_init",
2319 		    KSTAT_DATA_UINT64);
2320 		kstat_named_init(&vvk->vvk_time_run, "time_run",
2321 		    KSTAT_DATA_UINT64);
2322 		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2323 		    KSTAT_DATA_UINT64);
2324 		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2325 		    KSTAT_DATA_UINT64);
2326 		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2327 		    KSTAT_DATA_UINT64);
2328 		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2329 		    KSTAT_DATA_UINT64);
2330 		ksp->ks_private = sc->vmm_vm;
2331 		ksp->ks_update = vmm_kstat_update_vcpu;
2332 	}
2333 
2334 	kstat_install(sc->vmm_kstat_vm);
2335 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2336 		kstat_install(sc->vmm_kstat_vcpu[i]);
2337 	}
2338 }
2339 
2340 static void
2341 vmm_kstat_fini(vmm_softc_t *sc)
2342 {
2343 	ASSERT(sc->vmm_kstat_vm != NULL);
2344 
2345 	kstat_delete(sc->vmm_kstat_vm);
2346 	sc->vmm_kstat_vm = NULL;
2347 
2348 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2349 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2350 
2351 		kstat_delete(sc->vmm_kstat_vcpu[i]);
2352 		sc->vmm_kstat_vcpu[i] = NULL;
2353 	}
2354 }
2355 
2356 static int
2357 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2358 {
2359 	minor_t		minor;
2360 	vmm_softc_t	*sc;
2361 
2362 	/*
2363 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2364 	 * verified to be safe.
2365 	 */
2366 	if (curproc->p_model != DATAMODEL_LP64) {
2367 		return (EFBIG);
2368 	}
2369 
2370 	minor = getminor(*devp);
2371 	if (minor == VMM_CTL_MINOR) {
2372 		/*
2373 		 * Master control device must be opened exclusively.
2374 		 */
2375 		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2376 			return (EINVAL);
2377 		}
2378 
2379 		return (0);
2380 	}
2381 
2382 	mutex_enter(&vmm_mtx);
2383 	sc = ddi_get_soft_state(vmm_statep, minor);
2384 	if (sc == NULL) {
2385 		mutex_exit(&vmm_mtx);
2386 		return (ENXIO);
2387 	}
2388 
2389 	sc->vmm_is_open = B_TRUE;
2390 	mutex_exit(&vmm_mtx);
2391 
2392 	return (0);
2393 }
2394 
2395 static int
2396 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2397 {
2398 	minor_t		minor;
2399 	vmm_softc_t	*sc;
2400 	boolean_t	hma_release = B_FALSE;
2401 
2402 	minor = getminor(dev);
2403 	if (minor == VMM_CTL_MINOR)
2404 		return (0);
2405 
2406 	mutex_enter(&vmm_mtx);
2407 	sc = ddi_get_soft_state(vmm_statep, minor);
2408 	if (sc == NULL) {
2409 		mutex_exit(&vmm_mtx);
2410 		return (ENXIO);
2411 	}
2412 
2413 	VERIFY(sc->vmm_is_open);
2414 	sc->vmm_is_open = B_FALSE;
2415 
2416 	/*
2417 	 * If this VM was destroyed while the vmm device was open, then
2418 	 * clean it up now that it is closed.
2419 	 */
2420 	if (sc->vmm_flags & VMM_DESTROY) {
2421 		list_remove(&vmm_destroy_list, sc);
2422 		vmm_kstat_fini(sc);
2423 		vm_destroy(sc->vmm_vm);
2424 		ddi_soft_state_free(vmm_statep, minor);
2425 		id_free(vmm_minors, minor);
2426 		hma_release = B_TRUE;
2427 	}
2428 	mutex_exit(&vmm_mtx);
2429 
2430 	if (hma_release)
2431 		vmm_hma_release();
2432 
2433 	return (0);
2434 }
2435 
2436 static int
2437 vmm_is_supported(intptr_t arg)
2438 {
2439 	int r;
2440 	const char *msg;
2441 
2442 	if (vmm_is_intel()) {
2443 		r = vmx_x86_supported(&msg);
2444 	} else if (vmm_is_svm()) {
2445 		/*
2446 		 * HMA already ensured that the features necessary for SVM
2447 		 * operation were present and online during vmm_attach().
2448 		 */
2449 		r = 0;
2450 	} else {
2451 		r = ENXIO;
2452 		msg = "Unsupported CPU vendor";
2453 	}
2454 
2455 	if (r != 0 && arg != (intptr_t)NULL) {
2456 		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2457 			return (EFAULT);
2458 	}
2459 	return (r);
2460 }
2461 
2462 static int
2463 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
2464 {
2465 	void *argp = (void *)arg;
2466 
2467 	switch (cmd) {
2468 	case VMM_CREATE_VM: {
2469 		struct vm_create_req req;
2470 
2471 		if ((md & FWRITE) == 0) {
2472 			return (EPERM);
2473 		}
2474 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2475 			return (EFAULT);
2476 		}
2477 		return (vmmdev_do_vm_create(&req, cr));
2478 	}
2479 	case VMM_DESTROY_VM: {
2480 		struct vm_destroy_req req;
2481 
2482 		if ((md & FWRITE) == 0) {
2483 			return (EPERM);
2484 		}
2485 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2486 			return (EFAULT);
2487 		}
2488 		return (vmmdev_do_vm_destroy(&req, cr));
2489 	}
2490 	case VMM_VM_SUPPORTED:
2491 		return (vmm_is_supported(arg));
2492 	case VMM_INTERFACE_VERSION:
2493 		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
2494 		return (0);
2495 	case VMM_RESV_QUERY:
2496 	case VMM_RESV_ADD:
2497 	case VMM_RESV_REMOVE:
2498 		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
2499 	default:
2500 		break;
2501 	}
2502 	/* No other actions are legal on ctl device */
2503 	return (ENOTTY);
2504 }
2505 
2506 static int
2507 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2508     int *rvalp)
2509 {
2510 	vmm_softc_t	*sc;
2511 	minor_t		minor;
2512 
2513 	/*
2514 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2515 	 * verified to be safe.
2516 	 */
2517 	if (curproc->p_model != DATAMODEL_LP64) {
2518 		return (EFBIG);
2519 	}
2520 
2521 	/* The structs in bhyve ioctls assume a 64-bit datamodel */
2522 	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2523 		return (ENOTSUP);
2524 	}
2525 
2526 	minor = getminor(dev);
2527 
2528 	if (minor == VMM_CTL_MINOR) {
2529 		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
2530 	}
2531 
2532 	sc = ddi_get_soft_state(vmm_statep, minor);
2533 	ASSERT(sc);
2534 
2535 	if (sc->vmm_flags & VMM_DESTROY)
2536 		return (ENXIO);
2537 
2538 	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2539 }
2540 
2541 static int
2542 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2543     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2544 {
2545 	vmm_softc_t *sc;
2546 	const minor_t minor = getminor(dev);
2547 	int err;
2548 
2549 	if (minor == VMM_CTL_MINOR) {
2550 		return (ENODEV);
2551 	}
2552 	if (off < 0 || (off + len) <= 0) {
2553 		return (EINVAL);
2554 	}
2555 	if ((prot & PROT_USER) == 0) {
2556 		return (EACCES);
2557 	}
2558 
2559 	sc = ddi_get_soft_state(vmm_statep, minor);
2560 	ASSERT(sc);
2561 
2562 	if (sc->vmm_flags & VMM_DESTROY)
2563 		return (ENXIO);
2564 
2565 	/* Grab read lock on the VM to prevent any changes to the memory map */
2566 	vmm_read_lock(sc);
2567 
2568 	if (off >= VM_DEVMEM_START) {
2569 		int segid;
2570 		off_t segoff;
2571 
2572 		/* Mapping a devmem "device" */
2573 		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
2574 			err = ENODEV;
2575 		} else {
2576 			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
2577 			    addrp, prot, maxprot, flags);
2578 		}
2579 	} else {
2580 		/* Mapping a part of the guest physical space */
2581 		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
2582 		    maxprot, flags);
2583 	}
2584 
2585 	vmm_read_unlock(sc);
2586 	return (err);
2587 }
2588 
2589 static sdev_plugin_validate_t
2590 vmm_sdev_validate(sdev_ctx_t ctx)
2591 {
2592 	const char *name = sdev_ctx_name(ctx);
2593 	vmm_softc_t *sc;
2594 	sdev_plugin_validate_t ret;
2595 	minor_t minor;
2596 
2597 	if (sdev_ctx_vtype(ctx) != VCHR)
2598 		return (SDEV_VTOR_INVALID);
2599 
2600 	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2601 
2602 	mutex_enter(&vmm_mtx);
2603 	if ((sc = vmm_lookup(name)) == NULL)
2604 		ret = SDEV_VTOR_INVALID;
2605 	else if (sc->vmm_minor != minor)
2606 		ret = SDEV_VTOR_STALE;
2607 	else
2608 		ret = SDEV_VTOR_VALID;
2609 	mutex_exit(&vmm_mtx);
2610 
2611 	return (ret);
2612 }
2613 
2614 static int
2615 vmm_sdev_filldir(sdev_ctx_t ctx)
2616 {
2617 	vmm_softc_t *sc;
2618 	int ret;
2619 
2620 	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2621 		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2622 		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2623 		return (EINVAL);
2624 	}
2625 
2626 	mutex_enter(&vmm_mtx);
2627 	ASSERT(vmmdev_dip != NULL);
2628 	for (sc = list_head(&vmm_list); sc != NULL;
2629 	    sc = list_next(&vmm_list, sc)) {
2630 		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2631 			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2632 			    S_IFCHR | 0600,
2633 			    makedevice(ddi_driver_major(vmmdev_dip),
2634 			    sc->vmm_minor));
2635 		} else {
2636 			continue;
2637 		}
2638 		if (ret != 0 && ret != EEXIST)
2639 			goto out;
2640 	}
2641 
2642 	ret = 0;
2643 
2644 out:
2645 	mutex_exit(&vmm_mtx);
2646 	return (ret);
2647 }
2648 
2649 /* ARGSUSED */
2650 static void
2651 vmm_sdev_inactive(sdev_ctx_t ctx)
2652 {
2653 }
2654 
2655 static sdev_plugin_ops_t vmm_sdev_ops = {
2656 	.spo_version = SDEV_PLUGIN_VERSION,
2657 	.spo_flags = SDEV_PLUGIN_SUBDIR,
2658 	.spo_validate = vmm_sdev_validate,
2659 	.spo_filldir = vmm_sdev_filldir,
2660 	.spo_inactive = vmm_sdev_inactive
2661 };
2662 
2663 /* ARGSUSED */
2664 static int
2665 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2666 {
2667 	int error;
2668 
2669 	switch (cmd) {
2670 	case DDI_INFO_DEVT2DEVINFO:
2671 		*result = (void *)vmmdev_dip;
2672 		error = DDI_SUCCESS;
2673 		break;
2674 	case DDI_INFO_DEVT2INSTANCE:
2675 		*result = (void *)0;
2676 		error = DDI_SUCCESS;
2677 		break;
2678 	default:
2679 		error = DDI_FAILURE;
2680 		break;
2681 	}
2682 	return (error);
2683 }
2684 
2685 static int
2686 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2687 {
2688 	sdev_plugin_hdl_t sph;
2689 	hma_reg_t *reg = NULL;
2690 	boolean_t vmm_loaded = B_FALSE;
2691 
2692 	if (cmd != DDI_ATTACH) {
2693 		return (DDI_FAILURE);
2694 	}
2695 
2696 	mutex_enter(&vmmdev_mtx);
2697 	/* Ensure we are not already attached. */
2698 	if (vmmdev_dip != NULL) {
2699 		mutex_exit(&vmmdev_mtx);
2700 		return (DDI_FAILURE);
2701 	}
2702 
2703 	vmm_sol_glue_init();
2704 
2705 	/*
2706 	 * Perform temporary HMA registration to determine if the system
2707 	 * is capable.
2708 	 */
2709 	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2710 		goto fail;
2711 	} else if (vmm_mod_load() != 0) {
2712 		goto fail;
2713 	}
2714 	vmm_loaded = B_TRUE;
2715 	hma_unregister(reg);
2716 	reg = NULL;
2717 
2718 	/* Create control node.  Other nodes will be created on demand. */
2719 	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2720 	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2721 		goto fail;
2722 	}
2723 
2724 	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
2725 	if (sph == (sdev_plugin_hdl_t)NULL) {
2726 		ddi_remove_minor_node(dip, NULL);
2727 		goto fail;
2728 	}
2729 
2730 	ddi_report_dev(dip);
2731 	vmmdev_sdev_hdl = sph;
2732 	vmmdev_dip = dip;
2733 	mutex_exit(&vmmdev_mtx);
2734 	return (DDI_SUCCESS);
2735 
2736 fail:
2737 	if (vmm_loaded) {
2738 		VERIFY0(vmm_mod_unload());
2739 	}
2740 	if (reg != NULL) {
2741 		hma_unregister(reg);
2742 	}
2743 	vmm_sol_glue_cleanup();
2744 	mutex_exit(&vmmdev_mtx);
2745 	return (DDI_FAILURE);
2746 }
2747 
2748 static int
2749 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2750 {
2751 	if (cmd != DDI_DETACH) {
2752 		return (DDI_FAILURE);
2753 	}
2754 
2755 	/*
2756 	 * Ensure that all resources have been cleaned up.
2757 	 *
2758 	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
2759 	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
2760 	 * devinfo locked as iommu_cleanup() tries to recursively lock each
2761 	 * devinfo, including our own, while holding vmmdev_mtx.
2762 	 */
2763 	if (mutex_tryenter(&vmmdev_mtx) == 0)
2764 		return (DDI_FAILURE);
2765 
2766 	mutex_enter(&vmm_mtx);
2767 	if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
2768 		mutex_exit(&vmm_mtx);
2769 		mutex_exit(&vmmdev_mtx);
2770 		return (DDI_FAILURE);
2771 	}
2772 	mutex_exit(&vmm_mtx);
2773 
2774 	if (!vmmr_is_empty()) {
2775 		mutex_exit(&vmmdev_mtx);
2776 		return (DDI_FAILURE);
2777 	}
2778 
2779 	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
2780 	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
2781 		mutex_exit(&vmmdev_mtx);
2782 		return (DDI_FAILURE);
2783 	}
2784 	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
2785 
2786 	/* Remove the control node. */
2787 	ddi_remove_minor_node(dip, "ctl");
2788 	vmmdev_dip = NULL;
2789 
2790 	VERIFY0(vmm_mod_unload());
2791 	VERIFY3U(vmmdev_hma_reg, ==, NULL);
2792 	vmm_sol_glue_cleanup();
2793 
2794 	mutex_exit(&vmmdev_mtx);
2795 
2796 	return (DDI_SUCCESS);
2797 }
2798 
2799 static struct cb_ops vmm_cb_ops = {
2800 	vmm_open,
2801 	vmm_close,
2802 	nodev,		/* strategy */
2803 	nodev,		/* print */
2804 	nodev,		/* dump */
2805 	nodev,		/* read */
2806 	nodev,		/* write */
2807 	vmm_ioctl,
2808 	nodev,		/* devmap */
2809 	nodev,		/* mmap */
2810 	vmm_segmap,
2811 	nochpoll,	/* poll */
2812 	ddi_prop_op,
2813 	NULL,
2814 	D_NEW | D_MP | D_DEVMAP
2815 };
2816 
2817 static struct dev_ops vmm_ops = {
2818 	DEVO_REV,
2819 	0,
2820 	vmm_info,
2821 	nulldev,	/* identify */
2822 	nulldev,	/* probe */
2823 	vmm_attach,
2824 	vmm_detach,
2825 	nodev,		/* reset */
2826 	&vmm_cb_ops,
2827 	(struct bus_ops *)NULL
2828 };
2829 
2830 static struct modldrv modldrv = {
2831 	&mod_driverops,
2832 	"bhyve vmm",
2833 	&vmm_ops
2834 };
2835 
2836 static struct modlinkage modlinkage = {
2837 	MODREV_1,
2838 	&modldrv,
2839 	NULL
2840 };
2841 
2842 int
2843 _init(void)
2844 {
2845 	int	error;
2846 
2847 	sysinit();
2848 
2849 	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
2850 	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
2851 	list_create(&vmm_list, sizeof (vmm_softc_t),
2852 	    offsetof(vmm_softc_t, vmm_node));
2853 	list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
2854 	    offsetof(vmm_softc_t, vmm_node));
2855 	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
2856 
2857 	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
2858 	if (error) {
2859 		return (error);
2860 	}
2861 
2862 	vmm_zsd_init();
2863 	vmmr_init();
2864 
2865 	error = mod_install(&modlinkage);
2866 	if (error) {
2867 		ddi_soft_state_fini(&vmm_statep);
2868 		vmm_zsd_fini();
2869 		vmmr_fini();
2870 	}
2871 
2872 	return (error);
2873 }
2874 
2875 int
2876 _fini(void)
2877 {
2878 	int	error;
2879 
2880 	error = mod_remove(&modlinkage);
2881 	if (error) {
2882 		return (error);
2883 	}
2884 
2885 	vmm_zsd_fini();
2886 	vmmr_fini();
2887 
2888 	ddi_soft_state_fini(&vmm_statep);
2889 
2890 	return (0);
2891 }
2892 
2893 int
2894 _info(struct modinfo *modinfop)
2895 {
2896 	return (mod_info(&modlinkage, modinfop));
2897 }
2898