xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_sol_dev.c (revision 4bd36be41e0f25c6061bb4934a8c1048dbbd938e)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2015 Pluribus Networks Inc.
15  * Copyright 2019 Joyent, Inc.
16  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17  * Copyright 2023 Oxide Computer Company
18  */
19 
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35 
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40 
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49 
50 #include <vm/seg_dev.h>
51 
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61 
62 /*
63  * Locking details:
64  *
65  * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
66  * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
67  * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
68  * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
69  * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
70  */
71 
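/*
 * Illustrative sketch (not part of the original source): when an operation
 * needs both sets of protection, the ordering described above looks like:
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	(inspect or update vmmdev_* and vmm_* state)
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 */
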
72 static kmutex_t		vmmdev_mtx;
73 static dev_info_t	*vmmdev_dip;
74 static hma_reg_t	*vmmdev_hma_reg;
75 static uint_t		vmmdev_hma_ref;
76 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
77 
78 static kmutex_t		vmm_mtx;
79 static list_t		vmm_list;
80 static id_space_t	*vmm_minors;
81 static void		*vmm_statep;
82 
83 /*
84  * Until device emulation in bhyve had been adequately scrutinized and tested,
85  * there was (justified) concern that unusual or corrupt device state payloads
86  * could crash the host when loaded via the vmm-data interface.
87  *
88  * Now that those concerns have been mitigated, this protection is loosened to
89  * default-allow, but the switch is left in place, in case there is a need to
90  * once again clamp down on vmm-data writes.
91  */
92 int		vmm_allow_state_writes = 1;
93 
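/*
 * Example (an assumption, not taken from this file): since this is a plain
 * global, the switch can be flipped on a live system with mdb, e.g.:
 *
 *	echo 'vmm_allow_state_writes/W 0' | mdb -kw
 */
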
94 static const char *vmmdev_hvm_name = "bhyve";
95 
96 /* For sdev plugin (/dev) */
97 #define	VMM_SDEV_ROOT "/dev/vmm"
98 
99 /* From uts/intel/io/vmm/intel/vmx.c */
100 extern int vmx_x86_supported(const char **);
101 
102 /* Holds and hooks from drivers external to vmm */
103 struct vmm_hold {
104 	list_node_t	vmh_node;
105 	vmm_softc_t	*vmh_sc;
106 	boolean_t	vmh_release_req;
107 	uint_t		vmh_ioport_hook_cnt;
108 };
109 
110 struct vmm_lease {
111 	list_node_t		vml_node;
112 	struct vm		*vml_vm;
113 	vm_client_t		*vml_vmclient;
114 	boolean_t		vml_expired;
115 	boolean_t		vml_break_deferred;
116 	boolean_t		(*vml_expire_func)(void *);
117 	void			*vml_expire_arg;
118 	struct vmm_hold		*vml_hold;
119 };
120 
121 /* Options for vmm_destroy_locked */
122 typedef enum vmm_destroy_opts {
123 	VDO_DEFAULT		= 0,
124 	/*
125 	 * Indicate that the zone-specific data (ZSD) associated with this VM
126 	 * should not be cleaned up as part of the destroy.  Skipping ZSD
127 	 * clean-up is necessary when the VM is being destroyed as part of
128 	 * zone destruction, since that ZSD is already being cleaned up there.
129 	 */
130 	VDO_NO_CLEAN_ZSD	= (1 << 0),
131 	/*
132 	 * Attempt to wait for VM destruction to complete.  This is opt-in,
133 	 * since there are many normal conditions which could lead to
134 	 * destruction being stalled pending other clean-up.
135 	 */
136 	VDO_ATTEMPT_WAIT	= (1 << 1),
137 } vmm_destroy_opts_t;
138 
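/*
 * Illustrative sketch (not part of the original source): the options above
 * are bit flags and may be combined, e.g. destroying a VM during zone
 * teardown while also attempting to wait for completion:
 *
 *	bool hma_release = false;
 *	(void) vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD | VDO_ATTEMPT_WAIT,
 *	    &hma_release);
 */
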
139 static void vmm_hma_release(void);
140 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *);
141 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
142 static void vmm_lease_block(vmm_softc_t *);
143 static void vmm_lease_unblock(vmm_softc_t *);
144 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
145 static void vmm_kstat_init(vmm_softc_t *);
146 static void vmm_kstat_fini(vmm_softc_t *);
147 
148 /*
149  * The 'devmem' hack:
150  *
151  * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
152  * in the vm which appear with their own name related to the vm under /dev.
153  * Since this would be a hassle from an sdev perspective and would require a
154  * new cdev interface (or complicate the existing one), we choose to implement
155  * this in a different manner.  Direct access to the underlying vm memory
156  * segments is exposed by placing them in a range of offsets beyond the normal
157  * guest memory space.  Userspace can query the appropriate offset to mmap()
158  * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
159  */
160 
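/*
 * Illustrative sketch (not part of the original source): a userspace consumer
 * holding an open descriptor for the instance ('vmfd' and 'seg_len' are
 * assumed names) would look up and map a devmem segment roughly as follows:
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		void *base = mmap(NULL, seg_len, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, vmfd, vdo.offset);
 *	}
 */
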
161 static vmm_devmem_entry_t *
162 vmmdev_devmem_find(vmm_softc_t *sc, int segid)
163 {
164 	vmm_devmem_entry_t *ent = NULL;
165 	list_t *dl = &sc->vmm_devmem_list;
166 
167 	for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
168 		if (ent->vde_segid == segid) {
169 			return (ent);
170 		}
171 	}
172 	return (NULL);
173 }
174 
175 static int
176 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
177 {
178 	int error;
179 	bool sysmem;
180 
181 	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
182 	    NULL);
183 	if (error || mseg->len == 0)
184 		return (error);
185 
186 	if (!sysmem) {
187 		vmm_devmem_entry_t *de;
188 
189 		de = vmmdev_devmem_find(sc, mseg->segid);
190 		if (de != NULL) {
191 			(void) strlcpy(mseg->name, de->vde_name,
192 			    sizeof (mseg->name));
193 		}
194 	} else {
195 		bzero(mseg->name, sizeof (mseg->name));
196 	}
197 
198 	return (error);
199 }
200 
201 static int
202 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
203 {
204 	off_t map_offset;
205 	vmm_devmem_entry_t *entry;
206 
207 	if (list_is_empty(&sc->vmm_devmem_list)) {
208 		map_offset = VM_DEVMEM_START;
209 	} else {
210 		entry = list_tail(&sc->vmm_devmem_list);
211 		map_offset = entry->vde_off + entry->vde_len;
212 		if (map_offset < entry->vde_off) {
213 			/* Do not tolerate overflow */
214 			return (ERANGE);
215 		}
216 		/*
217 		 * XXXJOY: We could choose to search the list for duplicate
218 		 * names and toss an error.  Since we're using the offset
219 		 * method for now, it does not make much of a difference.
220 		 */
221 	}
222 
223 	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
224 	entry->vde_segid = mseg->segid;
225 	entry->vde_len = mseg->len;
226 	entry->vde_off = map_offset;
227 	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
228 	list_insert_tail(&sc->vmm_devmem_list, entry);
229 
230 	return (0);
231 }
232 
233 static boolean_t
234 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
235     off_t *map_offp)
236 {
237 	list_t *dl = &sc->vmm_devmem_list;
238 	vmm_devmem_entry_t *de = NULL;
239 	const off_t map_end = off + len;
240 
241 	VERIFY(off >= VM_DEVMEM_START);
242 
243 	if (map_end < off) {
244 		/* No match on overflow */
245 		return (B_FALSE);
246 	}
247 
248 	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
249 		const off_t item_end = de->vde_off + de->vde_len;
250 
251 		if (de->vde_off <= off && item_end >= map_end) {
252 			*segidp = de->vde_segid;
253 			*map_offp = off - de->vde_off;
254 			return (B_TRUE);
255 		}
256 	}
257 	return (B_FALSE);
258 }
259 
260 /*
261  * When an instance is being destroyed, the devmem list of named memory objects
262  * can be torn down, as no new mappings are allowed.
263  */
264 static void
265 vmmdev_devmem_purge(vmm_softc_t *sc)
266 {
267 	vmm_devmem_entry_t *entry;
268 
269 	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
270 		kmem_free(entry, sizeof (*entry));
271 	}
272 }
273 
274 static int
275 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
276 {
277 	int error;
278 	bool sysmem = true;
279 
280 	if (VM_MEMSEG_NAME(mseg)) {
281 		sysmem = false;
282 	}
283 	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
284 
285 	if (error == 0) {
286 		/*
287 		 * Rather than create a whole fresh device from which userspace
288 		 * can mmap this segment, instead make it available at an
289 		 * offset above where the main guest memory resides.
290 		 */
291 		error = vmmdev_devmem_create(sc, mseg, mseg->name);
292 		if (error != 0) {
293 			vm_free_memseg(sc->vmm_vm, mseg->segid);
294 		}
295 	}
296 	return (error);
297 }
298 
299 /*
300  * Resource Locking and Exclusion
301  *
302  * Much of bhyve depends on key portions of VM state, such as the guest memory
303  * map, to remain unchanged while the guest is running.  As ported from
304  * FreeBSD, the initial strategy for this resource exclusion hinged on gating
305  * access to the instance vCPUs.  Threads acting on a single vCPU, like those
306  * performing the work of actually running the guest in VMX/SVM, would lock
307  * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
308  * state, all of the vCPUs would be first locked, ensuring that the
309  * operation(s) could complete without any other threads stumbling into
310  * intermediate states.
311  *
312  * This approach is largely effective for bhyve.  Common operations, such as
313  * running the vCPUs, steer clear of lock contention.  The model begins to
314  * break down for operations which do not occur in the context of a specific
315  * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
316  * thread in the bhyve process.  In order to properly protect those vCPU-less
317  * operations from encountering invalid states, additional locking is required.
318  * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
319  * It does mean that class of operations will be serialized on locking the
320  * specific vCPU and that instances sized at VM_MAXCPU will potentially see
321  * undue contention on the VM_MAXCPU-1 vCPU.
322  *
323  * In order to address the shortcomings of this model, the concept of a
324  * read/write lock has been added to bhyve.  Operations which change
325  * fundamental aspects of a VM (such as the memory map) must acquire the write
326  * lock, which also implies locking all of the vCPUs and waiting for all read
327  * lock holders to release.  While it increases the cost and waiting time for
328  * those few operations, it allows most hot-path operations on the VM (which
329  * depend on its configuration remaining stable) to occur with minimal locking.
330  *
331  * Consumers of the Driver API (see below) are a special case when it comes to
332  * this locking, since they may hold a read lock via the drv_lease mechanism
333  * for an extended period of time.  Rather than forcing those consumers to
334  * continuously poll for a write lock attempt, the lease system forces them to
335  * provide a release callback to trigger their clean-up (and potential later
336  * reacquisition) of the read lock.
337  */
338 
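/*
 * Illustrative sketch (not part of the original source) of how the locking
 * functions below are used by vmmdev_do_ioctl(): each ioctl acquires only the
 * exclusion it needs before acting on the VM.
 *
 *	vmm_read_lock(sc);		(config readers, e.g. VM_GET_MEMSEG)
 *	error = vmmdev_get_memseg(sc, &vmseg);
 *	vmm_read_unlock(sc);
 *
 *	vmm_write_lock(sc);		(config writers, e.g. VM_ALLOC_MEMSEG)
 *	error = vmmdev_alloc_memseg(sc, &vmseg);
 *	vmm_write_unlock(sc);
 */
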
339 static void
340 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
341 {
342 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
343 
344 	/*
345 	 * Since this state transition is utilizing from_idle=true, it should
346 	 * not fail, but rather block until it can be successful.
347 	 */
348 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
349 }
350 
351 static void
352 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
353 {
354 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
355 
356 	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
357 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
358 }
359 
360 static void
361 vmm_read_lock(vmm_softc_t *sc)
362 {
363 	rw_enter(&sc->vmm_rwlock, RW_READER);
364 }
365 
366 static void
367 vmm_read_unlock(vmm_softc_t *sc)
368 {
369 	rw_exit(&sc->vmm_rwlock);
370 }
371 
372 static void
373 vmm_write_lock(vmm_softc_t *sc)
374 {
375 	int maxcpus;
376 
377 	/* First lock all the vCPUs */
378 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
379 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
380 		vcpu_lock_one(sc, vcpu);
381 	}
382 
383 	/*
384 	 * Block vmm_drv leases from being acquired or held while the VM write
385 	 * lock is held.
386 	 */
387 	vmm_lease_block(sc);
388 
389 	rw_enter(&sc->vmm_rwlock, RW_WRITER);
390 	/*
391 	 * For now, the 'maxcpus' value for an instance is fixed at the
392 	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
393 	 * the future, allowing for dynamic vCPU resource sizing, acquisition
394 	 * of the write lock will need to be wary of such changes.
395 	 */
396 	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
397 }
398 
399 static void
400 vmm_write_unlock(vmm_softc_t *sc)
401 {
402 	int maxcpus;
403 
404 	/* Allow vmm_drv leases to be acquired once write lock is dropped */
405 	vmm_lease_unblock(sc);
406 
407 	/*
408 	 * The VM write lock _must_ be released from the same thread it was
409 	 * acquired in, unlike the read lock.
410 	 */
411 	VERIFY(rw_write_held(&sc->vmm_rwlock));
412 	rw_exit(&sc->vmm_rwlock);
413 
414 	/* Unlock all the vCPUs */
415 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
416 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
417 		vcpu_unlock_one(sc, vcpu);
418 	}
419 }
420 
421 static int
422 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
423     cred_t *credp, int *rvalp)
424 {
425 	int error = 0, vcpu = -1;
426 	void *datap = (void *)arg;
427 	enum vm_lock_type {
428 		LOCK_NONE = 0,
429 		LOCK_VCPU,
430 		LOCK_READ_HOLD,
431 		LOCK_WRITE_HOLD
432 	} lock_type = LOCK_NONE;
433 
434 	/* Acquire any exclusion resources needed for the operation. */
435 	switch (cmd) {
436 	case VM_RUN:
437 	case VM_GET_REGISTER:
438 	case VM_SET_REGISTER:
439 	case VM_GET_SEGMENT_DESCRIPTOR:
440 	case VM_SET_SEGMENT_DESCRIPTOR:
441 	case VM_GET_REGISTER_SET:
442 	case VM_SET_REGISTER_SET:
443 	case VM_INJECT_EXCEPTION:
444 	case VM_GET_CAPABILITY:
445 	case VM_SET_CAPABILITY:
446 	case VM_PPTDEV_MSI:
447 	case VM_PPTDEV_MSIX:
448 	case VM_SET_X2APIC_STATE:
449 	case VM_GLA2GPA:
450 	case VM_GLA2GPA_NOFAULT:
451 	case VM_ACTIVATE_CPU:
452 	case VM_SET_INTINFO:
453 	case VM_GET_INTINFO:
454 	case VM_RESTART_INSTRUCTION:
455 	case VM_SET_KERNEMU_DEV:
456 	case VM_GET_KERNEMU_DEV:
457 	case VM_RESET_CPU:
458 	case VM_GET_RUN_STATE:
459 	case VM_SET_RUN_STATE:
460 	case VM_GET_FPU:
461 	case VM_SET_FPU:
462 	case VM_GET_CPUID:
463 	case VM_SET_CPUID:
464 	case VM_LEGACY_CPUID:
465 		/*
466 		 * Copy in the ID of the vCPU chosen for this operation.
467 		 * Since a nefarious caller could update their struct between
468 		 * this locking and when the rest of the ioctl data is copied
469 		 * in, it is _critical_ that this local 'vcpu' variable be used
470 		 * rather than the in-struct one when performing the ioctl.
471 		 */
472 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
473 			return (EFAULT);
474 		}
475 		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
476 			return (EINVAL);
477 		}
478 		vcpu_lock_one(sc, vcpu);
479 		lock_type = LOCK_VCPU;
480 		break;
481 
482 	case VM_REINIT:
483 	case VM_BIND_PPTDEV:
484 	case VM_UNBIND_PPTDEV:
485 	case VM_MAP_PPTDEV_MMIO:
486 	case VM_UNMAP_PPTDEV_MMIO:
487 	case VM_ALLOC_MEMSEG:
488 	case VM_MMAP_MEMSEG:
489 	case VM_MUNMAP_MEMSEG:
490 	case VM_WRLOCK_CYCLE:
491 	case VM_PMTMR_LOCATE:
492 	case VM_PAUSE:
493 	case VM_RESUME:
494 		vmm_write_lock(sc);
495 		lock_type = LOCK_WRITE_HOLD;
496 		break;
497 
498 	case VM_GET_MEMSEG:
499 	case VM_MMAP_GETNEXT:
500 	case VM_LAPIC_IRQ:
501 	case VM_INJECT_NMI:
502 	case VM_IOAPIC_ASSERT_IRQ:
503 	case VM_IOAPIC_DEASSERT_IRQ:
504 	case VM_IOAPIC_PULSE_IRQ:
505 	case VM_LAPIC_MSI:
506 	case VM_LAPIC_LOCAL_IRQ:
507 	case VM_GET_X2APIC_STATE:
508 	case VM_RTC_READ:
509 	case VM_RTC_WRITE:
510 	case VM_RTC_SETTIME:
511 	case VM_RTC_GETTIME:
512 	case VM_PPTDEV_DISABLE_MSIX:
513 	case VM_DEVMEM_GETOFFSET:
514 	case VM_TRACK_DIRTY_PAGES:
515 	case VM_NPT_OPERATION:
516 		vmm_read_lock(sc);
517 		lock_type = LOCK_READ_HOLD;
518 		break;
519 
520 	case VM_DATA_READ:
521 	case VM_DATA_WRITE:
522 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
523 			return (EFAULT);
524 		}
525 		if (vcpu == -1) {
526 			/* Access data for VM-wide devices */
527 			vmm_write_lock(sc);
528 			lock_type = LOCK_WRITE_HOLD;
529 		} else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) {
530 			/* Access data associated with a specific vCPU */
531 			vcpu_lock_one(sc, vcpu);
532 			lock_type = LOCK_VCPU;
533 		} else {
534 			return (EINVAL);
535 		}
536 		break;
537 
538 	case VM_GET_GPA_PMAP:
539 	case VM_IOAPIC_PINCOUNT:
540 	case VM_SUSPEND:
541 	case VM_DESC_FPU_AREA:
542 	case VM_SET_AUTODESTRUCT:
543 	case VM_DESTROY_SELF:
544 	case VM_DESTROY_PENDING:
545 	case VM_VCPU_BARRIER:
546 	default:
547 		break;
548 	}
549 
550 	/* Execute the primary logic for the ioctl. */
551 	switch (cmd) {
552 	case VM_RUN: {
553 		struct vm_entry entry;
554 
555 		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
556 			error = EFAULT;
557 			break;
558 		}
559 
560 		if (!(curthread->t_schedflag & TS_VCPU))
561 			smt_mark_as_vcpu();
562 
563 		error = vm_run(sc->vmm_vm, vcpu, &entry);
564 
565 		/*
566 		 * Unexpected states in vm_run() are expressed through positive
567 		 * errno-oriented return values.  VM states which expect further
568 		 * processing in userspace (necessary context via exitinfo) are
569 		 * expressed through negative return values.  For the time being
570 		 * a return value of 0 is not expected from vm_run().
571 		 */
572 		ASSERT(error != 0);
573 		if (error < 0) {
574 			const struct vm_exit *vme;
575 			void *outp = entry.exit_data;
576 
577 			error = 0;
578 			vme = vm_exitinfo(sc->vmm_vm, vcpu);
579 			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
580 				error = EFAULT;
581 			}
582 		}
583 		break;
584 	}
585 	case VM_SUSPEND: {
586 		struct vm_suspend vmsuspend;
587 
588 		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
589 			error = EFAULT;
590 			break;
591 		}
592 		error = vm_suspend(sc->vmm_vm, vmsuspend.how, vmsuspend.source);
593 		break;
594 	}
595 	case VM_REINIT: {
596 		struct vm_reinit reinit;
597 
598 		if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
599 			error = EFAULT;
600 			break;
601 		}
602 		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
603 			/*
604 			 * The VM instance should be free of driver-attached
605 			 * hooks during the reinitialization process.
606 			 */
607 			break;
608 		}
609 		error = vm_reinit(sc->vmm_vm, reinit.flags);
610 		(void) vmm_drv_block_hook(sc, B_FALSE);
611 		break;
612 	}
613 	case VM_STAT_DESC: {
614 		struct vm_stat_desc statdesc;
615 
616 		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
617 			error = EFAULT;
618 			break;
619 		}
620 		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
621 		    sizeof (statdesc.desc));
622 		if (error == 0 &&
623 		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
624 			error = EFAULT;
625 			break;
626 		}
627 		break;
628 	}
629 	case VM_STATS_IOC: {
630 		struct vm_stats vmstats;
631 
632 		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
633 			error = EFAULT;
634 			break;
635 		}
636 		hrt2tv(gethrtime(), &vmstats.tv);
637 		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index,
638 		    nitems(vmstats.statbuf),
639 		    &vmstats.num_entries, vmstats.statbuf);
640 		if (error == 0 &&
641 		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
642 			error = EFAULT;
643 			break;
644 		}
645 		break;
646 	}
647 
648 	case VM_PPTDEV_MSI: {
649 		struct vm_pptdev_msi pptmsi;
650 
651 		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
652 			error = EFAULT;
653 			break;
654 		}
655 		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
656 		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
657 		break;
658 	}
659 	case VM_PPTDEV_MSIX: {
660 		struct vm_pptdev_msix pptmsix;
661 
662 		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
663 			error = EFAULT;
664 			break;
665 		}
666 		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
667 		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
668 		    pptmsix.vector_control);
669 		break;
670 	}
671 	case VM_PPTDEV_DISABLE_MSIX: {
672 		struct vm_pptdev pptdev;
673 
674 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
675 			error = EFAULT;
676 			break;
677 		}
678 		error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
679 		break;
680 	}
681 	case VM_MAP_PPTDEV_MMIO: {
682 		struct vm_pptdev_mmio pptmmio;
683 
684 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
685 			error = EFAULT;
686 			break;
687 		}
688 		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
689 		    pptmmio.len, pptmmio.hpa);
690 		break;
691 	}
692 	case VM_UNMAP_PPTDEV_MMIO: {
693 		struct vm_pptdev_mmio pptmmio;
694 
695 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
696 			error = EFAULT;
697 			break;
698 		}
699 		error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
700 		    pptmmio.len);
701 		break;
702 	}
703 	case VM_BIND_PPTDEV: {
704 		struct vm_pptdev pptdev;
705 
706 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
707 			error = EFAULT;
708 			break;
709 		}
710 		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
711 		break;
712 	}
713 	case VM_UNBIND_PPTDEV: {
714 		struct vm_pptdev pptdev;
715 
716 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
717 			error = EFAULT;
718 			break;
719 		}
720 		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
721 		break;
722 	}
723 	case VM_GET_PPTDEV_LIMITS: {
724 		struct vm_pptdev_limits pptlimits;
725 
726 		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
727 			error = EFAULT;
728 			break;
729 		}
730 		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
731 		    &pptlimits.msi_limit, &pptlimits.msix_limit);
732 		if (error == 0 &&
733 		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
734 			error = EFAULT;
735 			break;
736 		}
737 		break;
738 	}
739 	case VM_INJECT_EXCEPTION: {
740 		struct vm_exception vmexc;
741 		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
742 			error = EFAULT;
743 			break;
744 		}
745 		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
746 		    vmexc.error_code_valid != 0, vmexc.error_code,
747 		    vmexc.restart_instruction != 0);
748 		break;
749 	}
750 	case VM_INJECT_NMI: {
751 		struct vm_nmi vmnmi;
752 
753 		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
754 			error = EFAULT;
755 			break;
756 		}
757 		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
758 		break;
759 	}
760 	case VM_LAPIC_IRQ: {
761 		struct vm_lapic_irq vmirq;
762 
763 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
764 			error = EFAULT;
765 			break;
766 		}
767 		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
768 		break;
769 	}
770 	case VM_LAPIC_LOCAL_IRQ: {
771 		struct vm_lapic_irq vmirq;
772 
773 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
774 			error = EFAULT;
775 			break;
776 		}
777 		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
778 		    vmirq.vector);
779 		break;
780 	}
781 	case VM_LAPIC_MSI: {
782 		struct vm_lapic_msi vmmsi;
783 
784 		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
785 			error = EFAULT;
786 			break;
787 		}
788 		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
789 		break;
790 	}
791 
792 	case VM_IOAPIC_ASSERT_IRQ: {
793 		struct vm_ioapic_irq ioapic_irq;
794 
795 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
796 			error = EFAULT;
797 			break;
798 		}
799 		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
800 		break;
801 	}
802 	case VM_IOAPIC_DEASSERT_IRQ: {
803 		struct vm_ioapic_irq ioapic_irq;
804 
805 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
806 			error = EFAULT;
807 			break;
808 		}
809 		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
810 		break;
811 	}
812 	case VM_IOAPIC_PULSE_IRQ: {
813 		struct vm_ioapic_irq ioapic_irq;
814 
815 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
816 			error = EFAULT;
817 			break;
818 		}
819 		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
820 		break;
821 	}
822 	case VM_IOAPIC_PINCOUNT: {
823 		int pincount;
824 
825 		pincount = vioapic_pincount(sc->vmm_vm);
826 		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
827 			error = EFAULT;
828 			break;
829 		}
830 		break;
831 	}
832 	case VM_DESC_FPU_AREA: {
833 		struct vm_fpu_desc desc;
834 		void *buf = NULL;
835 
836 		if (ddi_copyin(datap, &desc, sizeof (desc), md)) {
837 			error = EFAULT;
838 			break;
839 		}
840 		if (desc.vfd_num_entries > 64) {
841 			error = EINVAL;
842 			break;
843 		}
844 		const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) *
845 		    desc.vfd_num_entries;
846 		if (buf_sz != 0) {
847 			buf = kmem_zalloc(buf_sz, KM_SLEEP);
848 		}
849 
850 		/*
851 		 * For now, we are depending on vm_fpu_desc_entry and
852 		 * hma_xsave_state_desc_t having the same format.
853 		 */
854 		CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
855 		    sizeof (hma_xsave_state_desc_t));
856 
857 		size_t req_size;
858 		const uint_t max_entries = hma_fpu_describe_xsave_state(
859 		    (hma_xsave_state_desc_t *)buf,
860 		    desc.vfd_num_entries,
861 		    &req_size);
862 
863 		desc.vfd_req_size = req_size;
864 		desc.vfd_num_entries = max_entries;
865 		if (buf_sz != 0) {
866 			if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
867 				error = EFAULT;
868 			}
869 			kmem_free(buf, buf_sz);
870 		}
871 
872 		if (error == 0) {
873 			if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
874 				error = EFAULT;
875 			}
876 		}
877 		break;
878 	}
879 	case VM_SET_AUTODESTRUCT: {
880 		/*
881 		 * Since this has to do with controlling the lifetime of the
882 		 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather
883 		 * than the vcpu-centric or rwlock exclusion mechanisms.
884 		 */
885 		mutex_enter(&vmm_mtx);
886 		if (arg != 0) {
887 			sc->vmm_flags |= VMM_AUTODESTROY;
888 		} else {
889 			sc->vmm_flags &= ~VMM_AUTODESTROY;
890 		}
891 		mutex_exit(&vmm_mtx);
892 		break;
893 	}
894 	case VM_DESTROY_SELF: {
895 		bool hma_release = false;
896 
897 		/*
898 		 * Just like VMM_DESTROY_VM, but on the instance file descriptor
899 		 * itself, rather than having to perform a racy name lookup as
900 		 * part of the destroy process.
901 		 *
902 		 * Since vmm_destroy_locked() performs vCPU lock acquisition in
903 		 * order to kick the vCPUs out of guest context as part of any
904 		 * destruction, we do not need to worry about it ourselves using
905 		 * the `lock_type` logic here.
906 		 */
907 		mutex_enter(&vmm_mtx);
908 		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
909 		mutex_exit(&vmm_mtx);
910 		if (hma_release) {
911 			vmm_hma_release();
912 		}
913 		break;
914 	}
915 	case VM_DESTROY_PENDING: {
916 		/*
917 		 * If we have made it this far, then destruction of the instance
918 		 * has not been initiated.
919 		 */
920 		*rvalp = 0;
921 		break;
922 	}
923 
924 	case VM_ISA_ASSERT_IRQ: {
925 		struct vm_isa_irq isa_irq;
926 
927 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
928 			error = EFAULT;
929 			break;
930 		}
931 		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
932 		if (error == 0 && isa_irq.ioapic_irq != -1) {
933 			error = vioapic_assert_irq(sc->vmm_vm,
934 			    isa_irq.ioapic_irq);
935 		}
936 		break;
937 	}
938 	case VM_ISA_DEASSERT_IRQ: {
939 		struct vm_isa_irq isa_irq;
940 
941 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
942 			error = EFAULT;
943 			break;
944 		}
945 		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
946 		if (error == 0 && isa_irq.ioapic_irq != -1) {
947 			error = vioapic_deassert_irq(sc->vmm_vm,
948 			    isa_irq.ioapic_irq);
949 		}
950 		break;
951 	}
952 	case VM_ISA_PULSE_IRQ: {
953 		struct vm_isa_irq isa_irq;
954 
955 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
956 			error = EFAULT;
957 			break;
958 		}
959 		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
960 		if (error == 0 && isa_irq.ioapic_irq != -1) {
961 			error = vioapic_pulse_irq(sc->vmm_vm,
962 			    isa_irq.ioapic_irq);
963 		}
964 		break;
965 	}
966 	case VM_ISA_SET_IRQ_TRIGGER: {
967 		struct vm_isa_irq_trigger isa_irq_trigger;
968 
969 		if (ddi_copyin(datap, &isa_irq_trigger,
970 		    sizeof (isa_irq_trigger), md)) {
971 			error = EFAULT;
972 			break;
973 		}
974 		error = vatpic_set_irq_trigger(sc->vmm_vm,
975 		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
976 		break;
977 	}
978 
979 	case VM_MMAP_GETNEXT: {
980 		struct vm_memmap mm;
981 
982 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
983 			error = EFAULT;
984 			break;
985 		}
986 		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
987 		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
988 		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
989 			error = EFAULT;
990 			break;
991 		}
992 		break;
993 	}
994 	case VM_MMAP_MEMSEG: {
995 		struct vm_memmap mm;
996 
997 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
998 			error = EFAULT;
999 			break;
1000 		}
1001 		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
1002 		    mm.len, mm.prot, mm.flags);
1003 		break;
1004 	}
1005 	case VM_MUNMAP_MEMSEG: {
1006 		struct vm_munmap mu;
1007 
1008 		if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
1009 			error = EFAULT;
1010 			break;
1011 		}
1012 		error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
1013 		break;
1014 	}
1015 	case VM_ALLOC_MEMSEG: {
1016 		struct vm_memseg vmseg;
1017 
1018 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1019 			error = EFAULT;
1020 			break;
1021 		}
1022 		error = vmmdev_alloc_memseg(sc, &vmseg);
1023 		break;
1024 	}
1025 	case VM_GET_MEMSEG: {
1026 		struct vm_memseg vmseg;
1027 
1028 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1029 			error = EFAULT;
1030 			break;
1031 		}
1032 		error = vmmdev_get_memseg(sc, &vmseg);
1033 		if (error == 0 &&
1034 		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
1035 			error = EFAULT;
1036 			break;
1037 		}
1038 		break;
1039 	}
1040 	case VM_GET_REGISTER: {
1041 		struct vm_register vmreg;
1042 
1043 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1044 			error = EFAULT;
1045 			break;
1046 		}
1047 		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
1048 		    &vmreg.regval);
1049 		if (error == 0 &&
1050 		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
1051 			error = EFAULT;
1052 			break;
1053 		}
1054 		break;
1055 	}
1056 	case VM_SET_REGISTER: {
1057 		struct vm_register vmreg;
1058 
1059 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1060 			error = EFAULT;
1061 			break;
1062 		}
1063 		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
1064 		    vmreg.regval);
1065 		break;
1066 	}
1067 	case VM_SET_SEGMENT_DESCRIPTOR: {
1068 		struct vm_seg_desc vmsegd;
1069 
1070 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1071 			error = EFAULT;
1072 			break;
1073 		}
1074 		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1075 		    &vmsegd.desc);
1076 		break;
1077 	}
1078 	case VM_GET_SEGMENT_DESCRIPTOR: {
1079 		struct vm_seg_desc vmsegd;
1080 
1081 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1082 			error = EFAULT;
1083 			break;
1084 		}
1085 		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1086 		    &vmsegd.desc);
1087 		if (error == 0 &&
1088 		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
1089 			error = EFAULT;
1090 			break;
1091 		}
1092 		break;
1093 	}
1094 	case VM_GET_REGISTER_SET: {
1095 		struct vm_register_set vrs;
1096 		int regnums[VM_REG_LAST];
1097 		uint64_t regvals[VM_REG_LAST];
1098 
1099 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1100 			error = EFAULT;
1101 			break;
1102 		}
1103 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1104 			error = EINVAL;
1105 			break;
1106 		}
1107 		if (ddi_copyin(vrs.regnums, regnums,
1108 		    sizeof (int) * vrs.count, md)) {
1109 			error = EFAULT;
1110 			break;
1111 		}
1112 
1113 		error = 0;
1114 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1115 			if (regnums[i] < 0) {
1116 				error = EINVAL;
1117 				break;
1118 			}
1119 			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
1120 			    &regvals[i]);
1121 		}
1122 		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
1123 		    sizeof (uint64_t) * vrs.count, md)) {
1124 			error = EFAULT;
1125 		}
1126 		break;
1127 	}
1128 	case VM_SET_REGISTER_SET: {
1129 		struct vm_register_set vrs;
1130 		int regnums[VM_REG_LAST];
1131 		uint64_t regvals[VM_REG_LAST];
1132 
1133 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1134 			error = EFAULT;
1135 			break;
1136 		}
1137 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1138 			error = EINVAL;
1139 			break;
1140 		}
1141 		if (ddi_copyin(vrs.regnums, regnums,
1142 		    sizeof (int) * vrs.count, md)) {
1143 			error = EFAULT;
1144 			break;
1145 		}
1146 		if (ddi_copyin(vrs.regvals, regvals,
1147 		    sizeof (uint64_t) * vrs.count, md)) {
1148 			error = EFAULT;
1149 			break;
1150 		}
1151 
1152 		error = 0;
1153 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1154 			/*
1155 			 * Setting registers in a set is not atomic, since a
1156 			 * failure in the middle of the set will cause a
1157 			 * bail-out and inconsistent register state.  Callers
1158 			 * should be wary of this.
1159 			 */
1160 			if (regnums[i] < 0) {
1161 				error = EINVAL;
1162 				break;
1163 			}
1164 			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1165 			    regvals[i]);
1166 		}
1167 		break;
1168 	}
1169 	case VM_RESET_CPU: {
1170 		struct vm_vcpu_reset vvr;
1171 
1172 		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1173 			error = EFAULT;
1174 			break;
1175 		}
1176 		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1177 			error = EINVAL;
1178 		}
1179 
1180 		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1181 		break;
1182 	}
1183 	case VM_GET_RUN_STATE: {
1184 		struct vm_run_state vrs;
1185 
1186 		bzero(&vrs, sizeof (vrs));
1187 		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1188 		    &vrs.sipi_vector);
1189 		if (error == 0) {
1190 			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1191 				error = EFAULT;
1192 				break;
1193 			}
1194 		}
1195 		break;
1196 	}
1197 	case VM_SET_RUN_STATE: {
1198 		struct vm_run_state vrs;
1199 
1200 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1201 			error = EFAULT;
1202 			break;
1203 		}
1204 		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1205 		    vrs.sipi_vector);
1206 		break;
1207 	}
1208 	case VM_GET_FPU: {
1209 		struct vm_fpu_state req;
1210 		const size_t max_len = (PAGESIZE * 2);
1211 		void *kbuf;
1212 
1213 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1214 			error = EFAULT;
1215 			break;
1216 		}
1217 		if (req.len > max_len || req.len == 0) {
1218 			error = EINVAL;
1219 			break;
1220 		}
1221 		kbuf = kmem_zalloc(req.len, KM_SLEEP);
1222 		error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1223 		if (error == 0) {
1224 			if (ddi_copyout(kbuf, req.buf, req.len, md)) {
1225 				error = EFAULT;
1226 			}
1227 		}
1228 		kmem_free(kbuf, req.len);
1229 		break;
1230 	}
1231 	case VM_SET_FPU: {
1232 		struct vm_fpu_state req;
1233 		const size_t max_len = (PAGESIZE * 2);
1234 		void *kbuf;
1235 
1236 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1237 			error = EFAULT;
1238 			break;
1239 		}
1240 		if (req.len > max_len || req.len == 0) {
1241 			error = EINVAL;
1242 			break;
1243 		}
1244 		kbuf = kmem_alloc(req.len, KM_SLEEP);
1245 		if (ddi_copyin(req.buf, kbuf, req.len, md)) {
1246 			error = EFAULT;
1247 		} else {
1248 			error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1249 		}
1250 		kmem_free(kbuf, req.len);
1251 		break;
1252 	}
1253 	case VM_GET_CPUID: {
1254 		struct vm_vcpu_cpuid_config cfg;
1255 		struct vcpu_cpuid_entry *entries = NULL;
1256 
1257 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1258 			error = EFAULT;
1259 			break;
1260 		}
1261 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1262 			error = EINVAL;
1263 			break;
1264 		}
1265 
1266 		const size_t entries_size =
1267 		    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1268 		if (entries_size != 0) {
1269 			entries = kmem_zalloc(entries_size, KM_SLEEP);
1270 		}
1271 
1272 		vcpu_cpuid_config_t vm_cfg = {
1273 			.vcc_nent = cfg.vvcc_nent,
1274 			.vcc_entries = entries,
1275 		};
1276 		error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1277 
1278 		/*
1279 		 * Only attempt to copy out the resultant entries if we were
1280 		 * able to query them from the instance.  The flags and number
1281 		 * of entries are emitted regardless.
1282 		 */
1283 		cfg.vvcc_flags = vm_cfg.vcc_flags;
1284 		cfg.vvcc_nent = vm_cfg.vcc_nent;
1285 		if (entries != NULL) {
1286 			if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries,
1287 			    entries_size, md) != 0) {
1288 				error = EFAULT;
1289 			}
1290 
1291 			kmem_free(entries, entries_size);
1292 		}
1293 
1294 		if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) {
1295 			error = EFAULT;
1296 		}
1297 		break;
1298 	}
1299 	case VM_SET_CPUID: {
1300 		struct vm_vcpu_cpuid_config cfg;
1301 		struct vcpu_cpuid_entry *entries = NULL;
1302 		size_t entries_size = 0;
1303 
1304 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1305 			error = EFAULT;
1306 			break;
1307 		}
1308 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1309 			error = EFBIG;
1310 			break;
1311 		}
1312 		if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) {
1313 			/*
1314 			 * If we are being instructed to use "legacy" handling,
1315 			 * then no entries should be provided, since the static
1316 			 * in-kernel masking will be used.
1317 			 */
1318 			if (cfg.vvcc_nent != 0) {
1319 				error = EINVAL;
1320 				break;
1321 			}
1322 		} else if (cfg.vvcc_nent != 0) {
1323 			entries_size =
1324 			    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1325 			entries = kmem_alloc(entries_size, KM_SLEEP);
1326 
1327 			if (ddi_copyin(cfg.vvcc_entries, entries, entries_size,
1328 			    md) != 0) {
1329 				error = EFAULT;
1330 				kmem_free(entries, entries_size);
1331 				break;
1332 			}
1333 		}
1334 
1335 		vcpu_cpuid_config_t vm_cfg = {
1336 			.vcc_flags = cfg.vvcc_flags,
1337 			.vcc_nent = cfg.vvcc_nent,
1338 			.vcc_entries = entries,
1339 		};
1340 		error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1341 
1342 		if (entries != NULL) {
1343 			kmem_free(entries, entries_size);
1344 		}
1345 		break;
1346 	}
1347 	case VM_LEGACY_CPUID: {
1348 		struct vm_legacy_cpuid vlc;
1349 		if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) {
1350 			error = EFAULT;
1351 			break;
1352 		}
1353 		vlc.vlc_vcpuid = vcpu;
1354 
1355 		legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax,
1356 		    &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx);
1357 
1358 		if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) {
1359 			error = EFAULT;
1360 			break;
1361 		}
1362 		break;
1363 	}
1364 
1365 	case VM_SET_KERNEMU_DEV:
1366 	case VM_GET_KERNEMU_DEV: {
1367 		struct vm_readwrite_kernemu_device kemu;
1368 		size_t size = 0;
1369 
1370 		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1371 			error = EFAULT;
1372 			break;
1373 		}
1374 
1375 		if (kemu.access_width > 3) {
1376 			error = EINVAL;
1377 			break;
1378 		}
1379 		size = (1 << kemu.access_width);
1380 		ASSERT(size >= 1 && size <= 8);
1381 
1382 		if (cmd == VM_SET_KERNEMU_DEV) {
1383 			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1384 			    kemu.gpa, kemu.value, size);
1385 		} else {
1386 			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1387 			    kemu.gpa, &kemu.value, size);
1388 		}
1389 
1390 		if (error == 0) {
1391 			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1392 				error = EFAULT;
1393 				break;
1394 			}
1395 		}
1396 		break;
1397 	}
1398 
1399 	case VM_GET_CAPABILITY: {
1400 		struct vm_capability vmcap;
1401 
1402 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1403 			error = EFAULT;
1404 			break;
1405 		}
1406 		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1407 		    &vmcap.capval);
1408 		if (error == 0 &&
1409 		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1410 			error = EFAULT;
1411 			break;
1412 		}
1413 		break;
1414 	}
1415 	case VM_SET_CAPABILITY: {
1416 		struct vm_capability vmcap;
1417 
1418 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1419 			error = EFAULT;
1420 			break;
1421 		}
1422 		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1423 		    vmcap.capval);
1424 		break;
1425 	}
1426 	case VM_SET_X2APIC_STATE: {
1427 		struct vm_x2apic x2apic;
1428 
1429 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1430 			error = EFAULT;
1431 			break;
1432 		}
1433 		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1434 		break;
1435 	}
1436 	case VM_GET_X2APIC_STATE: {
1437 		struct vm_x2apic x2apic;
1438 
1439 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1440 			error = EFAULT;
1441 			break;
1442 		}
1443 		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1444 		    &x2apic.state);
1445 		if (error == 0 &&
1446 		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1447 			error = EFAULT;
1448 			break;
1449 		}
1450 		break;
1451 	}
1452 	case VM_GET_GPA_PMAP: {
1453 		/*
1454 		 * Until there is a necessity to leak EPT/RVI PTE values to
1455 		 * userspace, this will remain unimplemented
1456 		 */
1457 		error = EINVAL;
1458 		break;
1459 	}
1460 	case VM_GET_HPET_CAPABILITIES: {
1461 		struct vm_hpet_cap hpetcap;
1462 
1463 		error = vhpet_getcap(&hpetcap);
1464 		if (error == 0 &&
1465 		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1466 			error = EFAULT;
1467 			break;
1468 		}
1469 		break;
1470 	}
1471 	case VM_GLA2GPA: {
1472 		struct vm_gla2gpa gg;
1473 
1474 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1475 			error = EFAULT;
1476 			break;
1477 		}
1478 		gg.vcpuid = vcpu;
1479 		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1480 		    gg.prot, &gg.gpa, &gg.fault);
1481 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1482 			error = EFAULT;
1483 			break;
1484 		}
1485 		break;
1486 	}
1487 	case VM_GLA2GPA_NOFAULT: {
1488 		struct vm_gla2gpa gg;
1489 
1490 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1491 			error = EFAULT;
1492 			break;
1493 		}
1494 		gg.vcpuid = vcpu;
1495 		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1496 		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
1497 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1498 			error = EFAULT;
1499 			break;
1500 		}
1501 		break;
1502 	}
1503 
1504 	case VM_ACTIVATE_CPU:
1505 		error = vm_activate_cpu(sc->vmm_vm, vcpu);
1506 		break;
1507 
1508 	case VM_SUSPEND_CPU:
1509 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1510 			error = EFAULT;
1511 		} else {
1512 			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1513 		}
1514 		break;
1515 
1516 	case VM_RESUME_CPU:
1517 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1518 			error = EFAULT;
1519 		} else {
1520 			error = vm_resume_cpu(sc->vmm_vm, vcpu);
1521 		}
1522 		break;
1523 
1524 	case VM_VCPU_BARRIER:
1525 		vcpu = arg;
1526 		error = vm_vcpu_barrier(sc->vmm_vm, vcpu);
1527 		break;
1528 
1529 	case VM_GET_CPUS: {
1530 		struct vm_cpuset vm_cpuset;
1531 		cpuset_t tempset;
1532 		void *srcp = &tempset;
1533 		int size;
1534 
1535 		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1536 			error = EFAULT;
1537 			break;
1538 		}
1539 
1540 		/* Be more generous about sizing since our cpuset_t is large. */
1541 		size = vm_cpuset.cpusetsize;
1542 		if (size <= 0 || size > sizeof (cpuset_t)) {
1543 			error = ERANGE;
1544 		}
1545 		/*
1546 		 * If they want a ulong_t or less, make sure they receive the
1547 		 * low bits with all the useful information.
1548 		 */
1549 		if (size <= sizeof (tempset.cpub[0])) {
1550 			srcp = &tempset.cpub[0];
1551 		}
1552 
1553 		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1554 			tempset = vm_active_cpus(sc->vmm_vm);
1555 		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1556 			tempset = vm_debug_cpus(sc->vmm_vm);
1557 		} else {
1558 			error = EINVAL;
1559 		}
1560 
1561 		ASSERT(size > 0 && size <= sizeof (tempset));
1562 		if (error == 0 &&
1563 		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1564 			error = EFAULT;
1565 			break;
1566 		}
1567 		break;
1568 	}
1569 	case VM_SET_INTINFO: {
1570 		struct vm_intinfo vmii;
1571 
1572 		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1573 			error = EFAULT;
1574 			break;
1575 		}
1576 		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1577 		break;
1578 	}
1579 	case VM_GET_INTINFO: {
1580 		struct vm_intinfo vmii;
1581 
1582 		vmii.vcpuid = vcpu;
1583 		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1584 		    &vmii.info2);
1585 		if (error == 0 &&
1586 		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1587 			error = EFAULT;
1588 			break;
1589 		}
1590 		break;
1591 	}
1592 	case VM_RTC_WRITE: {
1593 		struct vm_rtc_data rtcdata;
1594 
1595 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1596 			error = EFAULT;
1597 			break;
1598 		}
1599 		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1600 		    rtcdata.value);
1601 		break;
1602 	}
1603 	case VM_RTC_READ: {
1604 		struct vm_rtc_data rtcdata;
1605 
1606 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1607 			error = EFAULT;
1608 			break;
1609 		}
1610 		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1611 		    &rtcdata.value);
1612 		if (error == 0 &&
1613 		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1614 			error = EFAULT;
1615 			break;
1616 		}
1617 		break;
1618 	}
1619 	case VM_RTC_SETTIME: {
1620 		timespec_t ts;
1621 
1622 		if (ddi_copyin(datap, &ts, sizeof (ts), md)) {
1623 			error = EFAULT;
1624 			break;
1625 		}
1626 		error = vrtc_set_time(sc->vmm_vm, &ts);
1627 		break;
1628 	}
1629 	case VM_RTC_GETTIME: {
1630 		timespec_t ts;
1631 
1632 		vrtc_get_time(sc->vmm_vm, &ts);
1633 		if (ddi_copyout(&ts, datap, sizeof (ts), md)) {
1634 			error = EFAULT;
1635 			break;
1636 		}
1637 		break;
1638 	}
1639 
1640 	case VM_PMTMR_LOCATE: {
1641 		uint16_t port = arg;
1642 		error = vpmtmr_set_location(sc->vmm_vm, port);
1643 		break;
1644 	}
1645 
1646 	case VM_RESTART_INSTRUCTION:
1647 		error = vm_restart_instruction(sc->vmm_vm, vcpu);
1648 		break;
1649 
1650 	case VM_SET_TOPOLOGY: {
1651 		struct vm_cpu_topology topo;
1652 
1653 		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1654 			error = EFAULT;
1655 			break;
1656 		}
1657 		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1658 		    topo.threads, topo.maxcpus);
1659 		break;
1660 	}
1661 	case VM_GET_TOPOLOGY: {
1662 		struct vm_cpu_topology topo;
1663 
1664 		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1665 		    &topo.threads, &topo.maxcpus);
1666 		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1667 			error = EFAULT;
1668 			break;
1669 		}
1670 		break;
1671 	}
1672 	case VM_DEVMEM_GETOFFSET: {
1673 		struct vm_devmem_offset vdo;
1674 		vmm_devmem_entry_t *de;
1675 
1676 		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1677 			error = EFAULT;
1678 			break;
1679 		}
1680 
1681 		de = vmmdev_devmem_find(sc, vdo.segid);
1682 		if (de != NULL) {
1683 			vdo.offset = de->vde_off;
1684 			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1685 				error = EFAULT;
1686 			}
1687 		} else {
1688 			error = ENOENT;
1689 		}
1690 		break;
1691 	}
1692 	case VM_TRACK_DIRTY_PAGES: {
1693 		const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
1694 		struct vmm_dirty_tracker tracker;
1695 		uint8_t *bitmap;
1696 		size_t len;
1697 
1698 		if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
1699 			error = EFAULT;
1700 			break;
1701 		}
1702 		if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
1703 			error = EINVAL;
1704 			break;
1705 		}
1706 		if (tracker.vdt_len == 0) {
1707 			break;
1708 		}
1709 		if ((tracker.vdt_len & PAGEOFFSET) != 0) {
1710 			error = EINVAL;
1711 			break;
1712 		}
1713 		if (tracker.vdt_len > max_track_region_len) {
1714 			error = EINVAL;
1715 			break;
1716 		}
1717 		len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8;
1718 		bitmap = kmem_zalloc(len, KM_SLEEP);
1719 		error = vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa,
1720 		    tracker.vdt_len, bitmap);
1721 		if (error == 0 &&
1722 		    ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) {
1723 			error = EFAULT;
1724 		}
1725 		kmem_free(bitmap, len);
1726 
1727 		break;
1728 	}
1729 	case VM_NPT_OPERATION: {
1730 		struct vm_npt_operation vno;
1731 		uint8_t *bitmap = NULL;
1732 		uint64_t bitmap_size = 0;
1733 
1734 		if (ddi_copyin(datap, &vno, sizeof (vno), md) != 0) {
1735 			error = EFAULT;
1736 			break;
1737 		}
1738 		if ((vno.vno_gpa & PAGEOFFSET) != 0 ||
1739 		    (vno.vno_len & PAGEOFFSET) != 0) {
1740 			error = EINVAL;
1741 			break;
1742 		}
1743 		if ((UINT64_MAX - vno.vno_len) < vno.vno_gpa) {
1744 			error = EOVERFLOW;
1745 			break;
1746 		}
1747 
1748 		/*
1749 		 * Allocate a bitmap for the operation if it is specified as
1750 		 * part of the input or output.
1751 		 */
1752 		if ((vno.vno_operation &
1753 		    (VNO_FLAG_BITMAP_IN | VNO_FLAG_BITMAP_OUT)) != 0) {
1754 			/*
1755 			 * Operations expecting data to be copied in or out
1756 			 * should not have zero length.
1757 			 */
1758 			if (vno.vno_len == 0) {
1759 				error = EINVAL;
1760 				break;
1761 			}
1762 
1763 			/*
1764 			 * Maximum bitmap size of 8 pages results in 1 GiB of
1765 			 * coverage.
1766 			 */
1767 			const uint64_t max_bitmap_size = 8 * PAGESIZE;
1768 
1769 			bitmap_size = roundup(vno.vno_len / PAGESIZE, 8) / 8;
1770 			if (bitmap_size > max_bitmap_size) {
1771 				error = E2BIG;
1772 				break;
1773 			}
1774 			bitmap = kmem_zalloc(bitmap_size, KM_SLEEP);
1775 		}
1776 
1777 		if ((vno.vno_operation & VNO_FLAG_BITMAP_IN) != 0) {
1778 			ASSERT(bitmap != NULL);
1779 			if (ddi_copyin(vno.vno_bitmap, bitmap, bitmap_size,
1780 			    md) != 0) {
1781 				error = EFAULT;
1782 			}
1783 		}
1784 
1785 		if (error == 0) {
1786 			error = vm_npt_do_operation(sc->vmm_vm, vno.vno_gpa,
1787 			    vno.vno_len, vno.vno_operation, bitmap, rvalp);
1788 		}
1789 
1790 		if ((vno.vno_operation & VNO_FLAG_BITMAP_OUT) != 0 &&
1791 		    error == 0) {
1792 			ASSERT(bitmap != NULL);
1793 			if (ddi_copyout(bitmap, vno.vno_bitmap, bitmap_size,
1794 			    md) != 0) {
1795 				error = EFAULT;
1796 			}
1797 		}
1798 
1799 		if (bitmap != NULL) {
1800 			kmem_free(bitmap, bitmap_size);
1801 		}
1802 
1803 		break;
1804 	}
1805 	case VM_WRLOCK_CYCLE: {
1806 		/*
1807 		 * Present a test mechanism to acquire/release the write lock
1808 		 * on the VM without any other effects.
1809 		 */
1810 		break;
1811 	}
1812 	case VM_DATA_READ: {
1813 		struct vm_data_xfer vdx;
1814 
1815 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1816 			error = EFAULT;
1817 			break;
1818 		}
1819 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1820 			error = EINVAL;
1821 			break;
1822 		}
1823 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1824 			error = EFBIG;
1825 			break;
1826 		}
1827 
1828 		const size_t len = vdx.vdx_len;
1829 		void *buf = NULL;
1830 		if (len != 0) {
1831 			const void *udata = vdx.vdx_data;
1832 
1833 			buf = kmem_alloc(len, KM_SLEEP);
1834 			if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) == 0) {
1835 				bzero(buf, len);
1836 			} else if (ddi_copyin(udata, buf, len, md) != 0) {
1837 				kmem_free(buf, len);
1838 				error = EFAULT;
1839 				break;
1840 			}
1841 		}
1842 
1843 		vdx.vdx_result_len = 0;
1844 		vmm_data_req_t req = {
1845 			.vdr_class = vdx.vdx_class,
1846 			.vdr_version = vdx.vdx_version,
1847 			.vdr_flags = vdx.vdx_flags,
1848 			.vdr_len = len,
1849 			.vdr_data = buf,
1850 			.vdr_result_len = &vdx.vdx_result_len,
1851 			.vdr_vcpuid = vdx.vdx_vcpuid,
1852 		};
1853 		error = vmm_data_read(sc->vmm_vm, &req);
1854 
1855 		if (error == 0 && buf != NULL) {
1856 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1857 				error = EFAULT;
1858 			}
1859 		}
1860 
1861 		/*
1862 		 * Copy out the transfer request so that the value of
1863 		 * vdx_result_len can be made available, regardless of any
1864 		 * error(s) which may have occurred.
1865 		 */
1866 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1867 			error = (error != 0) ? error : EFAULT;
1868 		}
1869 
1870 		if (buf != NULL) {
1871 			kmem_free(buf, len);
1872 		}
1873 		break;
1874 	}
1875 	case VM_DATA_WRITE: {
1876 		struct vm_data_xfer vdx;
1877 
1878 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1879 			error = EFAULT;
1880 			break;
1881 		}
1882 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1883 			error = EINVAL;
1884 			break;
1885 		}
1886 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1887 			error = EFBIG;
1888 			break;
1889 		}
1890 
1891 		const size_t len = vdx.vdx_len;
1892 		void *buf = NULL;
1893 		if (len != 0) {
1894 			buf = kmem_alloc(len, KM_SLEEP);
1895 			if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
1896 				kmem_free(buf, len);
1897 				error = EFAULT;
1898 				break;
1899 			}
1900 		}
1901 
1902 		vdx.vdx_result_len = 0;
1903 		vmm_data_req_t req = {
1904 			.vdr_class = vdx.vdx_class,
1905 			.vdr_version = vdx.vdx_version,
1906 			.vdr_flags = vdx.vdx_flags,
1907 			.vdr_len = len,
1908 			.vdr_data = buf,
1909 			.vdr_result_len = &vdx.vdx_result_len,
1910 			.vdr_vcpuid = vdx.vdx_vcpuid,
1911 		};
1912 		if (vmm_allow_state_writes != 0) {
1913 			error = vmm_data_write(sc->vmm_vm, &req);
1914 		} else {
1915 			/*
1916 			 * Reject the write if someone has thrown the switch back
1917 			 * into the "disallow" position.
1918 			 */
1919 			error = EPERM;
1920 		}
1921 
1922 		if (error == 0 && buf != NULL &&
1923 		    (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
1924 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1925 				error = EFAULT;
1926 			}
1927 		}
1928 
1929 		/*
1930 		 * Copy out the transfer request so that the value of
1931 		 * vdx_result_len can be made available, regardless of any
1932 		 * error(s) which may have occurred.
1933 		 */
1934 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1935 			error = (error != 0) ? error : EFAULT;
1936 		}
1937 
1938 		if (buf != NULL) {
1939 			kmem_free(buf, len);
1940 		}
1941 		break;
1942 	}
1943 
1944 	case VM_PAUSE: {
1945 		error = vm_pause_instance(sc->vmm_vm);
1946 		break;
1947 	}
1948 	case VM_RESUME: {
1949 		error = vm_resume_instance(sc->vmm_vm);
1950 		break;
1951 	}
1952 
1953 	default:
1954 		error = ENOTTY;
1955 		break;
1956 	}
1957 
1958 	/* Release exclusion resources */
1959 	switch (lock_type) {
1960 	case LOCK_NONE:
1961 		break;
1962 	case LOCK_VCPU:
1963 		vcpu_unlock_one(sc, vcpu);
1964 		break;
1965 	case LOCK_READ_HOLD:
1966 		vmm_read_unlock(sc);
1967 		break;
1968 	case LOCK_WRITE_HOLD:
1969 		vmm_write_unlock(sc);
1970 		break;
1971 	default:
1972 		panic("unexpected lock type");
1973 		break;
1974 	}
1975 
1976 	return (error);
1977 }
1978 
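/*
 * Illustrative userspace sketch (not part of this driver): driving the
 * VM_DATA_READ ioctl against an instance fd.  Because the kernel copies the
 * vm_data_xfer struct back out even when the call fails, a caller can issue a
 * zero-length "probe" to learn the required buffer size from vdx_result_len
 * before retrying.  The class/version parameters, the probe-then-retry
 * pattern, and the userland headers below are assumptions for the example.
 */
#include <sys/types.h>
#include <sys/vmm_dev.h>
#include <stdlib.h>
#include <unistd.h>

static int
example_vmm_data_read(int vmfd, uint16_t class, uint16_t version, int vcpuid,
    void **bufp, uint32_t *lenp)
{
	struct vm_data_xfer vdx = {
		.vdx_class = class,
		.vdx_version = version,
		.vdx_vcpuid = vcpuid,
		.vdx_len = 0,		/* probe for the required size */
	};

	/* Expected to fail; vdx_result_len is still made available */
	(void) ioctl(vmfd, VM_DATA_READ, &vdx);
	if (vdx.vdx_result_len == 0 ||
	    vdx.vdx_result_len > VM_DATA_XFER_LIMIT) {
		return (-1);
	}

	void *buf = malloc(vdx.vdx_result_len);
	if (buf == NULL)
		return (-1);

	vdx.vdx_data = buf;
	vdx.vdx_len = vdx.vdx_result_len;
	if (ioctl(vmfd, VM_DATA_READ, &vdx) != 0) {
		free(buf);
		return (-1);
	}

	*bufp = buf;
	*lenp = vdx.vdx_result_len;
	return (0);
}
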
1979 static vmm_softc_t *
1980 vmm_lookup(const char *name)
1981 {
1982 	list_t *vml = &vmm_list;
1983 	vmm_softc_t *sc;
1984 
1985 	ASSERT(MUTEX_HELD(&vmm_mtx));
1986 
1987 	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1988 		if (strcmp(sc->vmm_name, name) == 0) {
1989 			break;
1990 		}
1991 	}
1992 
1993 	return (sc);
1994 }
1995 
1996 /*
1997  * Acquire an HMA registration if not already held.
1998  */
1999 static boolean_t
2000 vmm_hma_acquire(void)
2001 {
2002 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
2003 
2004 	mutex_enter(&vmmdev_mtx);
2005 
2006 	if (vmmdev_hma_reg == NULL) {
2007 		VERIFY3U(vmmdev_hma_ref, ==, 0);
2008 		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
2009 		if (vmmdev_hma_reg == NULL) {
2010 			cmn_err(CE_WARN, "%s HMA registration failed.",
2011 			    vmmdev_hvm_name);
2012 			mutex_exit(&vmmdev_mtx);
2013 			return (B_FALSE);
2014 		}
2015 	}
2016 
2017 	vmmdev_hma_ref++;
2018 
2019 	mutex_exit(&vmmdev_mtx);
2020 
2021 	return (B_TRUE);
2022 }
2023 
2024 /*
2025  * Release the HMA registration if held and there are no remaining VMs.
2026  */
2027 static void
2028 vmm_hma_release(void)
2029 {
2030 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
2031 
2032 	mutex_enter(&vmmdev_mtx);
2033 
2034 	VERIFY3U(vmmdev_hma_ref, !=, 0);
2035 
2036 	vmmdev_hma_ref--;
2037 
2038 	if (vmmdev_hma_ref == 0) {
2039 		VERIFY(vmmdev_hma_reg != NULL);
2040 		hma_unregister(vmmdev_hma_reg);
2041 		vmmdev_hma_reg = NULL;
2042 	}
2043 	mutex_exit(&vmmdev_mtx);
2044 }
2045 
2046 static int
2047 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
2048 {
2049 	vmm_softc_t	*sc = NULL;
2050 	minor_t		minor;
2051 	int		error = ENOMEM;
2052 	size_t		len;
2053 	const char	*name = req->name;
2054 
2055 	len = strnlen(name, VM_MAX_NAMELEN);
2056 	if (len == 0) {
2057 		return (EINVAL);
2058 	}
2059 	if (len >= VM_MAX_NAMELEN) {
2060 		return (ENAMETOOLONG);
2061 	}
2062 	if (strchr(name, '/') != NULL) {
2063 		return (EINVAL);
2064 	}
2065 
2066 	if (!vmm_hma_acquire())
2067 		return (ENXIO);
2068 
2069 	mutex_enter(&vmm_mtx);
2070 
2071 	/* Look for duplicate names */
2072 	if (vmm_lookup(name) != NULL) {
2073 		mutex_exit(&vmm_mtx);
2074 		vmm_hma_release();
2075 		return (EEXIST);
2076 	}
2077 
2078 	/* Allow only one instance per non-global zone. */
2079 	if (!INGLOBALZONE(curproc)) {
2080 		for (sc = list_head(&vmm_list); sc != NULL;
2081 		    sc = list_next(&vmm_list, sc)) {
2082 			if (sc->vmm_zone == curzone) {
2083 				mutex_exit(&vmm_mtx);
2084 				vmm_hma_release();
2085 				return (EINVAL);
2086 			}
2087 		}
2088 	}
2089 
2090 	minor = id_alloc(vmm_minors);
2091 	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
2092 		goto fail;
2093 	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2094 		ddi_soft_state_free(vmm_statep, minor);
2095 		goto fail;
2096 	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
2097 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
2098 		goto fail;
2099 	}
2100 
2101 	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
2102 		goto fail;
2103 	}
2104 
2105 	error = vm_create(req->flags, &sc->vmm_vm);
2106 	if (error == 0) {
2107 		/* Complete VM initialization and report success. */
2108 		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
2109 		sc->vmm_minor = minor;
2110 		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
2111 		    offsetof(vmm_devmem_entry_t, vde_node));
2112 
2113 		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
2114 		    offsetof(vmm_hold_t, vmh_node));
2115 		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
2116 
2117 		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
2118 		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
2119 		    offsetof(vmm_lease_t, vml_node));
2120 		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
2121 		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
2122 
2123 		sc->vmm_zone = crgetzone(cr);
2124 		zone_hold(sc->vmm_zone);
2125 		vmm_zsd_add_vm(sc);
2126 		vmm_kstat_init(sc);
2127 
2128 		list_insert_tail(&vmm_list, sc);
2129 		mutex_exit(&vmm_mtx);
2130 		return (0);
2131 	}
2132 
2133 	vmm_kstat_fini(sc);
2134 	ddi_remove_minor_node(vmmdev_dip, name);
2135 fail:
2136 	id_free(vmm_minors, minor);
2137 	if (sc != NULL) {
2138 		ddi_soft_state_free(vmm_statep, minor);
2139 	}
2140 	mutex_exit(&vmm_mtx);
2141 	vmm_hma_release();
2142 
2143 	return (error);
2144 }
2145 
2146 /*
2147  * Bhyve 'Driver' Interface
2148  *
2149  * While many devices are emulated in the bhyve userspace process, there are
2150  * others with performance constraints which require that they run mostly or
2151  * entirely in-kernel.  For those not integrated directly into bhyve, an API is
2152  * needed so they can query/manipulate the portions of VM state needed to
2153  * fulfill their purpose.
2154  *
2155  * This includes:
2156  * - Translating guest-physical addresses to host-virtual pointers
2157  * - Injecting MSIs
2158  * - Hooking IO port addresses
2159  *
2160  * The vmm_drv interface exists to provide that functionality to its consumers.
2161  * (At this time, 'viona' is the only user)
2162  */
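/*
 * A minimal, hypothetical consumer-side sketch of the flow described above:
 * acquire a hold on an open instance fd, sign a lease, touch a guest page,
 * inject an MSI, then tear everything down.  The wrapper function, its
 * arguments, the expire callback, and the GPA/MSI values are illustrative
 * assumptions and are not part of this file or of viona.
 */
static boolean_t
example_lease_expired(void *arg)
{
	/*
	 * Returning B_FALSE indicates the consumer will break the lease
	 * itself (asynchronously) rather than having it broken on its behalf.
	 */
	return (B_FALSE);
}

static int
example_use_vm(file_t *fp, cred_t *cr)
{
	vmm_hold_t *hold = NULL;
	vmm_lease_t *lease;
	vmm_page_t *page;
	int err;

	if ((err = vmm_drv_hold(fp, cr, &hold)) != 0)
		return (err);

	lease = vmm_drv_lease_sign(hold, example_lease_expired, NULL);
	if (lease == NULL) {
		/* The hold has been marked for release; back off. */
		vmm_drv_rele(hold);
		return (EBUSY);
	}

	/* Hold a (page-aligned) guest-physical page and write to it */
	page = vmm_drv_page_hold(lease, 0x1000, PROT_READ | PROT_WRITE);
	if (page != NULL) {
		char *va = vmm_drv_page_writable(page);
		va[0] = 0;
		vmm_drv_page_release(page);
	}

	/* Inject an MSI at an illustrative address/data pair */
	(void) vmm_drv_msi(lease, 0xfee00000UL, 0);

	vmm_drv_lease_break(hold, lease);
	vmm_drv_rele(hold);
	return (0);
}
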
2163 int
2164 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
2165 {
2166 	vnode_t *vp = fp->f_vnode;
2167 	const dev_t dev = vp->v_rdev;
2168 	vmm_softc_t *sc;
2169 	vmm_hold_t *hold;
2170 	int err = 0;
2171 
2172 	if (vp->v_type != VCHR) {
2173 		return (ENXIO);
2174 	}
2175 	const major_t major = getmajor(dev);
2176 	const minor_t minor = getminor(dev);
2177 
2178 	mutex_enter(&vmmdev_mtx);
2179 	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
2180 		mutex_exit(&vmmdev_mtx);
2181 		return (ENOENT);
2182 	}
2183 	mutex_enter(&vmm_mtx);
2184 	mutex_exit(&vmmdev_mtx);
2185 
2186 	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2187 		err = ENOENT;
2188 		goto out;
2189 	}
2190 	/* XXXJOY: check cred permissions against instance */
2191 
2192 	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2193 		err = EBUSY;
2194 		goto out;
2195 	}
2196 
2197 	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
2198 	hold->vmh_sc = sc;
2199 	hold->vmh_release_req = B_FALSE;
2200 
2201 	list_insert_tail(&sc->vmm_holds, hold);
2202 	sc->vmm_flags |= VMM_HELD;
2203 	*holdp = hold;
2204 
2205 out:
2206 	mutex_exit(&vmm_mtx);
2207 	return (err);
2208 }
2209 
2210 void
2211 vmm_drv_rele(vmm_hold_t *hold)
2212 {
2213 	vmm_softc_t *sc;
2214 	bool hma_release = false;
2215 
2216 	ASSERT(hold != NULL);
2217 	ASSERT(hold->vmh_sc != NULL);
2218 	VERIFY(hold->vmh_ioport_hook_cnt == 0);
2219 
2220 	mutex_enter(&vmm_mtx);
2221 	sc = hold->vmh_sc;
2222 	list_remove(&sc->vmm_holds, hold);
2223 	kmem_free(hold, sizeof (*hold));
2224 
2225 	if (list_is_empty(&sc->vmm_holds)) {
2226 		sc->vmm_flags &= ~VMM_HELD;
2227 
2228 		/*
2229 		 * Since outstanding holds would prevent instance destruction
2230 		 * from completing, attempt to finish it now if it was already
2231 		 * set in motion.
2232 		 */
2233 		if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2234 			VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT,
2235 			    &hma_release));
2236 		}
2237 	}
2238 	mutex_exit(&vmm_mtx);
2239 
2240 	if (hma_release) {
2241 		vmm_hma_release();
2242 	}
2243 }
2244 
2245 boolean_t
2246 vmm_drv_release_reqd(vmm_hold_t *hold)
2247 {
2248 	ASSERT(hold != NULL);
2249 
2250 	return (hold->vmh_release_req);
2251 }
2252 
2253 vmm_lease_t *
2254 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
2255 {
2256 	vmm_softc_t *sc = hold->vmh_sc;
2257 	vmm_lease_t *lease;
2258 
2259 	ASSERT3P(expiref, !=, NULL);
2260 
2261 	if (hold->vmh_release_req) {
2262 		return (NULL);
2263 	}
2264 
2265 	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
2266 	list_link_init(&lease->vml_node);
2267 	lease->vml_expire_func = expiref;
2268 	lease->vml_expire_arg = arg;
2269 	lease->vml_expired = B_FALSE;
2270 	lease->vml_break_deferred = B_FALSE;
2271 	lease->vml_hold = hold;
2272 	/* cache the VM pointer for one less pointer chase */
2273 	lease->vml_vm = sc->vmm_vm;
2274 	lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
2275 
2276 	mutex_enter(&sc->vmm_lease_lock);
2277 	while (sc->vmm_lease_blocker != 0) {
2278 		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2279 	}
2280 	list_insert_tail(&sc->vmm_lease_list, lease);
2281 	vmm_read_lock(sc);
2282 	mutex_exit(&sc->vmm_lease_lock);
2283 
2284 	return (lease);
2285 }
2286 
2287 static void
2288 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
2289 {
2290 	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
2291 
2292 	list_remove(&sc->vmm_lease_list, lease);
2293 	vmm_read_unlock(sc);
2294 	vmc_destroy(lease->vml_vmclient);
2295 	kmem_free(lease, sizeof (*lease));
2296 }
2297 
2298 static void
2299 vmm_lease_block(vmm_softc_t *sc)
2300 {
2301 	mutex_enter(&sc->vmm_lease_lock);
2302 	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
2303 	sc->vmm_lease_blocker++;
2304 	if (sc->vmm_lease_blocker == 1) {
2305 		list_t *list = &sc->vmm_lease_list;
2306 		vmm_lease_t *lease = list_head(list);
2307 
2308 		while (lease != NULL) {
2309 			void *arg = lease->vml_expire_arg;
2310 			boolean_t (*expiref)(void *) = lease->vml_expire_func;
2311 			boolean_t sync_break = B_FALSE;
2312 
2313 			/*
2314 			 * Since the lease expiration notification may
2315 			 * need to take locks which would deadlock with
2316 			 * vmm_lease_lock, drop it across the call.
2317 			 *
2318 			 * We are the only one allowed to manipulate
2319 			 * vmm_lease_list right now, so it is safe to
2320 			 * continue iterating through it after
2321 			 * reacquiring the lock.
2322 			 */
2323 			lease->vml_expired = B_TRUE;
2324 			mutex_exit(&sc->vmm_lease_lock);
2325 			sync_break = expiref(arg);
2326 			mutex_enter(&sc->vmm_lease_lock);
2327 
2328 			if (sync_break) {
2329 				vmm_lease_t *next;
2330 
2331 				/*
2332 				 * These leases which are synchronously broken
2333 				 * result in vmm_read_unlock() calls from a
2334 				 * different thread than the corresponding
2335 				 * vmm_read_lock().  This is acceptable, given
2336 				 * that the rwlock underpinning the whole
2337 				 * mechanism tolerates the behavior.  This
2338 				 * flexibility is _only_ afforded to VM read
2339 				 * lock (RW_READER) holders.
2340 				 */
2341 				next = list_next(list, lease);
2342 				vmm_lease_break_locked(sc, lease);
2343 				lease = next;
2344 			} else {
2345 				lease = list_next(list, lease);
2346 			}
2347 		}
2348 
2349 		/* Process leases which were not broken synchronously. */
2350 		while (!list_is_empty(list)) {
2351 			/*
2352 			 * Although the nested loops are quadratic, the number
2353 			 * of leases is small.
2354 			 */
2355 			lease = list_head(list);
2356 			while (lease != NULL) {
2357 				vmm_lease_t *next = list_next(list, lease);
2358 				if (lease->vml_break_deferred) {
2359 					vmm_lease_break_locked(sc, lease);
2360 				}
2361 				lease = next;
2362 			}
2363 			if (list_is_empty(list)) {
2364 				break;
2365 			}
2366 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2367 		}
2368 		/* Wake anyone else waiting for the lease list to be empty  */
2369 		cv_broadcast(&sc->vmm_lease_cv);
2370 	} else {
2371 		list_t *list = &sc->vmm_lease_list;
2372 
2373 		/*
2374 		 * Some other thread beat us to the duty of lease cleanup.
2375 		 * Wait until that is complete.
2376 		 */
2377 		while (!list_is_empty(list)) {
2378 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2379 		}
2380 	}
2381 	mutex_exit(&sc->vmm_lease_lock);
2382 }
2383 
2384 static void
2385 vmm_lease_unblock(vmm_softc_t *sc)
2386 {
2387 	mutex_enter(&sc->vmm_lease_lock);
2388 	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
2389 	sc->vmm_lease_blocker--;
2390 	if (sc->vmm_lease_blocker == 0) {
2391 		cv_broadcast(&sc->vmm_lease_cv);
2392 	}
2393 	mutex_exit(&sc->vmm_lease_lock);
2394 }
2395 
2396 void
2397 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
2398 {
2399 	vmm_softc_t *sc = hold->vmh_sc;
2400 
2401 	VERIFY3P(hold, ==, lease->vml_hold);
2402 	VERIFY(!lease->vml_break_deferred);
2403 
2404 	mutex_enter(&sc->vmm_lease_lock);
2405 	if (sc->vmm_lease_blocker == 0) {
2406 		vmm_lease_break_locked(sc, lease);
2407 	} else {
2408 		/*
2409 		 * Defer the lease-breaking to whichever thread is currently
2410 		 * cleaning up all leases as part of a vmm_lease_block() call.
2411 		 */
2412 		lease->vml_break_deferred = B_TRUE;
2413 		cv_broadcast(&sc->vmm_lease_cv);
2414 	}
2415 	mutex_exit(&sc->vmm_lease_lock);
2416 }
2417 
2418 boolean_t
2419 vmm_drv_lease_expired(vmm_lease_t *lease)
2420 {
2421 	return (lease->vml_expired);
2422 }
2423 
2424 vmm_page_t *
2425 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
2426 {
2427 	ASSERT(lease != NULL);
2428 	ASSERT0(gpa & PAGEOFFSET);
2429 
2430 	return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
2431 }
2432 
2433 
2434 /* Ensure that flags mirrored by vmm_drv interface properly match up */
2435 CTASSERT(VMPF_DEFER_DIRTY == VPF_DEFER_DIRTY);
2436 
2437 vmm_page_t *
2438 vmm_drv_page_hold_ext(vmm_lease_t *lease, uintptr_t gpa, int prot, int flags)
2439 {
2440 	ASSERT(lease != NULL);
2441 	ASSERT0(gpa & PAGEOFFSET);
2442 
2443 	vmm_page_t *page =
2444 	    (vmm_page_t *)vmc_hold_ext(lease->vml_vmclient, gpa, prot, flags);
2445 	return (page);
2446 }
2447 
2448 void
2449 vmm_drv_page_release(vmm_page_t *vmmp)
2450 {
2451 	(void) vmp_release((vm_page_t *)vmmp);
2452 }
2453 
2454 void
2455 vmm_drv_page_release_chain(vmm_page_t *vmmp)
2456 {
2457 	(void) vmp_release_chain((vm_page_t *)vmmp);
2458 }
2459 
2460 const void *
2461 vmm_drv_page_readable(const vmm_page_t *vmmp)
2462 {
2463 	return (vmp_get_readable((const vm_page_t *)vmmp));
2464 }
2465 
2466 void *
2467 vmm_drv_page_writable(const vmm_page_t *vmmp)
2468 {
2469 	return (vmp_get_writable((const vm_page_t *)vmmp));
2470 }
2471 
2472 void
2473 vmm_drv_page_mark_dirty(vmm_page_t *vmmp)
2474 {
2475 	return (vmp_mark_dirty((vm_page_t *)vmmp));
2476 }
2477 
2478 void
2479 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
2480 {
2481 	vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
2482 }
2483 
2484 vmm_page_t *
2485 vmm_drv_page_next(const vmm_page_t *vmmp)
2486 {
2487 	return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
2488 }
2489 
2490 int
2491 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
2492 {
2493 	ASSERT(lease != NULL);
2494 
2495 	return (lapic_intr_msi(lease->vml_vm, addr, msg));
2496 }
2497 
2498 int
2499 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
2500     void *arg, void **cookie)
2501 {
2502 	vmm_softc_t *sc;
2503 	int err;
2504 
2505 	ASSERT(hold != NULL);
2506 	ASSERT(cookie != NULL);
2507 
2508 	sc = hold->vmh_sc;
2509 	mutex_enter(&vmm_mtx);
2510 	/* Confirm that hook installation is not blocked */
2511 	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2512 		mutex_exit(&vmm_mtx);
2513 		return (EBUSY);
2514 	}
2515 	/*
2516 	 * Optimistically record an installed hook which will prevent a block
2517 	 * from being asserted while the mutex is dropped.
2518 	 */
2519 	hold->vmh_ioport_hook_cnt++;
2520 	mutex_exit(&vmm_mtx);
2521 
2522 	vmm_write_lock(sc);
2523 	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
2524 	    arg, cookie);
2525 	vmm_write_unlock(sc);
2526 
2527 	if (err != 0) {
2528 		mutex_enter(&vmm_mtx);
2529 		/* Walk back optimism about the hook installation */
2530 		hold->vmh_ioport_hook_cnt--;
2531 		mutex_exit(&vmm_mtx);
2532 	}
2533 	return (err);
2534 }
2535 
2536 void
2537 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
2538 {
2539 	vmm_softc_t *sc;
2540 
2541 	ASSERT(hold != NULL);
2542 	ASSERT(cookie != NULL);
2543 	ASSERT(hold->vmh_ioport_hook_cnt != 0);
2544 
2545 	sc = hold->vmh_sc;
2546 	vmm_write_lock(sc);
2547 	vm_ioport_unhook(sc->vmm_vm, cookie);
2548 	vmm_write_unlock(sc);
2549 
2550 	mutex_enter(&vmm_mtx);
2551 	hold->vmh_ioport_hook_cnt--;
2552 	mutex_exit(&vmm_mtx);
2553 }
2554 
2555 static void
2556 vmm_drv_purge(vmm_softc_t *sc)
2557 {
2558 	ASSERT(MUTEX_HELD(&vmm_mtx));
2559 
2560 	if ((sc->vmm_flags & VMM_HELD) != 0) {
2561 		vmm_hold_t *hold;
2562 
2563 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2564 		    hold = list_next(&sc->vmm_holds, hold)) {
2565 			hold->vmh_release_req = B_TRUE;
2566 		}
2567 
2568 		/*
2569 		 * Require that all leases on the instance be broken, now that
2570 		 * all associated holds have been marked as needing release.
2571 		 *
2572 		 * Dropping vmm_mtx is not strictly necessary, but if any of the
2573 		 * lessees are slow to respond, it would be nice to leave it
2574 		 * available for other parties.
2575 		 */
2576 		mutex_exit(&vmm_mtx);
2577 		vmm_lease_block(sc);
2578 		vmm_lease_unblock(sc);
2579 		mutex_enter(&vmm_mtx);
2580 	}
2581 }
2582 
2583 static int
2584 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
2585 {
2586 	int err = 0;
2587 
2588 	mutex_enter(&vmm_mtx);
2589 	if (!enable_block) {
2590 		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
2591 
2592 		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
2593 		goto done;
2594 	}
2595 
2596 	/* If any holds have hooks installed, the block is a failure */
2597 	if (!list_is_empty(&sc->vmm_holds)) {
2598 		vmm_hold_t *hold;
2599 
2600 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2601 		    hold = list_next(&sc->vmm_holds, hold)) {
2602 			if (hold->vmh_ioport_hook_cnt != 0) {
2603 				err = EBUSY;
2604 				goto done;
2605 			}
2606 		}
2607 	}
2608 	sc->vmm_flags |= VMM_BLOCK_HOOK;
2609 
2610 done:
2611 	mutex_exit(&vmm_mtx);
2612 	return (err);
2613 }
2614 
2615 
2616 static void
2617 vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts)
2618 {
2619 	ASSERT(MUTEX_HELD(&vmm_mtx));
2620 	ASSERT0(sc->vmm_flags & VMM_DESTROY);
2621 
2622 	sc->vmm_flags |= VMM_DESTROY;
2623 
2624 	/*
2625 	 * Lock and unlock all of the vCPUs to ensure that they are kicked out
2626 	 * of guest context, being unable to return now that the instance is
2627 	 * marked for destruction.
2628 	 */
2629 	const int maxcpus = vm_get_maxcpus(sc->vmm_vm);
2630 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
2631 		vcpu_lock_one(sc, vcpu);
2632 		vcpu_unlock_one(sc, vcpu);
2633 	}
2634 
2635 	vmmdev_devmem_purge(sc);
2636 	if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
2637 		/*
2638 		 * The ZSD should be cleaned up now, unless destruction of the
2639 		 * instance was initiated by destruction of the containing zone,
2640 		 * in which case the ZSD has already been removed.
2641 		 */
2642 		vmm_zsd_rem_vm(sc);
2643 	}
2644 	zone_rele(sc->vmm_zone);
2645 
2646 	vmm_drv_purge(sc);
2647 }
2648 
2649 static bool
2650 vmm_destroy_ready(vmm_softc_t *sc)
2651 {
2652 	ASSERT(MUTEX_HELD(&vmm_mtx));
2653 
2654 	if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) {
2655 		VERIFY(list_is_empty(&sc->vmm_holds));
2656 		return (true);
2657 	}
2658 
2659 	return (false);
2660 }
2661 
2662 static void
2663 vmm_destroy_finish(vmm_softc_t *sc)
2664 {
2665 	ASSERT(MUTEX_HELD(&vmm_mtx));
2666 	ASSERT(vmm_destroy_ready(sc));
2667 
2668 	list_remove(&vmm_list, sc);
2669 	vmm_kstat_fini(sc);
2670 	vm_destroy(sc->vmm_vm);
2671 	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2672 	(void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE);
2673 
2674 	const minor_t minor = sc->vmm_minor;
2675 	ddi_soft_state_free(vmm_statep, minor);
2676 	id_free(vmm_minors, minor);
2677 }
2678 
2679 /*
2680  * Initiate or attempt to finish destruction of a VMM instance.
2681  *
2682  * This is called from several contexts:
2683  * - An explicit destroy ioctl is made
2684  * - A vmm_drv consumer releases its hold (being the last on the instance)
2685  * - The vmm device is closed, and auto-destruct is enabled
2686  */
2687 static int
2688 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
2689     bool *hma_release)
2690 {
2691 	ASSERT(MUTEX_HELD(&vmm_mtx));
2692 
2693 	*hma_release = false;
2694 
2695 	/*
2696 	 * When instance destruction begins, the instance is marked such that
2697 	 * any further requests to operate on it will fail.
2698 	 */
2699 	if ((sc->vmm_flags & VMM_DESTROY) == 0) {
2700 		vmm_destroy_begin(sc, opts);
2701 	}
2702 
2703 	if (vmm_destroy_ready(sc)) {
2704 
2705 		/*
2706 		 * Notify anyone waiting for the destruction to finish.  They
2707 		 * must be clear before we can safely tear down the softc.
2708 		 */
2709 		if (sc->vmm_destroy_waiters != 0) {
2710 			cv_broadcast(&sc->vmm_cv);
2711 			while (sc->vmm_destroy_waiters != 0) {
2712 				cv_wait(&sc->vmm_cv, &vmm_mtx);
2713 			}
2714 		}
2715 
2716 		/*
2717 		 * Finish destruction of instance.  After this point, the softc
2718 		 * is freed and cannot be accessed again.
2719 		 *
2720 		 * With destruction complete, the HMA hold can be released
2721 		 */
2722 		vmm_destroy_finish(sc);
2723 		*hma_release = true;
2724 		return (0);
2725 	} else if ((opts & VDO_ATTEMPT_WAIT) != 0) {
2726 		int err = 0;
2727 
2728 		sc->vmm_destroy_waiters++;
2729 		while (!vmm_destroy_ready(sc) && err == 0) {
2730 			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
2731 				err = EINTR;
2732 			}
2733 		}
2734 		sc->vmm_destroy_waiters--;
2735 
2736 		if (sc->vmm_destroy_waiters == 0) {
2737 			/*
2738 			 * If we were the last waiter, it could be that VM
2739 			 * destruction is waiting on _us_ to proceed with the
2740 			 * final clean-up.
2741 			 */
2742 			cv_signal(&sc->vmm_cv);
2743 		}
2744 		return (err);
2745 	} else {
2746 		/*
2747 		 * Since the instance is not ready for destruction, and the
2748 		 * caller did not ask to wait, consider it a success for now.
2749 		 */
2750 		return (0);
2751 	}
2752 }
2753 
2754 void
2755 vmm_zone_vm_destroy(vmm_softc_t *sc)
2756 {
2757 	bool hma_release = false;
2758 	int err;
2759 
2760 	mutex_enter(&vmm_mtx);
2761 	err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
2762 	mutex_exit(&vmm_mtx);
2763 
2764 	VERIFY0(err);
2765 
2766 	if (hma_release) {
2767 		vmm_hma_release();
2768 	}
2769 }
2770 
2771 static int
2772 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2773 {
2774 	vmm_softc_t *sc;
2775 	bool hma_release = false;
2776 	int err;
2777 
2778 	if (crgetuid(cr) != 0) {
2779 		return (EPERM);
2780 	}
2781 
2782 	mutex_enter(&vmm_mtx);
2783 	sc = vmm_lookup(req->name);
2784 	if (sc == NULL) {
2785 		mutex_exit(&vmm_mtx);
2786 		return (ENOENT);
2787 	}
2788 	/*
2789 	 * We don't check this in vmm_lookup() since that function is also used
2790 	 * for validation during create and currently vmm names must be unique.
2791 	 */
2792 	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2793 		mutex_exit(&vmm_mtx);
2794 		return (EPERM);
2795 	}
2796 
2797 	err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release);
2798 	mutex_exit(&vmm_mtx);
2799 
2800 	if (hma_release) {
2801 		vmm_hma_release();
2802 	}
2803 
2804 	return (err);
2805 }
2806 
2807 #define	VCPU_NAME_BUFLEN	32
2808 
2809 static int
2810 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2811 {
2812 	zoneid_t zid = crgetzoneid(cr);
2813 	int instance = minor;
2814 	kstat_t *ksp;
2815 
2816 	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2817 
2818 	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2819 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2820 	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2821 
2822 	if (ksp == NULL) {
2823 		return (-1);
2824 	}
2825 	sc->vmm_kstat_vm = ksp;
2826 
2827 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2828 		char namebuf[VCPU_NAME_BUFLEN];
2829 
2830 		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2831 
2832 		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2833 		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2834 		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2835 		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2836 		    0, zid);
2837 		if (ksp == NULL) {
2838 			goto fail;
2839 		}
2840 
2841 		sc->vmm_kstat_vcpu[i] = ksp;
2842 	}
2843 
2844 	/*
2845 	 * If this instance is associated with a non-global zone, make its
2846 	 * kstats visible from the GZ.
2847 	 */
2848 	if (zid != GLOBAL_ZONEID) {
2849 		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2850 		for (uint_t i = 0; i < VM_MAXCPU; i++) {
2851 			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2852 		}
2853 	}
2854 
2855 	return (0);
2856 
2857 fail:
2858 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2859 		if (sc->vmm_kstat_vcpu[i] != NULL) {
2860 			kstat_delete(sc->vmm_kstat_vcpu[i]);
2861 			sc->vmm_kstat_vcpu[i] = NULL;
2862 		} else {
2863 			break;
2864 		}
2865 	}
2866 	kstat_delete(sc->vmm_kstat_vm);
2867 	sc->vmm_kstat_vm = NULL;
2868 	return (-1);
2869 }
2870 
2871 static void
2872 vmm_kstat_init(vmm_softc_t *sc)
2873 {
2874 	kstat_t *ksp;
2875 
2876 	ASSERT3P(sc->vmm_vm, !=, NULL);
2877 	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2878 
2879 	ksp = sc->vmm_kstat_vm;
2880 	vmm_kstats_t *vk = ksp->ks_data;
2881 	ksp->ks_private = sc->vmm_vm;
2882 	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2883 	kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2884 
2885 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2886 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2887 
2888 		ksp = sc->vmm_kstat_vcpu[i];
2889 		vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2890 
2891 		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2892 		vvk->vvk_vcpu.value.ui32 = i;
2893 		kstat_named_init(&vvk->vvk_time_init, "time_init",
2894 		    KSTAT_DATA_UINT64);
2895 		kstat_named_init(&vvk->vvk_time_run, "time_run",
2896 		    KSTAT_DATA_UINT64);
2897 		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2898 		    KSTAT_DATA_UINT64);
2899 		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2900 		    KSTAT_DATA_UINT64);
2901 		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2902 		    KSTAT_DATA_UINT64);
2903 		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2904 		    KSTAT_DATA_UINT64);
2905 		ksp->ks_private = sc->vmm_vm;
2906 		ksp->ks_update = vmm_kstat_update_vcpu;
2907 	}
2908 
2909 	kstat_install(sc->vmm_kstat_vm);
2910 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2911 		kstat_install(sc->vmm_kstat_vcpu[i]);
2912 	}
2913 }
2914 
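/*
 * Illustrative userspace sketch (not part of this driver): reading the
 * per-vCPU named kstats published above with libkstat.  The "vmm" module
 * name (assumed to be the value of VMM_MODULE_NAME) is an assumption here;
 * the instance number is the VM's minor number, as set up in
 * vmm_kstat_alloc().
 */
#include <kstat.h>

static int
example_read_time_run(int instance, uint64_t *valp)
{
	kstat_ctl_t *kc = kstat_open();
	if (kc == NULL)
		return (-1);

	kstat_t *ksp = kstat_lookup(kc, "vmm", instance, "vcpu0");
	if (ksp == NULL || kstat_read(kc, ksp, NULL) == -1) {
		(void) kstat_close(kc);
		return (-1);
	}

	kstat_named_t *kn = kstat_data_lookup(ksp, "time_run");
	if (kn == NULL) {
		(void) kstat_close(kc);
		return (-1);
	}

	*valp = kn->value.ui64;
	(void) kstat_close(kc);
	return (0);
}
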
2915 static void
2916 vmm_kstat_fini(vmm_softc_t *sc)
2917 {
2918 	ASSERT(sc->vmm_kstat_vm != NULL);
2919 
2920 	kstat_delete(sc->vmm_kstat_vm);
2921 	sc->vmm_kstat_vm = NULL;
2922 
2923 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2924 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2925 
2926 		kstat_delete(sc->vmm_kstat_vcpu[i]);
2927 		sc->vmm_kstat_vcpu[i] = NULL;
2928 	}
2929 }
2930 
2931 static int
2932 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2933 {
2934 	minor_t		minor;
2935 	vmm_softc_t	*sc;
2936 
2937 	/*
2938 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2939 	 * verified to be safe.
2940 	 */
2941 	if (curproc->p_model != DATAMODEL_LP64) {
2942 		return (EFBIG);
2943 	}
2944 
2945 	minor = getminor(*devp);
2946 	if (minor == VMM_CTL_MINOR) {
2947 		/*
2948 		 * Master control device must be opened exclusively.
2949 		 */
2950 		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2951 			return (EINVAL);
2952 		}
2953 
2954 		return (0);
2955 	}
2956 
2957 	mutex_enter(&vmm_mtx);
2958 	sc = ddi_get_soft_state(vmm_statep, minor);
2959 	if (sc == NULL) {
2960 		mutex_exit(&vmm_mtx);
2961 		return (ENXIO);
2962 	}
2963 
2964 	sc->vmm_flags |= VMM_IS_OPEN;
2965 	mutex_exit(&vmm_mtx);
2966 
2967 	return (0);
2968 }
2969 
2970 static int
2971 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2972 {
2973 	const minor_t minor = getminor(dev);
2974 	vmm_softc_t *sc;
2975 	bool hma_release = false;
2976 
2977 	if (minor == VMM_CTL_MINOR) {
2978 		return (0);
2979 	}
2980 
2981 	mutex_enter(&vmm_mtx);
2982 	sc = ddi_get_soft_state(vmm_statep, minor);
2983 	if (sc == NULL) {
2984 		mutex_exit(&vmm_mtx);
2985 		return (ENXIO);
2986 	}
2987 
2988 	VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0);
2989 	sc->vmm_flags &= ~VMM_IS_OPEN;
2990 
2991 	/*
2992 	 * If the instance was marked for auto-destruction, begin that now.  Instance
2993 	 * destruction may have been initiated already, so try to make progress
2994 	 * in that case, since closure of the device is one of its requirements.
2995 	 */
2996 	if ((sc->vmm_flags & VMM_DESTROY) != 0 ||
2997 	    (sc->vmm_flags & VMM_AUTODESTROY) != 0) {
2998 		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
2999 	}
3000 	mutex_exit(&vmm_mtx);
3001 
3002 	if (hma_release) {
3003 		vmm_hma_release();
3004 	}
3005 
3006 	return (0);
3007 }
3008 
3009 static int
3010 vmm_is_supported(intptr_t arg)
3011 {
3012 	int r;
3013 	const char *msg;
3014 
3015 	if (vmm_is_intel()) {
3016 		r = vmx_x86_supported(&msg);
3017 	} else if (vmm_is_svm()) {
3018 		/*
3019 		 * HMA already ensured that the features necessary for SVM
3020 		 * operation were present and online during vmm_attach().
3021 		 */
3022 		r = 0;
3023 	} else {
3024 		r = ENXIO;
3025 		msg = "Unsupported CPU vendor";
3026 	}
3027 
3028 	if (r != 0 && arg != (intptr_t)NULL) {
3029 		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
3030 			return (EFAULT);
3031 	}
3032 	return (r);
3033 }
3034 
3035 static int
3036 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
3037 {
3038 	void *argp = (void *)arg;
3039 
3040 	switch (cmd) {
3041 	case VMM_CREATE_VM: {
3042 		struct vm_create_req req;
3043 
3044 		if ((md & FWRITE) == 0) {
3045 			return (EPERM);
3046 		}
3047 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
3048 			return (EFAULT);
3049 		}
3050 		return (vmmdev_do_vm_create(&req, cr));
3051 	}
3052 	case VMM_DESTROY_VM: {
3053 		struct vm_destroy_req req;
3054 
3055 		if ((md & FWRITE) == 0) {
3056 			return (EPERM);
3057 		}
3058 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
3059 			return (EFAULT);
3060 		}
3061 		return (vmmdev_do_vm_destroy(&req, cr));
3062 	}
3063 	case VMM_VM_SUPPORTED:
3064 		return (vmm_is_supported(arg));
3065 	case VMM_CHECK_IOMMU:
3066 		if (!vmm_check_iommu()) {
3067 			return (ENXIO);
3068 		}
3069 		return (0);
3070 	case VMM_RESV_QUERY:
3071 	case VMM_RESV_SET_TARGET:
3072 		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
3073 	default:
3074 		break;
3075 	}
3076 	/* No other actions are legal on ctl device */
3077 	return (ENOTTY);
3078 }
3079 
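/*
 * Illustrative userspace sketch (not part of this driver): creating and then
 * destroying an instance via the control node.  The /dev/vmm/ctl path, the
 * userland headers, and the zero flags value are assumptions for the example.
 * Note that VMM_CREATE_VM and VMM_DESTROY_VM require the node be opened for
 * writing, and vmm_open() requires the control node be opened exclusively.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/vmm_dev.h>

static int
example_create_destroy(const char *name)
{
	int ctlfd = open("/dev/vmm/ctl", O_RDWR | O_EXCL);
	if (ctlfd < 0)
		return (-1);

	struct vm_create_req create = { .flags = 0 };
	(void) strlcpy(create.name, name, sizeof (create.name));
	if (ioctl(ctlfd, VMM_CREATE_VM, &create) != 0) {
		(void) close(ctlfd);
		return (-1);
	}

	struct vm_destroy_req destroy = { 0 };
	(void) strlcpy(destroy.name, name, sizeof (destroy.name));
	int err = ioctl(ctlfd, VMM_DESTROY_VM, &destroy);

	(void) close(ctlfd);
	return (err);
}
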
3080 static int
3081 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
3082     int *rvalp)
3083 {
3084 	vmm_softc_t	*sc;
3085 	minor_t		minor;
3086 
3087 	/*
3088 	 * Forbid running bhyve in a 32-bit process until it has been tested and
3089 	 * verified to be safe.
3090 	 */
3091 	if (curproc->p_model != DATAMODEL_LP64) {
3092 		return (EFBIG);
3093 	}
3094 
3095 	/* The structs in bhyve ioctls assume a 64-bit datamodel */
3096 	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
3097 		return (ENOTSUP);
3098 	}
3099 
3100 	/*
3101 	 * Regardless of minor (vmmctl or instance), we respond to queries of
3102 	 * the interface version.
3103 	 */
3104 	if (cmd == VMM_INTERFACE_VERSION) {
3105 		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
3106 		return (0);
3107 	}
3108 
3109 	minor = getminor(dev);
3110 
3111 	if (minor == VMM_CTL_MINOR) {
3112 		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
3113 	}
3114 
3115 	sc = ddi_get_soft_state(vmm_statep, minor);
3116 	ASSERT(sc != NULL);
3117 
3118 	/*
3119 	 * Turn away any ioctls against an instance when it is being destroyed.
3120 	 * (Except for the ioctl inquiring about that destroy-in-progress.)
3121 	 */
3122 	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
3123 		if (cmd == VM_DESTROY_PENDING) {
3124 			*rvalp = 1;
3125 			return (0);
3126 		}
3127 		return (ENXIO);
3128 	}
3129 
3130 	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
3131 }
3132 
3133 static int
3134 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
3135     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
3136 {
3137 	vmm_softc_t *sc;
3138 	const minor_t minor = getminor(dev);
3139 	int err;
3140 
3141 	if (minor == VMM_CTL_MINOR) {
3142 		return (ENODEV);
3143 	}
3144 	if (off < 0 || (off + len) <= 0) {
3145 		return (EINVAL);
3146 	}
3147 	if ((prot & PROT_USER) == 0) {
3148 		return (EACCES);
3149 	}
3150 
3151 	sc = ddi_get_soft_state(vmm_statep, minor);
3152 	ASSERT(sc);
3153 
3154 	if (sc->vmm_flags & VMM_DESTROY)
3155 		return (ENXIO);
3156 
3157 	/* Grab read lock on the VM to prevent any changes to the memory map */
3158 	vmm_read_lock(sc);
3159 
3160 	if (off >= VM_DEVMEM_START) {
3161 		int segid;
3162 		off_t segoff;
3163 
3164 		/* Mapping a devmem "device" */
3165 		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
3166 			err = ENODEV;
3167 		} else {
3168 			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
3169 			    addrp, prot, maxprot, flags);
3170 		}
3171 	} else {
3172 		/* Mapping a part of the guest physical space */
3173 		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
3174 		    maxprot, flags);
3175 	}
3176 
3177 	vmm_read_unlock(sc);
3178 	return (err);
3179 }
3180 
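/*
 * Illustrative userspace sketch (not part of this driver): mapping a run of
 * guest-physical address space into a process through mmap(2) on the instance
 * fd.  As handled above, offsets below VM_DEVMEM_START select guest-physical
 * space directly, so the file offset doubles as the starting GPA.  This is
 * only a sketch; a real consumer must respect the instance's memory map.
 */
#include <sys/mman.h>

static void *
example_map_guest(int vmfd, off_t gpa, size_t len)
{
	void *va = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
	    vmfd, gpa);

	return (va == MAP_FAILED ? NULL : va);
}
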
3181 static sdev_plugin_validate_t
3182 vmm_sdev_validate(sdev_ctx_t ctx)
3183 {
3184 	const char *name = sdev_ctx_name(ctx);
3185 	vmm_softc_t *sc;
3186 	sdev_plugin_validate_t ret;
3187 	minor_t minor;
3188 
3189 	if (sdev_ctx_vtype(ctx) != VCHR)
3190 		return (SDEV_VTOR_INVALID);
3191 
3192 	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
3193 
3194 	mutex_enter(&vmm_mtx);
3195 	if ((sc = vmm_lookup(name)) == NULL)
3196 		ret = SDEV_VTOR_INVALID;
3197 	else if (sc->vmm_minor != minor)
3198 		ret = SDEV_VTOR_STALE;
3199 	else
3200 		ret = SDEV_VTOR_VALID;
3201 	mutex_exit(&vmm_mtx);
3202 
3203 	return (ret);
3204 }
3205 
3206 static int
3207 vmm_sdev_filldir(sdev_ctx_t ctx)
3208 {
3209 	vmm_softc_t *sc;
3210 	int ret;
3211 
3212 	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
3213 		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
3214 		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
3215 		return (EINVAL);
3216 	}
3217 
3218 	mutex_enter(&vmm_mtx);
3219 	ASSERT(vmmdev_dip != NULL);
3220 	for (sc = list_head(&vmm_list); sc != NULL;
3221 	    sc = list_next(&vmm_list, sc)) {
3222 		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
3223 			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
3224 			    S_IFCHR | 0600,
3225 			    makedevice(ddi_driver_major(vmmdev_dip),
3226 			    sc->vmm_minor));
3227 		} else {
3228 			continue;
3229 		}
3230 		if (ret != 0 && ret != EEXIST)
3231 			goto out;
3232 	}
3233 
3234 	ret = 0;
3235 
3236 out:
3237 	mutex_exit(&vmm_mtx);
3238 	return (ret);
3239 }
3240 
3241 /* ARGSUSED */
3242 static void
3243 vmm_sdev_inactive(sdev_ctx_t ctx)
3244 {
3245 }
3246 
3247 static sdev_plugin_ops_t vmm_sdev_ops = {
3248 	.spo_version = SDEV_PLUGIN_VERSION,
3249 	.spo_flags = SDEV_PLUGIN_SUBDIR,
3250 	.spo_validate = vmm_sdev_validate,
3251 	.spo_filldir = vmm_sdev_filldir,
3252 	.spo_inactive = vmm_sdev_inactive
3253 };
3254 
3255 /* ARGSUSED */
3256 static int
3257 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
3258 {
3259 	int error;
3260 
3261 	switch (cmd) {
3262 	case DDI_INFO_DEVT2DEVINFO:
3263 		*result = (void *)vmmdev_dip;
3264 		error = DDI_SUCCESS;
3265 		break;
3266 	case DDI_INFO_DEVT2INSTANCE:
3267 		*result = (void *)0;
3268 		error = DDI_SUCCESS;
3269 		break;
3270 	default:
3271 		error = DDI_FAILURE;
3272 		break;
3273 	}
3274 	return (error);
3275 }
3276 
3277 static int
3278 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3279 {
3280 	sdev_plugin_hdl_t sph;
3281 	hma_reg_t *reg = NULL;
3282 	boolean_t vmm_loaded = B_FALSE;
3283 
3284 	if (cmd != DDI_ATTACH) {
3285 		return (DDI_FAILURE);
3286 	}
3287 
3288 	mutex_enter(&vmmdev_mtx);
3289 	/* Ensure we are not already attached. */
3290 	if (vmmdev_dip != NULL) {
3291 		mutex_exit(&vmmdev_mtx);
3292 		return (DDI_FAILURE);
3293 	}
3294 
3295 	vmm_sol_glue_init();
3296 
3297 	/*
3298 	 * Perform temporary HMA registration to determine if the system
3299 	 * is capable.
3300 	 */
3301 	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
3302 		goto fail;
3303 	} else if (vmm_mod_load() != 0) {
3304 		goto fail;
3305 	}
3306 	vmm_loaded = B_TRUE;
3307 	hma_unregister(reg);
3308 	reg = NULL;
3309 
3310 	/* Create control node.  Other nodes will be created on demand. */
3311 	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
3312 	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
3313 		goto fail;
3314 	}
3315 
3316 	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
3317 	if (sph == (sdev_plugin_hdl_t)NULL) {
3318 		ddi_remove_minor_node(dip, NULL);
3319 		goto fail;
3320 	}
3321 
3322 	ddi_report_dev(dip);
3323 	vmmdev_sdev_hdl = sph;
3324 	vmmdev_dip = dip;
3325 	mutex_exit(&vmmdev_mtx);
3326 	return (DDI_SUCCESS);
3327 
3328 fail:
3329 	if (vmm_loaded) {
3330 		VERIFY0(vmm_mod_unload());
3331 	}
3332 	if (reg != NULL) {
3333 		hma_unregister(reg);
3334 	}
3335 	vmm_sol_glue_cleanup();
3336 	mutex_exit(&vmmdev_mtx);
3337 	return (DDI_FAILURE);
3338 }
3339 
3340 static int
3341 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3342 {
3343 	if (cmd != DDI_DETACH) {
3344 		return (DDI_FAILURE);
3345 	}
3346 
3347 	/*
3348 	 * Ensure that all resources have been cleaned up.
3349 	 *
3350 	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
3351 	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
3352 	 * devinfo locked as iommu_cleanup() tries to recursively lock each
3353 	 * devinfo, including our own, while holding vmmdev_mtx.
3354 	 */
3355 	if (mutex_tryenter(&vmmdev_mtx) == 0)
3356 		return (DDI_FAILURE);
3357 
3358 	mutex_enter(&vmm_mtx);
3359 	if (!list_is_empty(&vmm_list)) {
3360 		mutex_exit(&vmm_mtx);
3361 		mutex_exit(&vmmdev_mtx);
3362 		return (DDI_FAILURE);
3363 	}
3364 	mutex_exit(&vmm_mtx);
3365 
3366 	if (!vmmr_is_empty()) {
3367 		mutex_exit(&vmmdev_mtx);
3368 		return (DDI_FAILURE);
3369 	}
3370 
3371 	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
3372 	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
3373 		mutex_exit(&vmmdev_mtx);
3374 		return (DDI_FAILURE);
3375 	}
3376 	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
3377 
3378 	/* Remove the control node. */
3379 	ddi_remove_minor_node(dip, "ctl");
3380 	vmmdev_dip = NULL;
3381 
3382 	VERIFY0(vmm_mod_unload());
3383 	VERIFY3U(vmmdev_hma_reg, ==, NULL);
3384 	vmm_sol_glue_cleanup();
3385 
3386 	mutex_exit(&vmmdev_mtx);
3387 
3388 	return (DDI_SUCCESS);
3389 }
3390 
3391 static struct cb_ops vmm_cb_ops = {
3392 	vmm_open,
3393 	vmm_close,
3394 	nodev,		/* strategy */
3395 	nodev,		/* print */
3396 	nodev,		/* dump */
3397 	nodev,		/* read */
3398 	nodev,		/* write */
3399 	vmm_ioctl,
3400 	nodev,		/* devmap */
3401 	nodev,		/* mmap */
3402 	vmm_segmap,
3403 	nochpoll,	/* poll */
3404 	ddi_prop_op,
3405 	NULL,
3406 	D_NEW | D_MP | D_DEVMAP
3407 };
3408 
3409 static struct dev_ops vmm_ops = {
3410 	DEVO_REV,
3411 	0,
3412 	vmm_info,
3413 	nulldev,	/* identify */
3414 	nulldev,	/* probe */
3415 	vmm_attach,
3416 	vmm_detach,
3417 	nodev,		/* reset */
3418 	&vmm_cb_ops,
3419 	(struct bus_ops *)NULL
3420 };
3421 
3422 static struct modldrv modldrv = {
3423 	&mod_driverops,
3424 	"bhyve vmm",
3425 	&vmm_ops
3426 };
3427 
3428 static struct modlinkage modlinkage = {
3429 	MODREV_1,
3430 	&modldrv,
3431 	NULL
3432 };
3433 
3434 int
3435 _init(void)
3436 {
3437 	int	error;
3438 
3439 	sysinit();
3440 
3441 	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
3442 	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
3443 	list_create(&vmm_list, sizeof (vmm_softc_t),
3444 	    offsetof(vmm_softc_t, vmm_node));
3445 	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
3446 
3447 	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
3448 	if (error) {
3449 		return (error);
3450 	}
3451 
3452 	error = vmmr_init();
3453 	if (error) {
3454 		ddi_soft_state_fini(&vmm_statep);
3455 		return (error);
3456 	}
3457 
3458 	vmm_zsd_init();
3459 
3460 	error = mod_install(&modlinkage);
3461 	if (error) {
3462 		ddi_soft_state_fini(&vmm_statep);
3463 		vmm_zsd_fini();
3464 		vmmr_fini();
3465 	}
3466 
3467 	return (error);
3468 }
3469 
3470 int
3471 _fini(void)
3472 {
3473 	int	error;
3474 
3475 	error = mod_remove(&modlinkage);
3476 	if (error) {
3477 		return (error);
3478 	}
3479 
3480 	vmm_zsd_fini();
3481 	vmmr_fini();
3482 
3483 	ddi_soft_state_fini(&vmm_statep);
3484 
3485 	return (0);
3486 }
3487 
3488 int
3489 _info(struct modinfo *modinfop)
3490 {
3491 	return (mod_info(&modlinkage, modinfop));
3492 }
3493