xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_sol_dev.c (revision c3b97060722accbd08cd9eb3f18cc189b2c07b5e)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2015 Pluribus Networks Inc.
15  * Copyright 2019 Joyent, Inc.
16  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17  * Copyright 2025 Oxide Computer Company
18  */
19 
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35 
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40 
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49 
50 #include <vm/seg_dev.h>
51 
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61 
62 /*
63  * Locking details:
64  *
65  * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
66  * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
67  * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
68  * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
69  * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
70  */
71 
72 static kmutex_t		vmmdev_mtx;
73 static dev_info_t	*vmmdev_dip;
74 static hma_reg_t	*vmmdev_hma_reg;
75 static uint_t		vmmdev_hma_ref;
76 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
77 
78 static kmutex_t		vmm_mtx;
79 static list_t		vmm_list;
80 static id_space_t	*vmm_minors;
81 static void		*vmm_statep;
82 
/*
 * Switch controlling writes made through the vmm-data interface.
 *
 * Until device emulation in bhyve had been adequately scrutinized and tested,
 * there was (justified) concern that unusual or corrupt device state payloads
 * could crash the host when loaded via the vmm-data interface.
 *
 * With those concerns mitigated, the protection now defaults to allowing such
 * writes.  The switch itself is retained in case there is a need to once
 * again clamp down on vmm-data writes.
 */
int		vmm_allow_state_writes = 1;

static const char *vmmdev_hvm_name = "bhyve";

/* For sdev plugin (/dev) */
#define	VMM_SDEV_ROOT "/dev/vmm"

/* From uts/intel/io/vmm/intel/vmx.c */
extern int vmx_x86_supported(const char **);
101 
102 /* Holds and hooks from drivers external to vmm */
103 struct vmm_hold {
104 	list_node_t	vmh_node;
105 	vmm_softc_t	*vmh_sc;
106 	boolean_t	vmh_release_req;
107 	uint_t		vmh_ioport_hook_cnt;
108 	uint_t		vmh_mmio_hook_cnt;
109 };
110 
111 struct vmm_lease {
112 	list_node_t		vml_node;
113 	struct vm		*vml_vm;
114 	vm_client_t		*vml_vmclient;
115 	boolean_t		vml_expired;
116 	boolean_t		vml_break_deferred;
117 	boolean_t		(*vml_expire_func)(void *);
118 	void			*vml_expire_arg;
119 	struct vmm_hold		*vml_hold;
120 };
121 
/*
 * Options for vmm_destroy_locked.  Apart from VDO_DEFAULT, each option
 * occupies its own bit.
 */
typedef enum vmm_destroy_opts {
	/* No special handling requested */
	VDO_DEFAULT		= 0,

	/*
	 * Leave the zone-specific-data (ZSD) associated with this VM alone
	 * during the destroy.  This is required when the VM is being torn
	 * down as part of zone destruction, since that ZSD is already in the
	 * process of being cleaned up.
	 */
	VDO_NO_CLEAN_ZSD	= 0x1,

	/*
	 * Attempt to wait for VM destruction to complete.  Waiting is opt-in,
	 * as many normal conditions could leave destruction stalled pending
	 * other clean-up.
	 */
	VDO_ATTEMPT_WAIT	= 0x2,
} vmm_destroy_opts_t;
139 
140 static void vmm_hma_release(void);
141 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *);
142 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
143 static void vmm_lease_block(vmm_softc_t *);
144 static void vmm_lease_unblock(vmm_softc_t *);
145 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
146 static void vmm_kstat_init(vmm_softc_t *);
147 static void vmm_kstat_fini(vmm_softc_t *);
148 
149 /*
150  * The 'devmem' hack:
151  *
152  * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
153  * in the vm which appear with their own name related to the vm under /dev.
154  * Since this would be a hassle from an sdev perspective and would require a
155  * new cdev interface (or complicate the existing one), we choose to implement
156  * this in a different manner.  Direct access to the underlying vm memory
157  * segments is exposed by placing them in a range of offsets beyond the normal
158  * guest memory space.  Userspace can query the appropriate offset to mmap()
159  * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
160  */
161 
162 static vmm_devmem_entry_t *
vmmdev_devmem_find(vmm_softc_t * sc,int segid)163 vmmdev_devmem_find(vmm_softc_t *sc, int segid)
164 {
165 	vmm_devmem_entry_t *ent = NULL;
166 	list_t *dl = &sc->vmm_devmem_list;
167 
168 	for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
169 		if (ent->vde_segid == segid) {
170 			return (ent);
171 		}
172 	}
173 	return (NULL);
174 }
175 
176 static int
vmmdev_get_memseg(vmm_softc_t * sc,struct vm_memseg * mseg)177 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
178 {
179 	int error;
180 	bool sysmem;
181 
182 	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
183 	    NULL);
184 	if (error || mseg->len == 0)
185 		return (error);
186 
187 	if (!sysmem) {
188 		vmm_devmem_entry_t *de;
189 
190 		de = vmmdev_devmem_find(sc, mseg->segid);
191 		if (de != NULL) {
192 			(void) strlcpy(mseg->name, de->vde_name,
193 			    sizeof (mseg->name));
194 		}
195 	} else {
196 		bzero(mseg->name, sizeof (mseg->name));
197 	}
198 
199 	return (error);
200 }
201 
202 static int
vmmdev_devmem_create(vmm_softc_t * sc,struct vm_memseg * mseg,const char * name)203 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
204 {
205 	off_t map_offset;
206 	vmm_devmem_entry_t *entry;
207 
208 	if (list_is_empty(&sc->vmm_devmem_list)) {
209 		map_offset = VM_DEVMEM_START;
210 	} else {
211 		entry = list_tail(&sc->vmm_devmem_list);
212 		if (sum_overflows_off(entry->vde_off, (off_t)entry->vde_len)) {
213 			/* Do not tolerate overflow */
214 			return (ERANGE);
215 		}
216 		map_offset = entry->vde_off + (off_t)entry->vde_len;
217 		/*
218 		 * XXXJOY: We could choose to search the list for duplicate
219 		 * names and toss an error.  Since we're using the offset
220 		 * method for now, it does not make much of a difference.
221 		 */
222 	}
223 
224 	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
225 	entry->vde_segid = mseg->segid;
226 	entry->vde_len = mseg->len;
227 	entry->vde_off = map_offset;
228 	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
229 	list_insert_tail(&sc->vmm_devmem_list, entry);
230 
231 	return (0);
232 }
233 
234 static boolean_t
vmmdev_devmem_segid(vmm_softc_t * sc,off_t off,off_t len,int * segidp,off_t * map_offp)235 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
236     off_t *map_offp)
237 {
238 	list_t *dl = &sc->vmm_devmem_list;
239 	vmm_devmem_entry_t *de = NULL;
240 
241 	VERIFY(off >= VM_DEVMEM_START);
242 
243 	if (sum_overflows_off(off, len)) {
244 		/* No match on overflow */
245 		return (B_FALSE);
246 	}
247 	const off_t map_end = off + len;
248 
249 	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
250 		const off_t item_end = de->vde_off + de->vde_len;
251 
252 		if (de->vde_off <= off && item_end >= map_end) {
253 			*segidp = de->vde_segid;
254 			*map_offp = off - de->vde_off;
255 			return (B_TRUE);
256 		}
257 	}
258 	return (B_FALSE);
259 }
260 
261 /*
262  * When an instance is being destroyed, the devmem list of named memory objects
263  * can be torn down, as no new mappings are allowed.
264  */
265 static void
vmmdev_devmem_purge(vmm_softc_t * sc)266 vmmdev_devmem_purge(vmm_softc_t *sc)
267 {
268 	vmm_devmem_entry_t *entry;
269 
270 	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
271 		kmem_free(entry, sizeof (*entry));
272 	}
273 }
274 
275 static int
vmmdev_alloc_memseg(vmm_softc_t * sc,struct vm_memseg * mseg)276 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
277 {
278 	int error;
279 	bool sysmem = true;
280 
281 	if (VM_MEMSEG_NAME(mseg)) {
282 		sysmem = false;
283 	}
284 	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
285 
286 	if (error == 0) {
287 		/*
288 		 * Rather than create a whole fresh device from which userspace
289 		 * can mmap this segment, instead make it available at an
290 		 * offset above where the main guest memory resides.
291 		 */
292 		error = vmmdev_devmem_create(sc, mseg, mseg->name);
293 		if (error != 0) {
294 			vm_free_memseg(sc->vmm_vm, mseg->segid);
295 		}
296 	}
297 	return (error);
298 }
299 
300 /*
301  * Resource Locking and Exclusion
302  *
303  * Much of bhyve depends on key portions of VM state, such as the guest memory
304  * map, to remain unchanged while the guest is running.  As ported from
305  * FreeBSD, the initial strategy for this resource exclusion hinged on gating
306  * access to the instance vCPUs.  Threads acting on a single vCPU, like those
307  * performing the work of actually running the guest in VMX/SVM, would lock
308  * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
309  * state, all of the vCPUs would be first locked, ensuring that the
310  * operation(s) could complete without any other threads stumbling into
311  * intermediate states.
312  *
313  * This approach is largely effective for bhyve.  Common operations, such as
314  * running the vCPUs, steer clear of lock contention.  The model begins to
315  * break down for operations which do not occur in the context of a specific
316  * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
317  * thread in the bhyve process.  In order to properly protect those vCPU-less
318  * operations from encountering invalid states, additional locking is required.
319  * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
320  * It does mean that class of operations will be serialized on locking the
321  * specific vCPU and that instances sized at VM_MAXCPU will potentially see
322  * undue contention on the VM_MAXCPU-1 vCPU.
323  *
324  * In order to address the shortcomings of this model, the concept of a
325  * read/write lock has been added to bhyve.  Operations which change
326  * fundamental aspects of a VM (such as the memory map) must acquire the write
327  * lock, which also implies locking all of the vCPUs and waiting for all read
328  * lock holders to release.  While it increases the cost and waiting time for
329  * those few operations, it allows most hot-path operations on the VM (which
330  * depend on its configuration remaining stable) to occur with minimal locking.
331  *
332  * Consumers of the Driver API (see below) are a special case when it comes to
333  * this locking, since they may hold a read lock via the drv_lease mechanism
334  * for an extended period of time.  Rather than forcing those consumers to
335  * continuously poll for a write lock attempt, the lease system forces them to
336  * provide a release callback to trigger their clean-up (and potential later
337  * reacquisition) of the read lock.
338  */
339 
340 static void
vcpu_lock_one(vmm_softc_t * sc,int vcpu)341 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
342 {
343 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
344 
345 	/*
346 	 * Since this state transition is utilizing from_idle=true, it should
347 	 * not fail, but rather block until it can be successful.
348 	 */
349 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
350 }
351 
352 static void
vcpu_unlock_one(vmm_softc_t * sc,int vcpu)353 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
354 {
355 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
356 
357 	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
358 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
359 }
360 
361 static void
vmm_read_lock(vmm_softc_t * sc)362 vmm_read_lock(vmm_softc_t *sc)
363 {
364 	rw_enter(&sc->vmm_rwlock, RW_READER);
365 }
366 
367 static void
vmm_read_unlock(vmm_softc_t * sc)368 vmm_read_unlock(vmm_softc_t *sc)
369 {
370 	rw_exit(&sc->vmm_rwlock);
371 }
372 
373 static void
vmm_write_lock(vmm_softc_t * sc)374 vmm_write_lock(vmm_softc_t *sc)
375 {
376 	int maxcpus;
377 
378 	/* First lock all the vCPUs */
379 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
380 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
381 		vcpu_lock_one(sc, vcpu);
382 	}
383 
384 	/*
385 	 * Block vmm_drv leases from being acquired or held while the VM write
386 	 * lock is held.
387 	 */
388 	vmm_lease_block(sc);
389 
390 	rw_enter(&sc->vmm_rwlock, RW_WRITER);
391 	/*
392 	 * For now, the 'maxcpus' value for an instance is fixed at the
393 	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
394 	 * the future, allowing for dynamic vCPU resource sizing, acquisition
395 	 * of the write lock will need to be wary of such changes.
396 	 */
397 	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
398 }
399 
400 static void
vmm_write_unlock(vmm_softc_t * sc)401 vmm_write_unlock(vmm_softc_t *sc)
402 {
403 	int maxcpus;
404 
405 	/* Allow vmm_drv leases to be acquired once write lock is dropped */
406 	vmm_lease_unblock(sc);
407 
408 	/*
409 	 * The VM write lock _must_ be released from the same thread it was
410 	 * acquired in, unlike the read lock.
411 	 */
412 	VERIFY(rw_write_held(&sc->vmm_rwlock));
413 	rw_exit(&sc->vmm_rwlock);
414 
415 	/* Unlock all the vCPUs */
416 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
417 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
418 		vcpu_unlock_one(sc, vcpu);
419 	}
420 }
421 
422 static int
vmmdev_do_ioctl(vmm_softc_t * sc,int cmd,intptr_t arg,int md,cred_t * credp,int * rvalp)423 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
424     cred_t *credp, int *rvalp)
425 {
426 	int error = 0, vcpu = -1;
427 	void *datap = (void *)arg;
428 	enum vm_lock_type {
429 		LOCK_NONE = 0,
430 		LOCK_VCPU,
431 		LOCK_READ_HOLD,
432 		LOCK_WRITE_HOLD
433 	} lock_type = LOCK_NONE;
434 
435 	/* Acquire any exclusion resources needed for the operation. */
436 	switch (cmd) {
437 	case VM_RUN:
438 	case VM_GET_REGISTER:
439 	case VM_SET_REGISTER:
440 	case VM_GET_SEGMENT_DESCRIPTOR:
441 	case VM_SET_SEGMENT_DESCRIPTOR:
442 	case VM_GET_REGISTER_SET:
443 	case VM_SET_REGISTER_SET:
444 	case VM_INJECT_EXCEPTION:
445 	case VM_GET_CAPABILITY:
446 	case VM_SET_CAPABILITY:
447 	case VM_PPTDEV_MSI:
448 	case VM_PPTDEV_MSIX:
449 	case VM_SET_X2APIC_STATE:
450 	case VM_GLA2GPA:
451 	case VM_GLA2GPA_NOFAULT:
452 	case VM_ACTIVATE_CPU:
453 	case VM_SET_INTINFO:
454 	case VM_GET_INTINFO:
455 	case VM_RESTART_INSTRUCTION:
456 	case VM_SET_KERNEMU_DEV:
457 	case VM_GET_KERNEMU_DEV:
458 	case VM_RESET_CPU:
459 	case VM_GET_RUN_STATE:
460 	case VM_SET_RUN_STATE:
461 	case VM_GET_FPU:
462 	case VM_SET_FPU:
463 	case VM_GET_CPUID:
464 	case VM_SET_CPUID:
465 	case VM_LEGACY_CPUID:
466 		/*
467 		 * Copy in the ID of the vCPU chosen for this operation.
468 		 * Since a nefarious caller could update their struct between
469 		 * this locking and when the rest of the ioctl data is copied
470 		 * in, it is _critical_ that this local 'vcpu' variable be used
471 		 * rather than the in-struct one when performing the ioctl.
472 		 */
473 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
474 			return (EFAULT);
475 		}
476 		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
477 			return (EINVAL);
478 		}
479 		vcpu_lock_one(sc, vcpu);
480 		lock_type = LOCK_VCPU;
481 		break;
482 
483 	case VM_REINIT:
484 	case VM_BIND_PPTDEV:
485 	case VM_UNBIND_PPTDEV:
486 	case VM_MAP_PPTDEV_MMIO:
487 	case VM_UNMAP_PPTDEV_MMIO:
488 	case VM_ALLOC_MEMSEG:
489 	case VM_MMAP_MEMSEG:
490 	case VM_MUNMAP_MEMSEG:
491 	case VM_WRLOCK_CYCLE:
492 	case VM_PMTMR_LOCATE:
493 	case VM_PAUSE:
494 	case VM_RESUME:
495 		vmm_write_lock(sc);
496 		lock_type = LOCK_WRITE_HOLD;
497 		break;
498 
499 	case VM_GET_MEMSEG:
500 	case VM_MMAP_GETNEXT:
501 	case VM_LAPIC_IRQ:
502 	case VM_INJECT_NMI:
503 	case VM_IOAPIC_ASSERT_IRQ:
504 	case VM_IOAPIC_DEASSERT_IRQ:
505 	case VM_IOAPIC_PULSE_IRQ:
506 	case VM_LAPIC_MSI:
507 	case VM_LAPIC_LOCAL_IRQ:
508 	case VM_GET_X2APIC_STATE:
509 	case VM_RTC_READ:
510 	case VM_RTC_WRITE:
511 	case VM_RTC_SETTIME:
512 	case VM_RTC_GETTIME:
513 	case VM_PPTDEV_DISABLE_MSIX:
514 	case VM_DEVMEM_GETOFFSET:
515 	case VM_TRACK_DIRTY_PAGES:
516 	case VM_NPT_OPERATION:
517 		vmm_read_lock(sc);
518 		lock_type = LOCK_READ_HOLD;
519 		break;
520 
521 	case VM_DATA_READ:
522 	case VM_DATA_WRITE:
523 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
524 			return (EFAULT);
525 		}
526 		if (vcpu == -1) {
527 			/* Access data for VM-wide devices */
528 			vmm_write_lock(sc);
529 			lock_type = LOCK_WRITE_HOLD;
530 		} else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) {
531 			/* Access data associated with a specific vCPU */
532 			vcpu_lock_one(sc, vcpu);
533 			lock_type = LOCK_VCPU;
534 		} else {
535 			return (EINVAL);
536 		}
537 		break;
538 
539 	case VM_GET_GPA_PMAP:
540 	case VM_IOAPIC_PINCOUNT:
541 	case VM_SUSPEND:
542 	case VM_DESC_FPU_AREA:
543 	case VM_SET_AUTODESTRUCT:
544 	case VM_DESTROY_SELF:
545 	case VM_DESTROY_PENDING:
546 	case VM_VCPU_BARRIER:
547 	default:
548 		break;
549 	}
550 
551 	/* Execute the primary logic for the ioctl. */
552 	switch (cmd) {
553 	case VM_RUN: {
554 		struct vm_entry entry;
555 
556 		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
557 			error = EFAULT;
558 			break;
559 		}
560 
561 		if (!(curthread->t_schedflag & TS_VCPU))
562 			smt_mark_as_vcpu();
563 
564 		error = vm_run(sc->vmm_vm, vcpu, &entry);
565 
566 		/*
567 		 * Unexpected states in vm_run() are expressed through positive
568 		 * errno-oriented return values.  VM states which expect further
569 		 * processing in userspace (necessary context via exitinfo) are
570 		 * expressed through negative return values.  For the time being
571 		 * a return value of 0 is not expected from vm_run().
572 		 */
573 		ASSERT(error != 0);
574 		if (error < 0) {
575 			const struct vm_exit *vme;
576 			void *outp = entry.exit_data;
577 
578 			error = 0;
579 			vme = vm_exitinfo(sc->vmm_vm, vcpu);
580 			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
581 				error = EFAULT;
582 			}
583 		}
584 		break;
585 	}
586 	case VM_SUSPEND: {
587 		struct vm_suspend vmsuspend;
588 
589 		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
590 			error = EFAULT;
591 			break;
592 		}
593 		error = vm_suspend(sc->vmm_vm, vmsuspend.how, vmsuspend.source);
594 		break;
595 	}
596 	case VM_REINIT: {
597 		struct vm_reinit reinit;
598 
599 		if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
600 			error = EFAULT;
601 			break;
602 		}
603 		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
604 			/*
605 			 * The VM instance should be free of driver-attached
606 			 * hooks during the reinitialization process.
607 			 */
608 			break;
609 		}
610 		error = vm_reinit(sc->vmm_vm, reinit.flags);
611 		(void) vmm_drv_block_hook(sc, B_FALSE);
612 		break;
613 	}
614 	case VM_STAT_DESC: {
615 		struct vm_stat_desc statdesc;
616 
617 		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
618 			error = EFAULT;
619 			break;
620 		}
621 		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
622 		    sizeof (statdesc.desc));
623 		if (error == 0 &&
624 		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
625 			error = EFAULT;
626 			break;
627 		}
628 		break;
629 	}
630 	case VM_STATS_IOC: {
631 		struct vm_stats vmstats;
632 
633 		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
634 			error = EFAULT;
635 			break;
636 		}
637 		hrt2tv(gethrtime(), &vmstats.tv);
638 		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index,
639 		    nitems(vmstats.statbuf),
640 		    &vmstats.num_entries, vmstats.statbuf);
641 		if (error == 0 &&
642 		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
643 			error = EFAULT;
644 			break;
645 		}
646 		break;
647 	}
648 
649 	case VM_PPTDEV_MSI: {
650 		struct vm_pptdev_msi pptmsi;
651 
652 		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
653 			error = EFAULT;
654 			break;
655 		}
656 		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
657 		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
658 		break;
659 	}
660 	case VM_PPTDEV_MSIX: {
661 		struct vm_pptdev_msix pptmsix;
662 
663 		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
664 			error = EFAULT;
665 			break;
666 		}
667 		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
668 		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
669 		    pptmsix.vector_control);
670 		break;
671 	}
672 	case VM_PPTDEV_DISABLE_MSIX: {
673 		struct vm_pptdev pptdev;
674 
675 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
676 			error = EFAULT;
677 			break;
678 		}
679 		error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
680 		break;
681 	}
682 	case VM_MAP_PPTDEV_MMIO: {
683 		struct vm_pptdev_mmio pptmmio;
684 
685 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
686 			error = EFAULT;
687 			break;
688 		}
689 		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
690 		    pptmmio.len, pptmmio.hpa);
691 		break;
692 	}
693 	case VM_UNMAP_PPTDEV_MMIO: {
694 		struct vm_pptdev_mmio pptmmio;
695 
696 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
697 			error = EFAULT;
698 			break;
699 		}
700 		error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
701 		    pptmmio.len);
702 		break;
703 	}
704 	case VM_BIND_PPTDEV: {
705 		struct vm_pptdev pptdev;
706 
707 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
708 			error = EFAULT;
709 			break;
710 		}
711 		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
712 		break;
713 	}
714 	case VM_UNBIND_PPTDEV: {
715 		struct vm_pptdev pptdev;
716 
717 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
718 			error = EFAULT;
719 			break;
720 		}
721 		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
722 		break;
723 	}
724 	case VM_GET_PPTDEV_LIMITS: {
725 		struct vm_pptdev_limits pptlimits;
726 
727 		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
728 			error = EFAULT;
729 			break;
730 		}
731 		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
732 		    &pptlimits.msi_limit, &pptlimits.msix_limit);
733 		if (error == 0 &&
734 		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
735 			error = EFAULT;
736 			break;
737 		}
738 		break;
739 	}
740 	case VM_INJECT_EXCEPTION: {
741 		struct vm_exception vmexc;
742 		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
743 			error = EFAULT;
744 			break;
745 		}
746 		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
747 		    vmexc.error_code_valid != 0, vmexc.error_code,
748 		    vmexc.restart_instruction != 0);
749 		break;
750 	}
751 	case VM_INJECT_NMI: {
752 		struct vm_nmi vmnmi;
753 
754 		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
755 			error = EFAULT;
756 			break;
757 		}
758 		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
759 		break;
760 	}
761 	case VM_LAPIC_IRQ: {
762 		struct vm_lapic_irq vmirq;
763 
764 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
765 			error = EFAULT;
766 			break;
767 		}
768 		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
769 		break;
770 	}
771 	case VM_LAPIC_LOCAL_IRQ: {
772 		struct vm_lapic_irq vmirq;
773 
774 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
775 			error = EFAULT;
776 			break;
777 		}
778 		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
779 		    vmirq.vector);
780 		break;
781 	}
782 	case VM_LAPIC_MSI: {
783 		struct vm_lapic_msi vmmsi;
784 
785 		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
786 			error = EFAULT;
787 			break;
788 		}
789 		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
790 		break;
791 	}
792 
793 	case VM_IOAPIC_ASSERT_IRQ: {
794 		struct vm_ioapic_irq ioapic_irq;
795 
796 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
797 			error = EFAULT;
798 			break;
799 		}
800 		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
801 		break;
802 	}
803 	case VM_IOAPIC_DEASSERT_IRQ: {
804 		struct vm_ioapic_irq ioapic_irq;
805 
806 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
807 			error = EFAULT;
808 			break;
809 		}
810 		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
811 		break;
812 	}
813 	case VM_IOAPIC_PULSE_IRQ: {
814 		struct vm_ioapic_irq ioapic_irq;
815 
816 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
817 			error = EFAULT;
818 			break;
819 		}
820 		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
821 		break;
822 	}
823 	case VM_IOAPIC_PINCOUNT: {
824 		int pincount;
825 
826 		pincount = vioapic_pincount(sc->vmm_vm);
827 		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
828 			error = EFAULT;
829 			break;
830 		}
831 		break;
832 	}
833 	case VM_DESC_FPU_AREA: {
834 		struct vm_fpu_desc desc;
835 		void *buf = NULL;
836 
837 		if (ddi_copyin(datap, &desc, sizeof (desc), md)) {
838 			error = EFAULT;
839 			break;
840 		}
841 		if (desc.vfd_num_entries > 64) {
842 			error = EINVAL;
843 			break;
844 		}
845 		const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) *
846 		    desc.vfd_num_entries;
847 		if (buf_sz != 0) {
848 			buf = kmem_zalloc(buf_sz, KM_SLEEP);
849 		}
850 
851 		/*
852 		 * For now, we are depending on vm_fpu_desc_entry and
853 		 * hma_xsave_state_desc_t having the same format.
854 		 */
855 		CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
856 		    sizeof (hma_xsave_state_desc_t));
857 
858 		size_t req_size;
859 		const uint_t max_entries = hma_fpu_describe_xsave_state(
860 		    (hma_xsave_state_desc_t *)buf,
861 		    desc.vfd_num_entries,
862 		    &req_size);
863 
864 		desc.vfd_req_size = req_size;
865 		desc.vfd_num_entries = max_entries;
866 		if (buf_sz != 0) {
867 			if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
868 				error = EFAULT;
869 			}
870 			kmem_free(buf, buf_sz);
871 		}
872 
873 		if (error == 0) {
874 			if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
875 				error = EFAULT;
876 			}
877 		}
878 		break;
879 	}
880 	case VM_SET_AUTODESTRUCT: {
881 		/*
882 		 * Since this has to do with controlling the lifetime of the
883 		 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather
884 		 * than the vcpu-centric or rwlock exclusion mechanisms.
885 		 */
886 		mutex_enter(&vmm_mtx);
887 		if (arg != 0) {
888 			sc->vmm_flags |= VMM_AUTODESTROY;
889 		} else {
890 			sc->vmm_flags &= ~VMM_AUTODESTROY;
891 		}
892 		mutex_exit(&vmm_mtx);
893 		break;
894 	}
895 	case VM_DESTROY_SELF: {
896 		bool hma_release = false;
897 
898 		/*
899 		 * Just like VMM_DESTROY_VM, but on the instance file descriptor
900 		 * itself, rather than having to perform a racy name lookup as
901 		 * part of the destroy process.
902 		 *
903 		 * Since vmm_destroy_locked() performs vCPU lock acquisition in
904 		 * order to kick the vCPUs out of guest context as part of any
905 		 * destruction, we do not need to worry about it ourself using
906 		 * the `lock_type` logic here.
907 		 */
908 		mutex_enter(&vmm_mtx);
909 		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
910 		mutex_exit(&vmm_mtx);
911 		if (hma_release) {
912 			vmm_hma_release();
913 		}
914 		break;
915 	}
916 	case VM_DESTROY_PENDING: {
917 		/*
918 		 * If we have made it this far, then destruction of the instance
919 		 * has not been initiated.
920 		 */
921 		*rvalp = 0;
922 		break;
923 	}
924 
925 	case VM_ISA_ASSERT_IRQ: {
926 		struct vm_isa_irq isa_irq;
927 
928 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
929 			error = EFAULT;
930 			break;
931 		}
932 		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
933 		if (error == 0 && isa_irq.ioapic_irq != -1) {
934 			error = vioapic_assert_irq(sc->vmm_vm,
935 			    isa_irq.ioapic_irq);
936 		}
937 		break;
938 	}
939 	case VM_ISA_DEASSERT_IRQ: {
940 		struct vm_isa_irq isa_irq;
941 
942 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
943 			error = EFAULT;
944 			break;
945 		}
946 		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
947 		if (error == 0 && isa_irq.ioapic_irq != -1) {
948 			error = vioapic_deassert_irq(sc->vmm_vm,
949 			    isa_irq.ioapic_irq);
950 		}
951 		break;
952 	}
953 	case VM_ISA_PULSE_IRQ: {
954 		struct vm_isa_irq isa_irq;
955 
956 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
957 			error = EFAULT;
958 			break;
959 		}
960 		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
961 		if (error == 0 && isa_irq.ioapic_irq != -1) {
962 			error = vioapic_pulse_irq(sc->vmm_vm,
963 			    isa_irq.ioapic_irq);
964 		}
965 		break;
966 	}
967 	case VM_ISA_SET_IRQ_TRIGGER: {
968 		struct vm_isa_irq_trigger isa_irq_trigger;
969 
970 		if (ddi_copyin(datap, &isa_irq_trigger,
971 		    sizeof (isa_irq_trigger), md)) {
972 			error = EFAULT;
973 			break;
974 		}
975 		error = vatpic_set_irq_trigger(sc->vmm_vm,
976 		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
977 		break;
978 	}
979 
980 	case VM_MMAP_GETNEXT: {
981 		struct vm_memmap mm;
982 
983 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
984 			error = EFAULT;
985 			break;
986 		}
987 		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
988 		    (uintptr_t *)&mm.segoff, &mm.len, &mm.prot, &mm.flags);
989 		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
990 			error = EFAULT;
991 			break;
992 		}
993 		break;
994 	}
995 	case VM_MMAP_MEMSEG: {
996 		struct vm_memmap mm;
997 
998 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
999 			error = EFAULT;
1000 			break;
1001 		}
1002 		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid,
1003 		    (uintptr_t)mm.segoff, mm.len, mm.prot, mm.flags);
1004 		break;
1005 	}
1006 	case VM_MUNMAP_MEMSEG: {
1007 		struct vm_munmap mu;
1008 
1009 		if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
1010 			error = EFAULT;
1011 			break;
1012 		}
1013 		error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
1014 		break;
1015 	}
1016 	case VM_ALLOC_MEMSEG: {
1017 		struct vm_memseg vmseg;
1018 
1019 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1020 			error = EFAULT;
1021 			break;
1022 		}
1023 		error = vmmdev_alloc_memseg(sc, &vmseg);
1024 		break;
1025 	}
1026 	case VM_GET_MEMSEG: {
1027 		struct vm_memseg vmseg;
1028 
1029 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1030 			error = EFAULT;
1031 			break;
1032 		}
1033 		error = vmmdev_get_memseg(sc, &vmseg);
1034 		if (error == 0 &&
1035 		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
1036 			error = EFAULT;
1037 			break;
1038 		}
1039 		break;
1040 	}
1041 	case VM_GET_REGISTER: {
1042 		struct vm_register vmreg;
1043 
1044 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1045 			error = EFAULT;
1046 			break;
1047 		}
1048 		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
1049 		    &vmreg.regval);
1050 		if (error == 0 &&
1051 		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
1052 			error = EFAULT;
1053 			break;
1054 		}
1055 		break;
1056 	}
1057 	case VM_SET_REGISTER: {
1058 		struct vm_register vmreg;
1059 
1060 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1061 			error = EFAULT;
1062 			break;
1063 		}
1064 		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
1065 		    vmreg.regval);
1066 		break;
1067 	}
1068 	case VM_SET_SEGMENT_DESCRIPTOR: {
1069 		struct vm_seg_desc vmsegd;
1070 
1071 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1072 			error = EFAULT;
1073 			break;
1074 		}
1075 		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1076 		    &vmsegd.desc);
1077 		break;
1078 	}
1079 	case VM_GET_SEGMENT_DESCRIPTOR: {
1080 		struct vm_seg_desc vmsegd;
1081 
1082 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1083 			error = EFAULT;
1084 			break;
1085 		}
1086 		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1087 		    &vmsegd.desc);
1088 		if (error == 0 &&
1089 		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
1090 			error = EFAULT;
1091 			break;
1092 		}
1093 		break;
1094 	}
1095 	case VM_GET_REGISTER_SET: {
1096 		struct vm_register_set vrs;
1097 		int regnums[VM_REG_LAST];
1098 		uint64_t regvals[VM_REG_LAST];
1099 
1100 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1101 			error = EFAULT;
1102 			break;
1103 		}
1104 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1105 			error = EINVAL;
1106 			break;
1107 		}
1108 		if (ddi_copyin(vrs.regnums, regnums,
1109 		    sizeof (int) * vrs.count, md)) {
1110 			error = EFAULT;
1111 			break;
1112 		}
1113 
1114 		error = 0;
1115 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1116 			if (regnums[i] < 0) {
1117 				error = EINVAL;
1118 				break;
1119 			}
1120 			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
1121 			    &regvals[i]);
1122 		}
1123 		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
1124 		    sizeof (uint64_t) * vrs.count, md)) {
1125 			error = EFAULT;
1126 		}
1127 		break;
1128 	}
1129 	case VM_SET_REGISTER_SET: {
1130 		struct vm_register_set vrs;
1131 		int regnums[VM_REG_LAST];
1132 		uint64_t regvals[VM_REG_LAST];
1133 
1134 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1135 			error = EFAULT;
1136 			break;
1137 		}
1138 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1139 			error = EINVAL;
1140 			break;
1141 		}
1142 		if (ddi_copyin(vrs.regnums, regnums,
1143 		    sizeof (int) * vrs.count, md)) {
1144 			error = EFAULT;
1145 			break;
1146 		}
1147 		if (ddi_copyin(vrs.regvals, regvals,
1148 		    sizeof (uint64_t) * vrs.count, md)) {
1149 			error = EFAULT;
1150 			break;
1151 		}
1152 
1153 		error = 0;
1154 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1155 			/*
1156 			 * Setting registers in a set is not atomic, since a
1157 			 * failure in the middle of the set will cause a
1158 			 * bail-out and inconsistent register state.  Callers
1159 			 * should be wary of this.
1160 			 */
1161 			if (regnums[i] < 0) {
1162 				error = EINVAL;
1163 				break;
1164 			}
1165 			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1166 			    regvals[i]);
1167 		}
1168 		break;
1169 	}
1170 	case VM_RESET_CPU: {
1171 		struct vm_vcpu_reset vvr;
1172 
1173 		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1174 			error = EFAULT;
1175 			break;
1176 		}
1177 		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1178 			error = EINVAL;
1179 		}
1180 
1181 		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1182 		break;
1183 	}
1184 	case VM_GET_RUN_STATE: {
1185 		struct vm_run_state vrs;
1186 
1187 		bzero(&vrs, sizeof (vrs));
1188 		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1189 		    &vrs.sipi_vector);
1190 		if (error == 0) {
1191 			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1192 				error = EFAULT;
1193 				break;
1194 			}
1195 		}
1196 		break;
1197 	}
1198 	case VM_SET_RUN_STATE: {
1199 		struct vm_run_state vrs;
1200 
1201 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1202 			error = EFAULT;
1203 			break;
1204 		}
1205 		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1206 		    vrs.sipi_vector);
1207 		break;
1208 	}
1209 	case VM_GET_FPU: {
1210 		struct vm_fpu_state req;
1211 		const size_t max_len = (PAGESIZE * 2);
1212 		void *kbuf;
1213 
1214 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1215 			error = EFAULT;
1216 			break;
1217 		}
1218 		if (req.len > max_len || req.len == 0) {
1219 			error = EINVAL;
1220 			break;
1221 		}
1222 		kbuf = kmem_zalloc(req.len, KM_SLEEP);
1223 		error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1224 		if (error == 0) {
1225 			if (ddi_copyout(kbuf, req.buf, req.len, md)) {
1226 				error = EFAULT;
1227 			}
1228 		}
1229 		kmem_free(kbuf, req.len);
1230 		break;
1231 	}
1232 	case VM_SET_FPU: {
1233 		struct vm_fpu_state req;
1234 		const size_t max_len = (PAGESIZE * 2);
1235 		void *kbuf;
1236 
1237 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1238 			error = EFAULT;
1239 			break;
1240 		}
1241 		if (req.len > max_len || req.len == 0) {
1242 			error = EINVAL;
1243 			break;
1244 		}
1245 		kbuf = kmem_alloc(req.len, KM_SLEEP);
1246 		if (ddi_copyin(req.buf, kbuf, req.len, md)) {
1247 			error = EFAULT;
1248 		} else {
1249 			error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1250 		}
1251 		kmem_free(kbuf, req.len);
1252 		break;
1253 	}
1254 	case VM_GET_CPUID: {
1255 		struct vm_vcpu_cpuid_config cfg;
1256 		struct vcpu_cpuid_entry *entries = NULL;
1257 
1258 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1259 			error = EFAULT;
1260 			break;
1261 		}
1262 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1263 			error = EINVAL;
1264 			break;
1265 		}
1266 
1267 		const size_t entries_size =
1268 		    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1269 		if (entries_size != 0) {
1270 			entries = kmem_zalloc(entries_size, KM_SLEEP);
1271 		}
1272 
1273 		vcpu_cpuid_config_t vm_cfg = {
1274 			.vcc_nent = cfg.vvcc_nent,
1275 			.vcc_entries = entries,
1276 		};
1277 		error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1278 
1279 		/*
1280 		 * Only attempt to copy out the resultant entries if we were
1281 		 * able to query them from the instance.  The flags and number
1282 		 * of entries are emitted regardless.
1283 		 */
1284 		cfg.vvcc_flags = vm_cfg.vcc_flags;
1285 		cfg.vvcc_nent = vm_cfg.vcc_nent;
1286 		if (entries != NULL) {
1287 			if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries,
1288 			    entries_size, md) != 0) {
1289 				error = EFAULT;
1290 			}
1291 
1292 			kmem_free(entries, entries_size);
1293 		}
1294 
1295 		if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) {
1296 			error = EFAULT;
1297 		}
1298 		break;
1299 	}
1300 	case VM_SET_CPUID: {
1301 		struct vm_vcpu_cpuid_config cfg;
1302 		struct vcpu_cpuid_entry *entries = NULL;
1303 		size_t entries_size = 0;
1304 
1305 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1306 			error = EFAULT;
1307 			break;
1308 		}
1309 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1310 			error = EFBIG;
1311 			break;
1312 		}
1313 		if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) {
1314 			/*
1315 			 * If we are being instructed to use "legacy" handling,
1316 			 * then no entries should be provided, since the static
1317 			 * in-kernel masking will be used.
1318 			 */
1319 			if (cfg.vvcc_nent != 0) {
1320 				error = EINVAL;
1321 				break;
1322 			}
1323 		} else if (cfg.vvcc_nent != 0) {
1324 			entries_size =
1325 			    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1326 			entries = kmem_alloc(entries_size, KM_SLEEP);
1327 
1328 			if (ddi_copyin(cfg.vvcc_entries, entries, entries_size,
1329 			    md) != 0) {
1330 				error = EFAULT;
1331 				kmem_free(entries, entries_size);
1332 				break;
1333 			}
1334 		}
1335 
1336 		vcpu_cpuid_config_t vm_cfg = {
1337 			.vcc_flags = cfg.vvcc_flags,
1338 			.vcc_nent = cfg.vvcc_nent,
1339 			.vcc_entries = entries,
1340 		};
1341 		error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1342 
1343 		if (entries != NULL) {
1344 			kmem_free(entries, entries_size);
1345 		}
1346 		break;
1347 	}
1348 	case VM_LEGACY_CPUID: {
1349 		struct vm_legacy_cpuid vlc;
1350 		if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) {
1351 			error = EFAULT;
1352 			break;
1353 		}
1354 		vlc.vlc_vcpuid = vcpu;
1355 
1356 		legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax,
1357 		    &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx);
1358 
1359 		if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) {
1360 			error = EFAULT;
1361 			break;
1362 		}
1363 		break;
1364 	}
1365 
1366 	case VM_SET_KERNEMU_DEV:
1367 	case VM_GET_KERNEMU_DEV: {
1368 		struct vm_readwrite_kernemu_device kemu;
1369 		size_t size = 0;
1370 
1371 		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1372 			error = EFAULT;
1373 			break;
1374 		}
1375 
1376 		if (kemu.access_width > 3) {
1377 			error = EINVAL;
1378 			break;
1379 		}
1380 		size = (1 << kemu.access_width);
1381 		ASSERT(size >= 1 && size <= 8);
1382 
1383 		if (cmd == VM_SET_KERNEMU_DEV) {
1384 			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1385 			    kemu.gpa, kemu.value, size);
1386 		} else {
1387 			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1388 			    kemu.gpa, &kemu.value, size);
1389 		}
1390 
1391 		if (error == 0) {
1392 			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1393 				error = EFAULT;
1394 				break;
1395 			}
1396 		}
1397 		break;
1398 	}
1399 
1400 	case VM_GET_CAPABILITY: {
1401 		struct vm_capability vmcap;
1402 
1403 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1404 			error = EFAULT;
1405 			break;
1406 		}
1407 		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1408 		    &vmcap.capval);
1409 		if (error == 0 &&
1410 		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1411 			error = EFAULT;
1412 			break;
1413 		}
1414 		break;
1415 	}
1416 	case VM_SET_CAPABILITY: {
1417 		struct vm_capability vmcap;
1418 
1419 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1420 			error = EFAULT;
1421 			break;
1422 		}
1423 		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1424 		    vmcap.capval);
1425 		break;
1426 	}
1427 	case VM_SET_X2APIC_STATE: {
1428 		struct vm_x2apic x2apic;
1429 
1430 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1431 			error = EFAULT;
1432 			break;
1433 		}
1434 		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1435 		break;
1436 	}
1437 	case VM_GET_X2APIC_STATE: {
1438 		struct vm_x2apic x2apic;
1439 
1440 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1441 			error = EFAULT;
1442 			break;
1443 		}
1444 		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1445 		    &x2apic.state);
1446 		if (error == 0 &&
1447 		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1448 			error = EFAULT;
1449 			break;
1450 		}
1451 		break;
1452 	}
1453 	case VM_GET_GPA_PMAP: {
1454 		/*
1455 		 * Until there is a necessity to leak EPT/RVI PTE values to
1456 		 * userspace, this will remain unimplemented
1457 		 */
1458 		error = EINVAL;
1459 		break;
1460 	}
1461 	case VM_GET_HPET_CAPABILITIES: {
1462 		struct vm_hpet_cap hpetcap;
1463 
1464 		error = vhpet_getcap(&hpetcap);
1465 		if (error == 0 &&
1466 		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1467 			error = EFAULT;
1468 			break;
1469 		}
1470 		break;
1471 	}
1472 	case VM_GLA2GPA: {
1473 		struct vm_gla2gpa gg;
1474 
1475 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1476 			error = EFAULT;
1477 			break;
1478 		}
1479 		gg.vcpuid = vcpu;
1480 		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1481 		    gg.prot, &gg.gpa, &gg.fault);
1482 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1483 			error = EFAULT;
1484 			break;
1485 		}
1486 		break;
1487 	}
1488 	case VM_GLA2GPA_NOFAULT: {
1489 		struct vm_gla2gpa gg;
1490 
1491 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1492 			error = EFAULT;
1493 			break;
1494 		}
1495 		gg.vcpuid = vcpu;
1496 		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1497 		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
1498 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1499 			error = EFAULT;
1500 			break;
1501 		}
1502 		break;
1503 	}
1504 
1505 	case VM_ACTIVATE_CPU:
1506 		error = vm_activate_cpu(sc->vmm_vm, vcpu);
1507 		break;
1508 
1509 	case VM_SUSPEND_CPU:
1510 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1511 			error = EFAULT;
1512 		} else {
1513 			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1514 		}
1515 		break;
1516 
1517 	case VM_RESUME_CPU:
1518 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1519 			error = EFAULT;
1520 		} else {
1521 			error = vm_resume_cpu(sc->vmm_vm, vcpu);
1522 		}
1523 		break;
1524 
1525 	case VM_VCPU_BARRIER:
1526 		vcpu = arg;
1527 		error = vm_vcpu_barrier(sc->vmm_vm, vcpu);
1528 		break;
1529 
1530 	case VM_GET_CPUS: {
1531 		struct vm_cpuset vm_cpuset;
1532 		cpuset_t tempset;
1533 		void *srcp = &tempset;
1534 		int size;
1535 
1536 		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1537 			error = EFAULT;
1538 			break;
1539 		}
1540 
1541 		/* Be more generous about sizing since our cpuset_t is large. */
1542 		size = vm_cpuset.cpusetsize;
1543 		if (size <= 0 || size > sizeof (cpuset_t)) {
1544 			error = ERANGE;
1545 		}
1546 		/*
1547 		 * If they want a ulong_t or less, make sure they receive the
1548 		 * low bits with all the useful information.
1549 		 */
1550 		if (size <= sizeof (tempset.cpub[0])) {
1551 			srcp = &tempset.cpub[0];
1552 		}
1553 
1554 		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1555 			tempset = vm_active_cpus(sc->vmm_vm);
1556 		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1557 			tempset = vm_debug_cpus(sc->vmm_vm);
1558 		} else {
1559 			error = EINVAL;
1560 		}
1561 
1562 		ASSERT(size > 0 && size <= sizeof (tempset));
1563 		if (error == 0 &&
1564 		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1565 			error = EFAULT;
1566 			break;
1567 		}
1568 		break;
1569 	}
1570 	case VM_SET_INTINFO: {
1571 		struct vm_intinfo vmii;
1572 
1573 		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1574 			error = EFAULT;
1575 			break;
1576 		}
1577 		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1578 		break;
1579 	}
1580 	case VM_GET_INTINFO: {
1581 		struct vm_intinfo vmii;
1582 
1583 		vmii.vcpuid = vcpu;
1584 		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1585 		    &vmii.info2);
1586 		if (error == 0 &&
1587 		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1588 			error = EFAULT;
1589 			break;
1590 		}
1591 		break;
1592 	}
1593 	case VM_RTC_WRITE: {
1594 		struct vm_rtc_data rtcdata;
1595 
1596 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1597 			error = EFAULT;
1598 			break;
1599 		}
1600 		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1601 		    rtcdata.value);
1602 		break;
1603 	}
1604 	case VM_RTC_READ: {
1605 		struct vm_rtc_data rtcdata;
1606 
1607 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1608 			error = EFAULT;
1609 			break;
1610 		}
1611 		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1612 		    &rtcdata.value);
1613 		if (error == 0 &&
1614 		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1615 			error = EFAULT;
1616 			break;
1617 		}
1618 		break;
1619 	}
1620 	case VM_RTC_SETTIME: {
1621 		timespec_t ts;
1622 
1623 		if (ddi_copyin(datap, &ts, sizeof (ts), md)) {
1624 			error = EFAULT;
1625 			break;
1626 		}
1627 		error = vrtc_set_time(sc->vmm_vm, &ts);
1628 		break;
1629 	}
1630 	case VM_RTC_GETTIME: {
1631 		timespec_t ts;
1632 
1633 		vrtc_get_time(sc->vmm_vm, &ts);
1634 		if (ddi_copyout(&ts, datap, sizeof (ts), md)) {
1635 			error = EFAULT;
1636 			break;
1637 		}
1638 		break;
1639 	}
1640 
1641 	case VM_PMTMR_LOCATE: {
1642 		uint16_t port = arg;
1643 		error = vpmtmr_set_location(sc->vmm_vm, port);
1644 		break;
1645 	}
1646 
1647 	case VM_RESTART_INSTRUCTION:
1648 		error = vm_restart_instruction(sc->vmm_vm, vcpu);
1649 		break;
1650 
1651 	case VM_SET_TOPOLOGY: {
1652 		struct vm_cpu_topology topo;
1653 
1654 		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1655 			error = EFAULT;
1656 			break;
1657 		}
1658 		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1659 		    topo.threads, topo.maxcpus);
1660 		break;
1661 	}
1662 	case VM_GET_TOPOLOGY: {
1663 		struct vm_cpu_topology topo;
1664 
1665 		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1666 		    &topo.threads, &topo.maxcpus);
1667 		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1668 			error = EFAULT;
1669 			break;
1670 		}
1671 		break;
1672 	}
1673 	case VM_DEVMEM_GETOFFSET: {
1674 		struct vm_devmem_offset vdo;
1675 		vmm_devmem_entry_t *de;
1676 
1677 		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1678 			error = EFAULT;
1679 			break;
1680 		}
1681 
1682 		de = vmmdev_devmem_find(sc, vdo.segid);
1683 		if (de != NULL) {
1684 			vdo.offset = de->vde_off;
1685 			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1686 				error = EFAULT;
1687 			}
1688 		} else {
1689 			error = ENOENT;
1690 		}
1691 		break;
1692 	}
1693 	case VM_TRACK_DIRTY_PAGES: {
1694 		const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
1695 		struct vmm_dirty_tracker tracker;
1696 		uint8_t *bitmap;
1697 		size_t len;
1698 
1699 		if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
1700 			error = EFAULT;
1701 			break;
1702 		}
1703 		if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
1704 			error = EINVAL;
1705 			break;
1706 		}
1707 		if (tracker.vdt_len == 0) {
1708 			break;
1709 		}
1710 		if ((tracker.vdt_len & PAGEOFFSET) != 0) {
1711 			error = EINVAL;
1712 			break;
1713 		}
1714 		if (tracker.vdt_len > max_track_region_len) {
1715 			error = EINVAL;
1716 			break;
1717 		}
1718 		len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8;
1719 		bitmap = kmem_zalloc(len, KM_SLEEP);
1720 		error = vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa,
1721 		    tracker.vdt_len, bitmap);
1722 		if (error == 0 &&
1723 		    ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) {
1724 			error = EFAULT;
1725 		}
1726 		kmem_free(bitmap, len);
1727 
1728 		break;
1729 	}
1730 	case VM_NPT_OPERATION: {
1731 		struct vm_npt_operation vno;
1732 		uint8_t *bitmap = NULL;
1733 		uint64_t bitmap_size = 0;
1734 
1735 		if (ddi_copyin(datap, &vno, sizeof (vno), md) != 0) {
1736 			error = EFAULT;
1737 			break;
1738 		}
1739 		if ((vno.vno_gpa & PAGEOFFSET) != 0 ||
1740 		    (vno.vno_len & PAGEOFFSET) != 0) {
1741 			error = EINVAL;
1742 			break;
1743 		}
1744 		if ((UINT64_MAX - vno.vno_len) < vno.vno_gpa) {
1745 			error = EOVERFLOW;
1746 			break;
1747 		}
1748 
1749 		/*
1750 		 * Allocate a bitmap for the operation if it is specified as
1751 		 * part of the input or output.
1752 		 */
1753 		if ((vno.vno_operation &
1754 		    (VNO_FLAG_BITMAP_IN | VNO_FLAG_BITMAP_OUT)) != 0) {
1755 			/*
1756 			 * Operations expecting data to be copied in or out
1757 			 * should not have zero length.
1758 			 */
1759 			if (vno.vno_len == 0) {
1760 				error = EINVAL;
1761 				break;
1762 			}
1763 
1764 			/*
1765 			 * Maximum bitmap size of 8 pages results in 1 GiB of
1766 			 * coverage.
1767 			 */
1768 			const uint64_t max_bitmap_size = 8 * PAGESIZE;
1769 
1770 			bitmap_size = roundup(vno.vno_len / PAGESIZE, 8) / 8;
1771 			if (bitmap_size > max_bitmap_size) {
1772 				error = E2BIG;
1773 				break;
1774 			}
1775 			bitmap = kmem_zalloc(bitmap_size, KM_SLEEP);
1776 		}
1777 
1778 		if ((vno.vno_operation & VNO_FLAG_BITMAP_IN) != 0) {
1779 			ASSERT(bitmap != NULL);
1780 			if (ddi_copyin(vno.vno_bitmap, bitmap, bitmap_size,
1781 			    md) != 0) {
1782 				error = EFAULT;
1783 			}
1784 		}
1785 
1786 		if (error == 0) {
1787 			error = vm_npt_do_operation(sc->vmm_vm, vno.vno_gpa,
1788 			    vno.vno_len, vno.vno_operation, bitmap, rvalp);
1789 		}
1790 
1791 		if ((vno.vno_operation & VNO_FLAG_BITMAP_OUT) != 0 &&
1792 		    error == 0) {
1793 			ASSERT(bitmap != NULL);
1794 			if (ddi_copyout(bitmap, vno.vno_bitmap, bitmap_size,
1795 			    md) != 0) {
1796 				error = EFAULT;
1797 			}
1798 		}
1799 
1800 		if (bitmap != NULL) {
1801 			kmem_free(bitmap, bitmap_size);
1802 		}
1803 
1804 		break;
1805 	}
1806 	case VM_WRLOCK_CYCLE: {
1807 		/*
1808 		 * Present a test mechanism to acquire/release the write lock
1809 		 * on the VM without any other effects.
1810 		 */
1811 		break;
1812 	}
1813 	case VM_DATA_READ: {
1814 		struct vm_data_xfer vdx;
1815 
1816 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1817 			error = EFAULT;
1818 			break;
1819 		}
1820 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1821 			error = EINVAL;
1822 			break;
1823 		}
1824 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1825 			error = EFBIG;
1826 			break;
1827 		}
1828 
1829 		const size_t len = vdx.vdx_len;
1830 		void *buf = NULL;
1831 		if (len != 0) {
1832 			const void *udata = vdx.vdx_data;
1833 
1834 			buf = kmem_alloc(len, KM_SLEEP);
1835 			if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) == 0) {
1836 				bzero(buf, len);
1837 			} else if (ddi_copyin(udata, buf, len, md) != 0) {
1838 				kmem_free(buf, len);
1839 				error = EFAULT;
1840 				break;
1841 			}
1842 		}
1843 
1844 		vdx.vdx_result_len = 0;
1845 		vmm_data_req_t req = {
1846 			.vdr_class = vdx.vdx_class,
1847 			.vdr_version = vdx.vdx_version,
1848 			.vdr_flags = vdx.vdx_flags,
1849 			.vdr_len = len,
1850 			.vdr_data = buf,
1851 			.vdr_result_len = &vdx.vdx_result_len,
1852 			.vdr_vcpuid = vdx.vdx_vcpuid,
1853 		};
1854 		error = vmm_data_read(sc->vmm_vm, &req);
1855 
1856 		if (error == 0 && buf != NULL) {
1857 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1858 				error = EFAULT;
1859 			}
1860 		}
1861 
1862 		/*
1863 		 * Copy out the transfer request so that the value of
1864 		 * vdx_result_len can be made available, regardless of any
1865 		 * error(s) which may have occurred.
1866 		 */
1867 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1868 			error = (error != 0) ? error : EFAULT;
1869 		}
1870 
1871 		if (buf != NULL) {
1872 			kmem_free(buf, len);
1873 		}
1874 		break;
1875 	}
1876 	case VM_DATA_WRITE: {
1877 		struct vm_data_xfer vdx;
1878 
1879 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1880 			error = EFAULT;
1881 			break;
1882 		}
1883 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1884 			error = EINVAL;
1885 			break;
1886 		}
1887 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1888 			error = EFBIG;
1889 			break;
1890 		}
1891 
1892 		const size_t len = vdx.vdx_len;
1893 		void *buf = NULL;
1894 		if (len != 0) {
1895 			buf = kmem_alloc(len, KM_SLEEP);
1896 			if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
1897 				kmem_free(buf, len);
1898 				error = EFAULT;
1899 				break;
1900 			}
1901 		}
1902 
1903 		vdx.vdx_result_len = 0;
1904 		vmm_data_req_t req = {
1905 			.vdr_class = vdx.vdx_class,
1906 			.vdr_version = vdx.vdx_version,
1907 			.vdr_flags = vdx.vdx_flags,
1908 			.vdr_len = len,
1909 			.vdr_data = buf,
1910 			.vdr_result_len = &vdx.vdx_result_len,
1911 			.vdr_vcpuid = vdx.vdx_vcpuid,
1912 		};
1913 		if (vmm_allow_state_writes != 0) {
1914 			error = vmm_data_write(sc->vmm_vm, &req);
1915 		} else {
1916 			/*
			 * Reject the write if someone has thrown the switch
			 * back into the "disallow" position.
1919 			 */
1920 			error = EPERM;
1921 		}
1922 
1923 		if (error == 0 && buf != NULL &&
1924 		    (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
1925 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1926 				error = EFAULT;
1927 			}
1928 		}
1929 
1930 		/*
1931 		 * Copy out the transfer request so that the value of
1932 		 * vdx_result_len can be made available, regardless of any
1933 		 * error(s) which may have occurred.
1934 		 */
1935 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1936 			error = (error != 0) ? error : EFAULT;
1937 		}
1938 
1939 		if (buf != NULL) {
1940 			kmem_free(buf, len);
1941 		}
1942 		break;
1943 	}
1944 
1945 	case VM_PAUSE: {
1946 		error = vm_pause_instance(sc->vmm_vm);
1947 		break;
1948 	}
1949 	case VM_RESUME: {
1950 		error = vm_resume_instance(sc->vmm_vm);
1951 		break;
1952 	}
1953 
1954 	default:
1955 		error = ENOTTY;
1956 		break;
1957 	}
1958 
1959 	/* Release exclusion resources */
1960 	switch (lock_type) {
1961 	case LOCK_NONE:
1962 		break;
1963 	case LOCK_VCPU:
1964 		vcpu_unlock_one(sc, vcpu);
1965 		break;
1966 	case LOCK_READ_HOLD:
1967 		vmm_read_unlock(sc);
1968 		break;
1969 	case LOCK_WRITE_HOLD:
1970 		vmm_write_unlock(sc);
1971 		break;
1972 	default:
1973 		panic("unexpected lock type");
1974 		break;
1975 	}
1976 
1977 	return (error);
1978 }
1979 
1980 static vmm_softc_t *
vmm_lookup(const char * name)1981 vmm_lookup(const char *name)
1982 {
1983 	list_t *vml = &vmm_list;
1984 	vmm_softc_t *sc;
1985 
1986 	ASSERT(MUTEX_HELD(&vmm_mtx));
1987 
1988 	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1989 		if (strcmp(sc->vmm_name, name) == 0) {
1990 			break;
1991 		}
1992 	}
1993 
1994 	return (sc);
1995 }
1996 
1997 /*
1998  * Acquire an HMA registration if not already held.
1999  */
2000 static boolean_t
vmm_hma_acquire(void)2001 vmm_hma_acquire(void)
2002 {
2003 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
2004 
2005 	mutex_enter(&vmmdev_mtx);
2006 
2007 	if (vmmdev_hma_reg == NULL) {
2008 		VERIFY3U(vmmdev_hma_ref, ==, 0);
2009 		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
2010 		if (vmmdev_hma_reg == NULL) {
2011 			cmn_err(CE_WARN, "%s HMA registration failed.",
2012 			    vmmdev_hvm_name);
2013 			mutex_exit(&vmmdev_mtx);
2014 			return (B_FALSE);
2015 		}
2016 	}
2017 
2018 	vmmdev_hma_ref++;
2019 
2020 	mutex_exit(&vmmdev_mtx);
2021 
2022 	return (B_TRUE);
2023 }
2024 
2025 /*
2026  * Release the HMA registration if held and there are no remaining VMs.
2027  */
2028 static void
vmm_hma_release(void)2029 vmm_hma_release(void)
2030 {
2031 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
2032 
2033 	mutex_enter(&vmmdev_mtx);
2034 
2035 	VERIFY3U(vmmdev_hma_ref, !=, 0);
2036 
2037 	vmmdev_hma_ref--;
2038 
2039 	if (vmmdev_hma_ref == 0) {
2040 		VERIFY(vmmdev_hma_reg != NULL);
2041 		hma_unregister(vmmdev_hma_reg);
2042 		vmmdev_hma_reg = NULL;
2043 	}
2044 	mutex_exit(&vmmdev_mtx);
2045 }
2046 
2047 static int
vmmdev_do_vm_create(const struct vm_create_req * req,cred_t * cr)2048 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
2049 {
2050 	vmm_softc_t	*sc = NULL;
2051 	minor_t		minor;
2052 	int		error = ENOMEM;
2053 	size_t		len;
2054 	const char	*name = req->name;
2055 
2056 	len = strnlen(name, VM_MAX_NAMELEN);
2057 	if (len == 0) {
2058 		return (EINVAL);
2059 	}
2060 	if (len >= VM_MAX_NAMELEN) {
2061 		return (ENAMETOOLONG);
2062 	}
2063 	if (strchr(name, '/') != NULL) {
2064 		return (EINVAL);
2065 	}
2066 
2067 	if (!vmm_hma_acquire())
2068 		return (ENXIO);
2069 
2070 	mutex_enter(&vmm_mtx);
2071 
2072 	/* Look for duplicate names */
2073 	if (vmm_lookup(name) != NULL) {
2074 		mutex_exit(&vmm_mtx);
2075 		vmm_hma_release();
2076 		return (EEXIST);
2077 	}
2078 
2079 	/* Allow only one instance per non-global zone. */
2080 	if (!INGLOBALZONE(curproc)) {
2081 		for (sc = list_head(&vmm_list); sc != NULL;
2082 		    sc = list_next(&vmm_list, sc)) {
2083 			if (sc->vmm_zone == curzone) {
2084 				mutex_exit(&vmm_mtx);
2085 				vmm_hma_release();
2086 				return (EINVAL);
2087 			}
2088 		}
2089 	}
2090 
2091 	minor = id_alloc(vmm_minors);
2092 	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
2093 		goto fail;
2094 	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2095 		ddi_soft_state_free(vmm_statep, minor);
2096 		goto fail;
2097 	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
2098 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
2099 		goto fail;
2100 	}
2101 
2102 	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
2103 		goto fail;
2104 	}
2105 
2106 	error = vm_create(req->flags, &sc->vmm_vm);
2107 	if (error == 0) {
2108 		/* Complete VM intialization and report success. */
2109 		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
2110 		sc->vmm_minor = minor;
2111 		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
2112 		    offsetof(vmm_devmem_entry_t, vde_node));
2113 
2114 		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
2115 		    offsetof(vmm_hold_t, vmh_node));
2116 		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
2117 
2118 		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
2119 		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
2120 		    offsetof(vmm_lease_t, vml_node));
2121 		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
2122 		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
2123 
2124 		sc->vmm_zone = crgetzone(cr);
2125 		zone_hold(sc->vmm_zone);
2126 		vmm_zsd_add_vm(sc);
2127 		vmm_kstat_init(sc);
2128 
2129 		list_insert_tail(&vmm_list, sc);
2130 		mutex_exit(&vmm_mtx);
2131 		return (0);
2132 	}
2133 
2134 	vmm_kstat_fini(sc);
2135 	ddi_remove_minor_node(vmmdev_dip, name);
2136 fail:
2137 	id_free(vmm_minors, minor);
2138 	if (sc != NULL) {
2139 		ddi_soft_state_free(vmm_statep, minor);
2140 	}
2141 	mutex_exit(&vmm_mtx);
2142 	vmm_hma_release();
2143 
2144 	return (error);
2145 }
2146 
2147 /*
2148  * Bhyve 'Driver' Interface
2149  *
2150  * While many devices are emulated in the bhyve userspace process, there are
2151  * others with performance constraints which require that they run mostly or
2152  * entirely in-kernel.  For those not integrated directly into bhyve, an API is
2153  * needed so they can query/manipulate the portions of VM state needed to
2154  * fulfill their purpose.
2155  *
2156  * This includes:
2157  * - Translating guest-physical addresses to host-virtual pointers
2158  * - Injecting MSIs
2159  * - Hooking IO port addresses
2160  *
2161  * The vmm_drv interface exists to provide that functionality to its consumers.
2162  * (At this time, 'viona' is the only user)
2163  */
2164 int
vmm_drv_hold(file_t * fp,cred_t * cr,vmm_hold_t ** holdp)2165 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
2166 {
2167 	vnode_t *vp = fp->f_vnode;
2168 	const dev_t dev = vp->v_rdev;
2169 	vmm_softc_t *sc;
2170 	vmm_hold_t *hold;
2171 	int err = 0;
2172 
2173 	if (vp->v_type != VCHR) {
2174 		return (ENXIO);
2175 	}
2176 	const major_t major = getmajor(dev);
2177 	const minor_t minor = getminor(dev);
2178 
2179 	mutex_enter(&vmmdev_mtx);
2180 	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
2181 		mutex_exit(&vmmdev_mtx);
2182 		return (ENOENT);
2183 	}
2184 	mutex_enter(&vmm_mtx);
2185 	mutex_exit(&vmmdev_mtx);
2186 
2187 	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2188 		err = ENOENT;
2189 		goto out;
2190 	}
2191 	/* XXXJOY: check cred permissions against instance */
2192 
2193 	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2194 		err = EBUSY;
2195 		goto out;
2196 	}
2197 
2198 	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
2199 	hold->vmh_sc = sc;
2200 	hold->vmh_release_req = B_FALSE;
2201 
2202 	list_insert_tail(&sc->vmm_holds, hold);
2203 	sc->vmm_flags |= VMM_HELD;
2204 	*holdp = hold;
2205 
2206 out:
2207 	mutex_exit(&vmm_mtx);
2208 	return (err);
2209 }
2210 
2211 void
vmm_drv_rele(vmm_hold_t * hold)2212 vmm_drv_rele(vmm_hold_t *hold)
2213 {
2214 	vmm_softc_t *sc;
2215 	bool hma_release = false;
2216 
2217 	ASSERT(hold != NULL);
2218 	ASSERT(hold->vmh_sc != NULL);
2219 	VERIFY(hold->vmh_ioport_hook_cnt == 0);
2220 	VERIFY(hold->vmh_mmio_hook_cnt == 0);
2221 
2222 	mutex_enter(&vmm_mtx);
2223 	sc = hold->vmh_sc;
2224 	list_remove(&sc->vmm_holds, hold);
2225 	kmem_free(hold, sizeof (*hold));
2226 
2227 	if (list_is_empty(&sc->vmm_holds)) {
2228 		sc->vmm_flags &= ~VMM_HELD;
2229 
2230 		/*
2231 		 * Since outstanding holds would prevent instance destruction
2232 		 * from completing, attempt to finish it now if it was already
2233 		 * set in motion.
2234 		 */
2235 		if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2236 			VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT,
2237 			    &hma_release));
2238 		}
2239 	}
2240 	mutex_exit(&vmm_mtx);
2241 
2242 	if (hma_release) {
2243 		vmm_hma_release();
2244 	}
2245 }
2246 
2247 boolean_t
vmm_drv_release_reqd(vmm_hold_t * hold)2248 vmm_drv_release_reqd(vmm_hold_t *hold)
2249 {
2250 	ASSERT(hold != NULL);
2251 
2252 	return (hold->vmh_release_req);
2253 }
2254 
/*
 * Sign a lease against the instance referenced by `hold`.
 *
 * `expiref` (which must not be NULL) and `arg` are recorded in the lease;
 * vmm_lease_block() calls expiref(arg) when it expires the lease.
 *
 * Returns NULL if release of the hold has already been requested.  Otherwise
 * this waits until no lease blockers are active and returns a new lease, which
 * has been appended to the instance lease list and for which the VM read lock
 * has been taken via vmm_read_lock().
 */
vmm_lease_t *
vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
{
	vmm_softc_t *sc = hold->vmh_sc;
	vmm_lease_t *lease;

	ASSERT3P(expiref, !=, NULL);

	if (hold->vmh_release_req) {
		return (NULL);
	}

	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
	list_link_init(&lease->vml_node);
	lease->vml_expire_func = expiref;
	lease->vml_expire_arg = arg;
	lease->vml_expired = B_FALSE;
	lease->vml_break_deferred = B_FALSE;
	lease->vml_hold = hold;
	/* cache the VM pointer for one less pointer chase */
	lease->vml_vm = sc->vmm_vm;
	lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));

	mutex_enter(&sc->vmm_lease_lock);
	/* New leases may not be signed while any lease block is in effect */
	while (sc->vmm_lease_blocker != 0) {
		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
	}
	list_insert_tail(&sc->vmm_lease_list, lease);
	/* Released again by vmm_lease_break_locked() when the lease is broken */
	vmm_read_lock(sc);
	mutex_exit(&sc->vmm_lease_lock);

	return (lease);
}
2288 
/*
 * Tear down a lease.  The caller must hold vmm_lease_lock.
 *
 * The lease is unlinked from the instance lease list, the VM read lock taken
 * for it in vmm_drv_lease_sign() is dropped, its vmspace client is destroyed,
 * and the lease itself is freed.  The lease pointer is invalid on return.
 */
static void
vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
{
	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));

	list_remove(&sc->vmm_lease_list, lease);
	vmm_read_unlock(sc);
	vmc_destroy(lease->vml_vmclient);
	kmem_free(lease, sizeof (*lease));
}
2299 
/*
 * Block the signing of new leases on an instance and wait until every existing
 * lease has been broken.
 *
 * Each call increments vmm_lease_blocker; vmm_lease_unblock() decrements it
 * again.  The caller which raises the count from zero expires every
 * outstanding lease and breaks it, either immediately (when the expire
 * callback returns B_TRUE) or later, once the lessee has called
 * vmm_drv_lease_break().  Any other caller simply waits for the lease list to
 * become empty.
 */
static void
vmm_lease_block(vmm_softc_t *sc)
{
	mutex_enter(&sc->vmm_lease_lock);
	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
	sc->vmm_lease_blocker++;
	if (sc->vmm_lease_blocker == 1) {
		list_t *list = &sc->vmm_lease_list;
		vmm_lease_t *lease = list_head(list);

		while (lease != NULL) {
			void *arg = lease->vml_expire_arg;
			boolean_t (*expiref)(void *) = lease->vml_expire_func;
			boolean_t sync_break = B_FALSE;

			/*
			 * Since the lease expiration notification may
			 * need to take locks which would deadlock with
			 * vmm_lease_lock, drop it across the call.
			 *
			 * We are the only one allowed to manipulate
			 * vmm_lease_list right now, so it is safe to
			 * continue iterating through it after
			 * reacquiring the lock.
			 */
			lease->vml_expired = B_TRUE;
			mutex_exit(&sc->vmm_lease_lock);
			sync_break = expiref(arg);
			mutex_enter(&sc->vmm_lease_lock);

			if (sync_break) {
				vmm_lease_t *next;

				/*
				 * These leases which are synchronously broken
				 * result in vmm_read_unlock() calls from a
				 * different thread than the corresponding
				 * vmm_read_lock().  This is acceptable, given
				 * that the rwlock underpinning the whole
				 * mechanism tolerates the behavior.  This
				 * flexibility is _only_ afforded to VM read
				 * lock (RW_READER) holders.
				 */
				next = list_next(list, lease);
				vmm_lease_break_locked(sc, lease);
				lease = next;
			} else {
				lease = list_next(list, lease);
			}
		}

		/* Process leases which were not broken synchronously. */
		while (!list_is_empty(list)) {
			/*
			 * Although the nested loops are quadratic, the number
			 * of leases is small.
			 */
			lease = list_head(list);
			while (lease != NULL) {
				/* Fetch the successor first: a break frees `lease` */
				vmm_lease_t *next = list_next(list, lease);
				if (lease->vml_break_deferred) {
					vmm_lease_break_locked(sc, lease);
				}
				lease = next;
			}
			if (list_is_empty(list)) {
				break;
			}
			/*
			 * vmm_drv_lease_break() broadcasts on vmm_lease_cv
			 * after marking a lease as deferred.
			 */
			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
		}
		/* Wake anyone else waiting for the lease list to be empty */
		cv_broadcast(&sc->vmm_lease_cv);
	} else {
		list_t *list = &sc->vmm_lease_list;

		/*
		 * Some other thread beat us to the duty of lease cleanup.
		 * Wait until that is complete.
		 */
		while (!list_is_empty(list)) {
			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
		}
	}
	mutex_exit(&sc->vmm_lease_lock);
}
2385 
2386 static void
vmm_lease_unblock(vmm_softc_t * sc)2387 vmm_lease_unblock(vmm_softc_t *sc)
2388 {
2389 	mutex_enter(&sc->vmm_lease_lock);
2390 	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
2391 	sc->vmm_lease_blocker--;
2392 	if (sc->vmm_lease_blocker == 0) {
2393 		cv_broadcast(&sc->vmm_lease_cv);
2394 	}
2395 	mutex_exit(&sc->vmm_lease_lock);
2396 }
2397 
2398 void
vmm_drv_lease_break(vmm_hold_t * hold,vmm_lease_t * lease)2399 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
2400 {
2401 	vmm_softc_t *sc = hold->vmh_sc;
2402 
2403 	VERIFY3P(hold, ==, lease->vml_hold);
2404 	VERIFY(!lease->vml_break_deferred);
2405 
2406 	mutex_enter(&sc->vmm_lease_lock);
2407 	if (sc->vmm_lease_blocker == 0) {
2408 		vmm_lease_break_locked(sc, lease);
2409 	} else {
2410 		/*
2411 		 * Defer the lease-breaking to whichever thread is currently
2412 		 * cleaning up all leases as part of a vmm_lease_block() call.
2413 		 */
2414 		lease->vml_break_deferred = B_TRUE;
2415 		cv_broadcast(&sc->vmm_lease_cv);
2416 	}
2417 	mutex_exit(&sc->vmm_lease_lock);
2418 }
2419 
2420 boolean_t
vmm_drv_lease_expired(vmm_lease_t * lease)2421 vmm_drv_lease_expired(vmm_lease_t *lease)
2422 {
2423 	return (lease->vml_expired);
2424 }
2425 
2426 vmm_page_t *
vmm_drv_page_hold(vmm_lease_t * lease,uintptr_t gpa,int prot)2427 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
2428 {
2429 	ASSERT(lease != NULL);
2430 	ASSERT0(gpa & PAGEOFFSET);
2431 
2432 	return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
2433 }
2434 
2435 
/*
 * Ensure that flags mirrored by vmm_drv interface properly match up.
 * vmm_drv_page_hold_ext() hands its `flags` argument to vmc_hold_ext() without
 * translation, so the two definitions must have the same value.
 */
CTASSERT(VMPF_DEFER_DIRTY == VPF_DEFER_DIRTY);
2438 
2439 vmm_page_t *
vmm_drv_page_hold_ext(vmm_lease_t * lease,uintptr_t gpa,int prot,int flags)2440 vmm_drv_page_hold_ext(vmm_lease_t *lease, uintptr_t gpa, int prot, int flags)
2441 {
2442 	ASSERT(lease != NULL);
2443 	ASSERT0(gpa & PAGEOFFSET);
2444 
2445 	vmm_page_t *page =
2446 	    (vmm_page_t *)vmc_hold_ext(lease->vml_vmclient, gpa, prot, flags);
2447 	return (page);
2448 }
2449 
2450 void
vmm_drv_page_release(vmm_page_t * vmmp)2451 vmm_drv_page_release(vmm_page_t *vmmp)
2452 {
2453 	(void) vmp_release((vm_page_t *)vmmp);
2454 }
2455 
2456 void
vmm_drv_page_release_chain(vmm_page_t * vmmp)2457 vmm_drv_page_release_chain(vmm_page_t *vmmp)
2458 {
2459 	(void) vmp_release_chain((vm_page_t *)vmmp);
2460 }
2461 
2462 const void *
vmm_drv_page_readable(const vmm_page_t * vmmp)2463 vmm_drv_page_readable(const vmm_page_t *vmmp)
2464 {
2465 	return (vmp_get_readable((const vm_page_t *)vmmp));
2466 }
2467 
2468 void *
vmm_drv_page_writable(const vmm_page_t * vmmp)2469 vmm_drv_page_writable(const vmm_page_t *vmmp)
2470 {
2471 	return (vmp_get_writable((const vm_page_t *)vmmp));
2472 }
2473 
2474 void
vmm_drv_page_mark_dirty(vmm_page_t * vmmp)2475 vmm_drv_page_mark_dirty(vmm_page_t *vmmp)
2476 {
2477 	return (vmp_mark_dirty((vm_page_t *)vmmp));
2478 }
2479 
2480 void
vmm_drv_page_chain(vmm_page_t * vmmp,vmm_page_t * to_chain)2481 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
2482 {
2483 	vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
2484 }
2485 
2486 vmm_page_t *
vmm_drv_page_next(const vmm_page_t * vmmp)2487 vmm_drv_page_next(const vmm_page_t *vmmp)
2488 {
2489 	return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
2490 }
2491 
2492 int
vmm_drv_msi(vmm_lease_t * lease,uint64_t addr,uint64_t msg)2493 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
2494 {
2495 	ASSERT(lease != NULL);
2496 
2497 	return (lapic_intr_msi(lease->vml_vm, addr, msg));
2498 }
2499 
2500 int
vmm_drv_ioport_hook(vmm_hold_t * hold,uint16_t ioport,vmm_drv_iop_cb_t func,void * arg,void ** cookie)2501 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
2502     void *arg, void **cookie)
2503 {
2504 	vmm_softc_t *sc;
2505 	int err;
2506 
2507 	ASSERT(hold != NULL);
2508 	ASSERT(cookie != NULL);
2509 
2510 	sc = hold->vmh_sc;
2511 	mutex_enter(&vmm_mtx);
2512 	/* Confirm that hook installation is not blocked */
2513 	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2514 		mutex_exit(&vmm_mtx);
2515 		return (EBUSY);
2516 	}
2517 	/*
2518 	 * Optimistically record an installed hook which will prevent a block
2519 	 * from being asserted while the mutex is dropped.
2520 	 */
2521 	if (hold->vmh_ioport_hook_cnt == UINT_MAX) {
2522 		mutex_exit(&vmm_mtx);
2523 		return (ENOSPC);
2524 	}
2525 	hold->vmh_ioport_hook_cnt++;
2526 	mutex_exit(&vmm_mtx);
2527 
2528 	vmm_write_lock(sc);
2529 	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
2530 	    arg, cookie);
2531 	vmm_write_unlock(sc);
2532 
2533 	if (err != 0) {
2534 		mutex_enter(&vmm_mtx);
2535 		/* Walk back optimism about the hook installation */
2536 		hold->vmh_ioport_hook_cnt--;
2537 		mutex_exit(&vmm_mtx);
2538 	}
2539 	return (err);
2540 }
2541 
2542 void
vmm_drv_ioport_unhook(vmm_hold_t * hold,void ** cookie)2543 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
2544 {
2545 	vmm_softc_t *sc;
2546 
2547 	ASSERT(hold != NULL);
2548 	ASSERT(cookie != NULL);
2549 	ASSERT(hold->vmh_ioport_hook_cnt != 0);
2550 
2551 	sc = hold->vmh_sc;
2552 	vmm_write_lock(sc);
2553 	vm_ioport_unhook(sc->vmm_vm, cookie);
2554 	vmm_write_unlock(sc);
2555 
2556 	mutex_enter(&vmm_mtx);
2557 	hold->vmh_ioport_hook_cnt--;
2558 	mutex_exit(&vmm_mtx);
2559 }
2560 
2561 int
vmm_drv_mmio_hook(vmm_hold_t * hold,uint64_t address,uint32_t size,vmm_drv_mmio_cb_t func,void * arg,void ** cookie)2562 vmm_drv_mmio_hook(vmm_hold_t *hold, uint64_t address, uint32_t size,
2563     vmm_drv_mmio_cb_t func, void *arg, void **cookie)
2564 {
2565 	vmm_softc_t *sc;
2566 	int err;
2567 
2568 	ASSERT(hold != NULL);
2569 	ASSERT(cookie != NULL);
2570 
2571 	if (UINT64_MAX - size < address)
2572 		return (EOVERFLOW);
2573 
2574 	sc = hold->vmh_sc;
2575 	mutex_enter(&vmm_mtx);
2576 	/* Confirm that hook installation is not blocked */
2577 	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2578 		mutex_exit(&vmm_mtx);
2579 		return (EBUSY);
2580 	}
2581 	/*
2582 	 * Optimistically record an installed hook which will prevent a block
2583 	 * from being asserted while the mutex is dropped.
2584 	 */
2585 	if (hold->vmh_mmio_hook_cnt == UINT_MAX) {
2586 		mutex_exit(&vmm_mtx);
2587 		return (ENOSPC);
2588 	}
2589 	hold->vmh_mmio_hook_cnt++;
2590 	mutex_exit(&vmm_mtx);
2591 
2592 	vmm_write_lock(sc);
2593 	err = vm_mmio_hook(sc->vmm_vm, address, size, (mmio_handler_t)func,
2594 	    arg, cookie);
2595 	vmm_write_unlock(sc);
2596 
2597 	if (err != 0) {
2598 		mutex_enter(&vmm_mtx);
2599 		/* Walk back optimism about the hook installation */
2600 		hold->vmh_mmio_hook_cnt--;
2601 		mutex_exit(&vmm_mtx);
2602 	}
2603 	return (err);
2604 }
2605 
2606 int
vmm_drv_mmio_unhook(vmm_hold_t * hold,void ** cookie)2607 vmm_drv_mmio_unhook(vmm_hold_t *hold, void **cookie)
2608 {
2609 	vmm_softc_t *sc;
2610 	int ret;
2611 
2612 	ASSERT(hold != NULL);
2613 	ASSERT(cookie != NULL);
2614 	ASSERT(hold->vmh_mmio_hook_cnt != 0);
2615 
2616 	sc = hold->vmh_sc;
2617 	vmm_write_lock(sc);
2618 	ret = vm_mmio_unhook(sc->vmm_vm, cookie);
2619 	vmm_write_unlock(sc);
2620 
2621 	if (ret == 0) {
2622 		mutex_enter(&vmm_mtx);
2623 		hold->vmh_mmio_hook_cnt--;
2624 		mutex_exit(&vmm_mtx);
2625 	}
2626 
2627 	return (ret);
2628 }
2629 
2630 static void
vmm_drv_purge(vmm_softc_t * sc)2631 vmm_drv_purge(vmm_softc_t *sc)
2632 {
2633 	ASSERT(MUTEX_HELD(&vmm_mtx));
2634 
2635 	if ((sc->vmm_flags & VMM_HELD) != 0) {
2636 		vmm_hold_t *hold;
2637 
2638 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2639 		    hold = list_next(&sc->vmm_holds, hold)) {
2640 			hold->vmh_release_req = B_TRUE;
2641 		}
2642 
2643 		/*
2644 		 * Require that all leases on the instance be broken, now that
2645 		 * all associated holds have been marked as needing release.
2646 		 *
2647 		 * Dropping vmm_mtx is not strictly necessary, but if any of the
2648 		 * lessees are slow to respond, it would be nice to leave it
2649 		 * available for other parties.
2650 		 */
2651 		mutex_exit(&vmm_mtx);
2652 		vmm_lease_block(sc);
2653 		vmm_lease_unblock(sc);
2654 		mutex_enter(&vmm_mtx);
2655 	}
2656 }
2657 
2658 static int
vmm_drv_block_hook(vmm_softc_t * sc,boolean_t enable_block)2659 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
2660 {
2661 	int err = 0;
2662 
2663 	mutex_enter(&vmm_mtx);
2664 	if (!enable_block) {
2665 		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
2666 
2667 		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
2668 		goto done;
2669 	}
2670 
2671 	/* If any holds have hooks installed, the block is a failure */
2672 	if (!list_is_empty(&sc->vmm_holds)) {
2673 		vmm_hold_t *hold;
2674 
2675 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2676 		    hold = list_next(&sc->vmm_holds, hold)) {
2677 			if (hold->vmh_ioport_hook_cnt != 0 ||
2678 			    hold->vmh_mmio_hook_cnt != 0) {
2679 				err = EBUSY;
2680 				goto done;
2681 			}
2682 		}
2683 	}
2684 	sc->vmm_flags |= VMM_BLOCK_HOOK;
2685 
2686 done:
2687 	mutex_exit(&vmm_mtx);
2688 	return (err);
2689 }
2690 
2691 
/*
 * Begin destruction of an instance.
 *
 * Called with vmm_mtx held, on an instance not yet marked VMM_DESTROY.  Note
 * that the final step, vmm_drv_purge(), may drop and reacquire vmm_mtx.
 *
 * `opts` may contain VDO_NO_CLEAN_ZSD to skip removal of the instance from its
 * zone-specific data.
 */
static void
vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));
	ASSERT0(sc->vmm_flags & VMM_DESTROY);

	sc->vmm_flags |= VMM_DESTROY;

	/*
	 * Lock and unlock all of the vCPUs to ensure that they are kicked out
	 * of guest context, being unable to return now that the instance is
	 * marked for destruction.
	 */
	const int maxcpus = vm_get_maxcpus(sc->vmm_vm);
	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
		vcpu_lock_one(sc, vcpu);
		vcpu_unlock_one(sc, vcpu);
	}

	vmmdev_devmem_purge(sc);
	if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
		/*
		 * The ZSD should be cleaned up now, unless destruction of the
		 * instance was initiated by destruction of the containing
		 * zone, in which case the ZSD has already been removed.
		 */
		vmm_zsd_rem_vm(sc);
	}
	zone_rele(sc->vmm_zone);

	/* Request that vmm_drv consumers release their holds and leases */
	vmm_drv_purge(sc);
}
2724 
2725 static bool
vmm_destroy_ready(vmm_softc_t * sc)2726 vmm_destroy_ready(vmm_softc_t *sc)
2727 {
2728 	ASSERT(MUTEX_HELD(&vmm_mtx));
2729 
2730 	if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) {
2731 		VERIFY(list_is_empty(&sc->vmm_holds));
2732 		return (true);
2733 	}
2734 
2735 	return (false);
2736 }
2737 
/*
 * Complete destruction of an instance.
 *
 * Called with vmm_mtx held, once vmm_destroy_ready() is true.  The instance is
 * removed from vmm_list, its kstats and VM are torn down, its minor node is
 * removed, and finally its soft state and minor number are freed.  `sc` must
 * not be used by the caller afterwards.
 */
static void
vmm_destroy_finish(vmm_softc_t *sc)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));
	ASSERT(vmm_destroy_ready(sc));

	list_remove(&vmm_list, sc);
	vmm_kstat_fini(sc);
	vm_destroy(sc->vmm_vm);
	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
	(void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE);

	/*
	 * Copy the minor number out first: it is still needed for id_free()
	 * after the soft state has been freed.
	 */
	const minor_t minor = sc->vmm_minor;
	ddi_soft_state_free(vmm_statep, minor);
	id_free(vmm_minors, minor);
}
2754 
/*
 * Initiate or attempt to finish destruction of a VMM instance.
 *
 * This is called from several contexts:
 * - An explicit destroy ioctl is made
 * - A vmm_drv consumer releases its hold (being the last on the instance)
 * - The vmm device is closed, and auto-destruct is enabled
 *
 * Must be called with vmm_mtx held.  The mutex may be dropped and reacquired
 * along the way (by cv_wait()/cv_wait_sig() here, and by vmm_drv_purge() via
 * vmm_destroy_begin()).
 *
 * *hma_release is set to true only if destruction was completed by this call,
 * in which case `sc` has been freed.  Callers in this file respond by calling
 * vmm_hma_release() after dropping vmm_mtx.
 *
 * Returns 0, or EINTR if VDO_ATTEMPT_WAIT was specified and the wait for the
 * instance to become ready for destruction was interrupted.
 */
static int
vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
    bool *hma_release)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));

	*hma_release = false;

	/*
	 * When instance destruction begins, it is so marked such that any
	 * further requests to operate the instance will fail.
	 */
	if ((sc->vmm_flags & VMM_DESTROY) == 0) {
		vmm_destroy_begin(sc, opts);
	}

	if (vmm_destroy_ready(sc)) {

		/*
		 * Notify anyone waiting for the destruction to finish.  They
		 * must be clear before we can safely tear down the softc.
		 */
		if (sc->vmm_destroy_waiters != 0) {
			cv_broadcast(&sc->vmm_cv);
			while (sc->vmm_destroy_waiters != 0) {
				cv_wait(&sc->vmm_cv, &vmm_mtx);
			}
		}

		/*
		 * Finish destruction of instance.  After this point, the softc
		 * is freed and cannot be accessed again.
		 *
		 * With destruction complete, the HMA hold can be released
		 */
		vmm_destroy_finish(sc);
		*hma_release = true;
		return (0);
	} else if ((opts & VDO_ATTEMPT_WAIT) != 0) {
		int err = 0;

		/*
		 * Wait for the instance to become ready.  This thread does not
		 * call vmm_destroy_finish() itself; that is done by the thread
		 * which finds the instance ready in the branch above.
		 */
		sc->vmm_destroy_waiters++;
		while (!vmm_destroy_ready(sc) && err == 0) {
			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
				err = EINTR;
			}
		}
		sc->vmm_destroy_waiters--;

		if (sc->vmm_destroy_waiters == 0) {
			/*
			 * If we were the last waiter, it could be that VM
			 * destruction is waiting on _us_ to proceed with the
			 * final clean-up.
			 */
			cv_signal(&sc->vmm_cv);
		}
		return (err);
	} else {
		/*
		 * Since the instance is not ready for destruction, and the
		 * caller did not ask to wait, consider it a success for now.
		 */
		return (0);
	}
}
2829 
2830 void
vmm_zone_vm_destroy(vmm_softc_t * sc)2831 vmm_zone_vm_destroy(vmm_softc_t *sc)
2832 {
2833 	bool hma_release = false;
2834 	int err;
2835 
2836 	mutex_enter(&vmm_mtx);
2837 	err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
2838 	mutex_exit(&vmm_mtx);
2839 
2840 	VERIFY0(err);
2841 
2842 	if (hma_release) {
2843 		vmm_hma_release();
2844 	}
2845 }
2846 
2847 static int
vmmdev_do_vm_destroy(const struct vm_destroy_req * req,cred_t * cr)2848 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2849 {
2850 	vmm_softc_t *sc;
2851 	bool hma_release = false;
2852 	int err;
2853 
2854 	if (crgetuid(cr) != 0) {
2855 		return (EPERM);
2856 	}
2857 
2858 	mutex_enter(&vmm_mtx);
2859 	sc = vmm_lookup(req->name);
2860 	if (sc == NULL) {
2861 		mutex_exit(&vmm_mtx);
2862 		return (ENOENT);
2863 	}
2864 	/*
2865 	 * We don't check this in vmm_lookup() since that function is also used
2866 	 * for validation during create and currently vmm names must be unique.
2867 	 */
2868 	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2869 		mutex_exit(&vmm_mtx);
2870 		return (EPERM);
2871 	}
2872 
2873 	err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release);
2874 	mutex_exit(&vmm_mtx);
2875 
2876 	if (hma_release) {
2877 		vmm_hma_release();
2878 	}
2879 
2880 	return (err);
2881 }
2882 
2883 #define	VCPU_NAME_BUFLEN	32
2884 
2885 static int
vmm_kstat_alloc(vmm_softc_t * sc,minor_t minor,const cred_t * cr)2886 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2887 {
2888 	zoneid_t zid = crgetzoneid(cr);
2889 	int instance = minor;
2890 	kstat_t *ksp;
2891 
2892 	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2893 
2894 	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2895 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2896 	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2897 
2898 	if (ksp == NULL) {
2899 		return (-1);
2900 	}
2901 	sc->vmm_kstat_vm = ksp;
2902 
2903 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2904 		char namebuf[VCPU_NAME_BUFLEN];
2905 
2906 		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2907 
2908 		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2909 		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2910 		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2911 		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2912 		    0, zid);
2913 		if (ksp == NULL) {
2914 			goto fail;
2915 		}
2916 
2917 		sc->vmm_kstat_vcpu[i] = ksp;
2918 	}
2919 
2920 	/*
2921 	 * If this instance is associated with a non-global zone, make its
2922 	 * kstats visible from the GZ.
2923 	 */
2924 	if (zid != GLOBAL_ZONEID) {
2925 		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2926 		for (uint_t i = 0; i < VM_MAXCPU; i++) {
2927 			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2928 		}
2929 	}
2930 
2931 	return (0);
2932 
2933 fail:
2934 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2935 		if (sc->vmm_kstat_vcpu[i] != NULL) {
2936 			kstat_delete(sc->vmm_kstat_vcpu[i]);
2937 			sc->vmm_kstat_vcpu[i] = NULL;
2938 		} else {
2939 			break;
2940 		}
2941 	}
2942 	kstat_delete(sc->vmm_kstat_vm);
2943 	sc->vmm_kstat_vm = NULL;
2944 	return (-1);
2945 }
2946 
2947 static void
vmm_kstat_init(vmm_softc_t * sc)2948 vmm_kstat_init(vmm_softc_t *sc)
2949 {
2950 	kstat_t *ksp;
2951 
2952 	ASSERT3P(sc->vmm_vm, !=, NULL);
2953 	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2954 
2955 	ksp = sc->vmm_kstat_vm;
2956 	vmm_kstats_t *vk = ksp->ks_data;
2957 	ksp->ks_private = sc->vmm_vm;
2958 	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2959 	kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2960 
2961 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2962 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2963 
2964 		ksp = sc->vmm_kstat_vcpu[i];
2965 		vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2966 
2967 		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2968 		vvk->vvk_vcpu.value.ui32 = i;
2969 		kstat_named_init(&vvk->vvk_time_init, "time_init",
2970 		    KSTAT_DATA_UINT64);
2971 		kstat_named_init(&vvk->vvk_time_run, "time_run",
2972 		    KSTAT_DATA_UINT64);
2973 		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2974 		    KSTAT_DATA_UINT64);
2975 		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2976 		    KSTAT_DATA_UINT64);
2977 		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2978 		    KSTAT_DATA_UINT64);
2979 		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2980 		    KSTAT_DATA_UINT64);
2981 		ksp->ks_private = sc->vmm_vm;
2982 		ksp->ks_update = vmm_kstat_update_vcpu;
2983 	}
2984 
2985 	kstat_install(sc->vmm_kstat_vm);
2986 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2987 		kstat_install(sc->vmm_kstat_vcpu[i]);
2988 	}
2989 }
2990 
2991 static void
vmm_kstat_fini(vmm_softc_t * sc)2992 vmm_kstat_fini(vmm_softc_t *sc)
2993 {
2994 	ASSERT(sc->vmm_kstat_vm != NULL);
2995 
2996 	kstat_delete(sc->vmm_kstat_vm);
2997 	sc->vmm_kstat_vm = NULL;
2998 
2999 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
3000 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
3001 
3002 		kstat_delete(sc->vmm_kstat_vcpu[i]);
3003 		sc->vmm_kstat_vcpu[i] = NULL;
3004 	}
3005 }
3006 
3007 static int
vmm_open(dev_t * devp,int flag,int otyp,cred_t * credp)3008 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
3009 {
3010 	minor_t		minor;
3011 	vmm_softc_t	*sc;
3012 
3013 	/*
3014 	 * Forbid running bhyve in a 32-bit process until it has been tested and
3015 	 * verified to be safe.
3016 	 */
3017 	if (curproc->p_model != DATAMODEL_LP64) {
3018 		return (EFBIG);
3019 	}
3020 
3021 	minor = getminor(*devp);
3022 	if (minor == VMM_CTL_MINOR) {
3023 		/*
3024 		 * Master control device must be opened exclusively.
3025 		 */
3026 		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
3027 			return (EINVAL);
3028 		}
3029 
3030 		return (0);
3031 	}
3032 
3033 	mutex_enter(&vmm_mtx);
3034 	sc = ddi_get_soft_state(vmm_statep, minor);
3035 	if (sc == NULL) {
3036 		mutex_exit(&vmm_mtx);
3037 		return (ENXIO);
3038 	}
3039 
3040 	sc->vmm_flags |= VMM_IS_OPEN;
3041 	mutex_exit(&vmm_mtx);
3042 
3043 	return (0);
3044 }
3045 
3046 static int
vmm_close(dev_t dev,int flag,int otyp,cred_t * credp)3047 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
3048 {
3049 	const minor_t minor = getminor(dev);
3050 	vmm_softc_t *sc;
3051 	bool hma_release = false;
3052 
3053 	if (minor == VMM_CTL_MINOR) {
3054 		return (0);
3055 	}
3056 
3057 	mutex_enter(&vmm_mtx);
3058 	sc = ddi_get_soft_state(vmm_statep, minor);
3059 	if (sc == NULL) {
3060 		mutex_exit(&vmm_mtx);
3061 		return (ENXIO);
3062 	}
3063 
3064 	VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0);
3065 	sc->vmm_flags &= ~VMM_IS_OPEN;
3066 
3067 	/*
3068 	 * If instance was marked for auto-destruction begin that now.  Instance
3069 	 * destruction may have been initated already, so try to make progress
3070 	 * in that case, since closure of the device is one of its requirements.
3071 	 */
3072 	if ((sc->vmm_flags & VMM_DESTROY) != 0 ||
3073 	    (sc->vmm_flags & VMM_AUTODESTROY) != 0) {
3074 		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
3075 	}
3076 	mutex_exit(&vmm_mtx);
3077 
3078 	if (hma_release) {
3079 		vmm_hma_release();
3080 	}
3081 
3082 	return (0);
3083 }
3084 
3085 static int
vmm_is_supported(intptr_t arg)3086 vmm_is_supported(intptr_t arg)
3087 {
3088 	int r;
3089 	const char *msg;
3090 
3091 	if (vmm_is_intel()) {
3092 		r = vmx_x86_supported(&msg);
3093 	} else if (vmm_is_svm()) {
3094 		/*
3095 		 * HMA already ensured that the features necessary for SVM
3096 		 * operation were present and online during vmm_attach().
3097 		 */
3098 		r = 0;
3099 	} else {
3100 		r = ENXIO;
3101 		msg = "Unsupported CPU vendor";
3102 	}
3103 
3104 	if (r != 0 && arg != (intptr_t)NULL) {
3105 		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
3106 			return (EFAULT);
3107 	}
3108 	return (r);
3109 }
3110 
3111 static int
vmm_ctl_ioctl(int cmd,intptr_t arg,int md,cred_t * cr,int * rvalp)3112 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
3113 {
3114 	void *argp = (void *)arg;
3115 
3116 	switch (cmd) {
3117 	case VMM_CREATE_VM: {
3118 		struct vm_create_req req;
3119 
3120 		if ((md & FWRITE) == 0) {
3121 			return (EPERM);
3122 		}
3123 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
3124 			return (EFAULT);
3125 		}
3126 		return (vmmdev_do_vm_create(&req, cr));
3127 	}
3128 	case VMM_DESTROY_VM: {
3129 		struct vm_destroy_req req;
3130 
3131 		if ((md & FWRITE) == 0) {
3132 			return (EPERM);
3133 		}
3134 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
3135 			return (EFAULT);
3136 		}
3137 		return (vmmdev_do_vm_destroy(&req, cr));
3138 	}
3139 	case VMM_VM_SUPPORTED:
3140 		return (vmm_is_supported(arg));
3141 	case VMM_CHECK_IOMMU:
3142 		if (!vmm_check_iommu()) {
3143 			return (ENXIO);
3144 		}
3145 		return (0);
3146 	case VMM_RESV_QUERY:
3147 	case VMM_RESV_SET_TARGET:
3148 		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
3149 	default:
3150 		break;
3151 	}
3152 	/* No other actions are legal on ctl device */
3153 	return (ENOTTY);
3154 }
3155 
3156 static int
vmm_ioctl(dev_t dev,int cmd,intptr_t arg,int mode,cred_t * credp,int * rvalp)3157 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
3158     int *rvalp)
3159 {
3160 	vmm_softc_t	*sc;
3161 	minor_t		minor;
3162 
3163 	/*
3164 	 * Forbid running bhyve in a 32-bit process until it has been tested and
3165 	 * verified to be safe.
3166 	 */
3167 	if (curproc->p_model != DATAMODEL_LP64) {
3168 		return (EFBIG);
3169 	}
3170 
3171 	/* The structs in bhyve ioctls assume a 64-bit datamodel */
3172 	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
3173 		return (ENOTSUP);
3174 	}
3175 
3176 	/*
3177 	 * Regardless of minor (vmmctl or instance), we respond to queries of
3178 	 * the interface version.
3179 	 */
3180 	if (cmd == VMM_INTERFACE_VERSION) {
3181 		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
3182 		return (0);
3183 	}
3184 
3185 	minor = getminor(dev);
3186 
3187 	if (minor == VMM_CTL_MINOR) {
3188 		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
3189 	}
3190 
3191 	sc = ddi_get_soft_state(vmm_statep, minor);
3192 	ASSERT(sc != NULL);
3193 
3194 	/*
3195 	 * Turn away any ioctls against an instance when it is being destroyed.
3196 	 * (Except for the ioctl inquiring about that destroy-in-progress.)
3197 	 */
3198 	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
3199 		if (cmd == VM_DESTROY_PENDING) {
3200 			*rvalp = 1;
3201 			return (0);
3202 		}
3203 		return (ENXIO);
3204 	}
3205 
3206 	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
3207 }
3208 
3209 static int
vmm_segmap(dev_t dev,off_t off,struct as * as,caddr_t * addrp,off_t len,unsigned int prot,unsigned int maxprot,unsigned int flags,cred_t * credp)3210 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
3211     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
3212 {
3213 	vmm_softc_t *sc;
3214 	const minor_t minor = getminor(dev);
3215 	int err;
3216 
3217 	if (minor == VMM_CTL_MINOR) {
3218 		return (ENODEV);
3219 	}
3220 	if (off < 0 || (off + len) <= 0) {
3221 		return (EINVAL);
3222 	}
3223 	if ((prot & PROT_USER) == 0) {
3224 		return (EACCES);
3225 	}
3226 
3227 	sc = ddi_get_soft_state(vmm_statep, minor);
3228 	ASSERT(sc);
3229 
3230 	if (sc->vmm_flags & VMM_DESTROY)
3231 		return (ENXIO);
3232 
3233 	/* Grab read lock on the VM to prevent any changes to the memory map */
3234 	vmm_read_lock(sc);
3235 
3236 	if (off >= VM_DEVMEM_START) {
3237 		int segid;
3238 		off_t segoff;
3239 
3240 		/* Mapping a devmem "device" */
3241 		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
3242 			err = ENODEV;
3243 		} else {
3244 			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
3245 			    addrp, prot, maxprot, flags);
3246 		}
3247 	} else {
3248 		/* Mapping a part of the guest physical space */
3249 		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
3250 		    maxprot, flags);
3251 	}
3252 
3253 	vmm_read_unlock(sc);
3254 	return (err);
3255 }
3256 
3257 static sdev_plugin_validate_t
vmm_sdev_validate(sdev_ctx_t ctx)3258 vmm_sdev_validate(sdev_ctx_t ctx)
3259 {
3260 	const char *name = sdev_ctx_name(ctx);
3261 	vmm_softc_t *sc;
3262 	sdev_plugin_validate_t ret;
3263 	minor_t minor;
3264 
3265 	if (sdev_ctx_vtype(ctx) != VCHR)
3266 		return (SDEV_VTOR_INVALID);
3267 
3268 	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
3269 
3270 	mutex_enter(&vmm_mtx);
3271 	if ((sc = vmm_lookup(name)) == NULL)
3272 		ret = SDEV_VTOR_INVALID;
3273 	else if (sc->vmm_minor != minor)
3274 		ret = SDEV_VTOR_STALE;
3275 	else
3276 		ret = SDEV_VTOR_VALID;
3277 	mutex_exit(&vmm_mtx);
3278 
3279 	return (ret);
3280 }
3281 
3282 static int
vmm_sdev_filldir(sdev_ctx_t ctx)3283 vmm_sdev_filldir(sdev_ctx_t ctx)
3284 {
3285 	vmm_softc_t *sc;
3286 	int ret;
3287 
3288 	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
3289 		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
3290 		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
3291 		return (EINVAL);
3292 	}
3293 
3294 	mutex_enter(&vmm_mtx);
3295 	ASSERT(vmmdev_dip != NULL);
3296 	for (sc = list_head(&vmm_list); sc != NULL;
3297 	    sc = list_next(&vmm_list, sc)) {
3298 		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
3299 			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
3300 			    S_IFCHR | 0600,
3301 			    makedevice(ddi_driver_major(vmmdev_dip),
3302 			    sc->vmm_minor));
3303 		} else {
3304 			continue;
3305 		}
3306 		if (ret != 0 && ret != EEXIST)
3307 			goto out;
3308 	}
3309 
3310 	ret = 0;
3311 
3312 out:
3313 	mutex_exit(&vmm_mtx);
3314 	return (ret);
3315 }
3316 
3317 /* ARGSUSED */
3318 static void
vmm_sdev_inactive(sdev_ctx_t ctx)3319 vmm_sdev_inactive(sdev_ctx_t ctx)
3320 {
3321 }
3322 
3323 static sdev_plugin_ops_t vmm_sdev_ops = {
3324 	.spo_version = SDEV_PLUGIN_VERSION,
3325 	.spo_flags = SDEV_PLUGIN_SUBDIR,
3326 	.spo_validate = vmm_sdev_validate,
3327 	.spo_filldir = vmm_sdev_filldir,
3328 	.spo_inactive = vmm_sdev_inactive
3329 };
3330 
3331 /* ARGSUSED */
3332 static int
vmm_info(dev_info_t * dip,ddi_info_cmd_t cmd,void * arg,void ** result)3333 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
3334 {
3335 	int error;
3336 
3337 	switch (cmd) {
3338 	case DDI_INFO_DEVT2DEVINFO:
3339 		*result = (void *)vmmdev_dip;
3340 		error = DDI_SUCCESS;
3341 		break;
3342 	case DDI_INFO_DEVT2INSTANCE:
3343 		*result = (void *)0;
3344 		error = DDI_SUCCESS;
3345 		break;
3346 	default:
3347 		error = DDI_FAILURE;
3348 		break;
3349 	}
3350 	return (error);
3351 }
3352 
3353 static int
vmm_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)3354 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3355 {
3356 	sdev_plugin_hdl_t sph;
3357 	hma_reg_t *reg = NULL;
3358 	boolean_t vmm_loaded = B_FALSE;
3359 
3360 	if (cmd != DDI_ATTACH) {
3361 		return (DDI_FAILURE);
3362 	}
3363 
3364 	mutex_enter(&vmmdev_mtx);
3365 	/* Ensure we are not already attached. */
3366 	if (vmmdev_dip != NULL) {
3367 		mutex_exit(&vmmdev_mtx);
3368 		return (DDI_FAILURE);
3369 	}
3370 
3371 	vmm_sol_glue_init();
3372 
3373 	/*
3374 	 * Perform temporary HMA registration to determine if the system
3375 	 * is capable.
3376 	 */
3377 	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
3378 		goto fail;
3379 	} else if (vmm_mod_load() != 0) {
3380 		goto fail;
3381 	}
3382 	vmm_loaded = B_TRUE;
3383 	hma_unregister(reg);
3384 	reg = NULL;
3385 
3386 	/* Create control node.  Other nodes will be created on demand. */
3387 	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
3388 	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
3389 		goto fail;
3390 	}
3391 
3392 	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
3393 	if (sph == (sdev_plugin_hdl_t)NULL) {
3394 		ddi_remove_minor_node(dip, NULL);
3395 		goto fail;
3396 	}
3397 
3398 	ddi_report_dev(dip);
3399 	vmmdev_sdev_hdl = sph;
3400 	vmmdev_dip = dip;
3401 	mutex_exit(&vmmdev_mtx);
3402 	return (DDI_SUCCESS);
3403 
3404 fail:
3405 	if (vmm_loaded) {
3406 		vmm_mod_unload();
3407 	}
3408 	if (reg != NULL) {
3409 		hma_unregister(reg);
3410 	}
3411 	vmm_sol_glue_cleanup();
3412 	mutex_exit(&vmmdev_mtx);
3413 	return (DDI_FAILURE);
3414 }
3415 
3416 static int
vmm_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)3417 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3418 {
3419 	if (cmd != DDI_DETACH) {
3420 		return (DDI_FAILURE);
3421 	}
3422 
3423 	/*
3424 	 * Ensure that all resources have been cleaned up.
3425 	 *
3426 	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
3427 	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
3428 	 * devinfo locked as iommu_cleanup() tries to recursively lock each
3429 	 * devinfo, including our own, while holding vmmdev_mtx.
3430 	 */
3431 	if (mutex_tryenter(&vmmdev_mtx) == 0)
3432 		return (DDI_FAILURE);
3433 
3434 	mutex_enter(&vmm_mtx);
3435 	if (!list_is_empty(&vmm_list)) {
3436 		mutex_exit(&vmm_mtx);
3437 		mutex_exit(&vmmdev_mtx);
3438 		return (DDI_FAILURE);
3439 	}
3440 	mutex_exit(&vmm_mtx);
3441 
3442 	if (!vmmr_is_empty()) {
3443 		mutex_exit(&vmmdev_mtx);
3444 		return (DDI_FAILURE);
3445 	}
3446 
3447 	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
3448 	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
3449 		mutex_exit(&vmmdev_mtx);
3450 		return (DDI_FAILURE);
3451 	}
3452 	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
3453 
3454 	/* Remove the control node. */
3455 	ddi_remove_minor_node(dip, "ctl");
3456 	vmmdev_dip = NULL;
3457 
3458 	vmm_mod_unload();
3459 	VERIFY3U(vmmdev_hma_reg, ==, NULL);
3460 	vmm_sol_glue_cleanup();
3461 
3462 	mutex_exit(&vmmdev_mtx);
3463 
3464 	return (DDI_SUCCESS);
3465 }
3466 
3467 static struct cb_ops vmm_cb_ops = {
3468 	vmm_open,
3469 	vmm_close,
3470 	nodev,		/* strategy */
3471 	nodev,		/* print */
3472 	nodev,		/* dump */
3473 	nodev,		/* read */
3474 	nodev,		/* write */
3475 	vmm_ioctl,
3476 	nodev,		/* devmap */
3477 	nodev,		/* mmap */
3478 	vmm_segmap,
3479 	nochpoll,	/* poll */
3480 	ddi_prop_op,
3481 	NULL,
3482 	D_NEW | D_MP | D_DEVMAP
3483 };
3484 
3485 static struct dev_ops vmm_ops = {
3486 	DEVO_REV,
3487 	0,
3488 	vmm_info,
3489 	nulldev,	/* identify */
3490 	nulldev,	/* probe */
3491 	vmm_attach,
3492 	vmm_detach,
3493 	nodev,		/* reset */
3494 	&vmm_cb_ops,
3495 	(struct bus_ops *)NULL
3496 };
3497 
3498 static struct modldrv modldrv = {
3499 	&mod_driverops,
3500 	"bhyve vmm",
3501 	&vmm_ops
3502 };
3503 
3504 static struct modlinkage modlinkage = {
3505 	MODREV_1,
3506 	&modldrv,
3507 	NULL
3508 };
3509 
3510 int
_init(void)3511 _init(void)
3512 {
3513 	int	error;
3514 
3515 	sysinit();
3516 
3517 	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
3518 	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
3519 	list_create(&vmm_list, sizeof (vmm_softc_t),
3520 	    offsetof(vmm_softc_t, vmm_node));
3521 	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
3522 
3523 	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
3524 	if (error) {
3525 		return (error);
3526 	}
3527 
3528 	error = vmmr_init();
3529 	if (error) {
3530 		ddi_soft_state_fini(&vmm_statep);
3531 		return (error);
3532 	}
3533 
3534 	vmm_zsd_init();
3535 
3536 	error = mod_install(&modlinkage);
3537 	if (error) {
3538 		ddi_soft_state_fini(&vmm_statep);
3539 		vmm_zsd_fini();
3540 		vmmr_fini();
3541 	}
3542 
3543 	return (error);
3544 }
3545 
3546 int
_fini(void)3547 _fini(void)
3548 {
3549 	int	error;
3550 
3551 	error = mod_remove(&modlinkage);
3552 	if (error) {
3553 		return (error);
3554 	}
3555 
3556 	vmm_zsd_fini();
3557 	vmmr_fini();
3558 
3559 	ddi_soft_state_fini(&vmm_statep);
3560 
3561 	return (0);
3562 }
3563 
3564 int
_info(struct modinfo * modinfop)3565 _info(struct modinfo *modinfop)
3566 {
3567 	return (mod_info(&modlinkage, modinfop));
3568 }
3569