1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12
13 /*
14 * Copyright 2015 Pluribus Networks Inc.
15 * Copyright 2019 Joyent, Inc.
16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17 * Copyright 2025 Oxide Computer Company
18 */
19
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49
50 #include <vm/seg_dev.h>
51
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61
62 /*
63 * Locking details:
64 *
65 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
66 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
67 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
68 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
69 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
70 */
71
72 static kmutex_t vmmdev_mtx;
73 static dev_info_t *vmmdev_dip;
74 static hma_reg_t *vmmdev_hma_reg;
75 static uint_t vmmdev_hma_ref;
76 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
77
78 static kmutex_t vmm_mtx;
79 static list_t vmm_list;
80 static id_space_t *vmm_minors;
81 static void *vmm_statep;
82
83 /*
84 * Until device emulation in bhyve had been adequately scrutinized and tested,
85 * there was (justified) concern that unusual or corrupt device state payloads
86 * could crash the host when loaded via the vmm-data interface.
87 *
88 * Now that those concerns have been mitigated, this protection is loosened to
89 * default-allow, but the switch is left in place, in case there is a need to
90 * once again clamp down on vmm-data writes.
91 */
int vmm_allow_state_writes = 1;

/*
 * NOTE(review): presumably the name this driver presents when registering as
 * a hypervisor -- the consumer is outside this excerpt, confirm at use site.
 */
static const char *vmmdev_hvm_name = "bhyve";

/* Root path for the sdev plugin (/dev) */
#define	VMM_SDEV_ROOT	"/dev/vmm"

/* Provided by uts/intel/io/vmm/intel/vmx.c */
extern int vmx_x86_supported(const char **);
101
102 /* Holds and hooks from drivers external to vmm */
103 struct vmm_hold {
104 list_node_t vmh_node;
105 vmm_softc_t *vmh_sc;
106 boolean_t vmh_release_req;
107 uint_t vmh_ioport_hook_cnt;
108 uint_t vmh_mmio_hook_cnt;
109 };
110
111 struct vmm_lease {
112 list_node_t vml_node;
113 struct vm *vml_vm;
114 vm_client_t *vml_vmclient;
115 boolean_t vml_expired;
116 boolean_t vml_break_deferred;
117 boolean_t (*vml_expire_func)(void *);
118 void *vml_expire_arg;
119 struct vmm_hold *vml_hold;
120 };
121
/* Option flags accepted by vmm_destroy_locked() */
typedef enum vmm_destroy_opts {
	/* No optional behavior requested */
	VDO_DEFAULT		= 0,

	/*
	 * Skip clean-up of the zone-specific-data associated with this VM
	 * during the destroy.  This is required when the VM is being
	 * destroyed as part of zone destruction, since that ZSD is already
	 * in the middle of being cleaned up.
	 */
	VDO_NO_CLEAN_ZSD	= (1 << 0),

	/*
	 * Try to wait until destruction of the VM has completed.  Waiting is
	 * opt-in, as many normal conditions can leave destruction stalled
	 * while other clean-up is pending.
	 */
	VDO_ATTEMPT_WAIT	= (1 << 1),
} vmm_destroy_opts_t;
139
140 static void vmm_hma_release(void);
141 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *);
142 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
143 static void vmm_lease_block(vmm_softc_t *);
144 static void vmm_lease_unblock(vmm_softc_t *);
145 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
146 static void vmm_kstat_init(vmm_softc_t *);
147 static void vmm_kstat_fini(vmm_softc_t *);
148
149 /*
150 * The 'devmem' hack:
151 *
152 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
153 * in the vm which appear with their own name related to the vm under /dev.
154 * Since this would be a hassle from an sdev perspective and would require a
155 * new cdev interface (or complicate the existing one), we choose to implement
156 * this in a different manner. Direct access to the underlying vm memory
157 * segments is exposed by placing them in a range of offsets beyond the normal
158 * guest memory space. Userspace can query the appropriate offset to mmap()
159 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
160 */
161
162 static vmm_devmem_entry_t *
vmmdev_devmem_find(vmm_softc_t * sc,int segid)163 vmmdev_devmem_find(vmm_softc_t *sc, int segid)
164 {
165 vmm_devmem_entry_t *ent = NULL;
166 list_t *dl = &sc->vmm_devmem_list;
167
168 for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
169 if (ent->vde_segid == segid) {
170 return (ent);
171 }
172 }
173 return (NULL);
174 }
175
176 static int
vmmdev_get_memseg(vmm_softc_t * sc,struct vm_memseg * mseg)177 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
178 {
179 int error;
180 bool sysmem;
181
182 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
183 NULL);
184 if (error || mseg->len == 0)
185 return (error);
186
187 if (!sysmem) {
188 vmm_devmem_entry_t *de;
189
190 de = vmmdev_devmem_find(sc, mseg->segid);
191 if (de != NULL) {
192 (void) strlcpy(mseg->name, de->vde_name,
193 sizeof (mseg->name));
194 }
195 } else {
196 bzero(mseg->name, sizeof (mseg->name));
197 }
198
199 return (error);
200 }
201
202 static int
vmmdev_devmem_create(vmm_softc_t * sc,struct vm_memseg * mseg,const char * name)203 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
204 {
205 off_t map_offset;
206 vmm_devmem_entry_t *entry;
207
208 if (list_is_empty(&sc->vmm_devmem_list)) {
209 map_offset = VM_DEVMEM_START;
210 } else {
211 entry = list_tail(&sc->vmm_devmem_list);
212 if (sum_overflows_off(entry->vde_off, (off_t)entry->vde_len)) {
213 /* Do not tolerate overflow */
214 return (ERANGE);
215 }
216 map_offset = entry->vde_off + (off_t)entry->vde_len;
217 /*
218 * XXXJOY: We could choose to search the list for duplicate
219 * names and toss an error. Since we're using the offset
220 * method for now, it does not make much of a difference.
221 */
222 }
223
224 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
225 entry->vde_segid = mseg->segid;
226 entry->vde_len = mseg->len;
227 entry->vde_off = map_offset;
228 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
229 list_insert_tail(&sc->vmm_devmem_list, entry);
230
231 return (0);
232 }
233
234 static boolean_t
vmmdev_devmem_segid(vmm_softc_t * sc,off_t off,off_t len,int * segidp,off_t * map_offp)235 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
236 off_t *map_offp)
237 {
238 list_t *dl = &sc->vmm_devmem_list;
239 vmm_devmem_entry_t *de = NULL;
240
241 VERIFY(off >= VM_DEVMEM_START);
242
243 if (sum_overflows_off(off, len)) {
244 /* No match on overflow */
245 return (B_FALSE);
246 }
247 const off_t map_end = off + len;
248
249 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
250 const off_t item_end = de->vde_off + de->vde_len;
251
252 if (de->vde_off <= off && item_end >= map_end) {
253 *segidp = de->vde_segid;
254 *map_offp = off - de->vde_off;
255 return (B_TRUE);
256 }
257 }
258 return (B_FALSE);
259 }
260
261 /*
262 * When an instance is being destroyed, the devmem list of named memory objects
263 * can be torn down, as no new mappings are allowed.
264 */
265 static void
vmmdev_devmem_purge(vmm_softc_t * sc)266 vmmdev_devmem_purge(vmm_softc_t *sc)
267 {
268 vmm_devmem_entry_t *entry;
269
270 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
271 kmem_free(entry, sizeof (*entry));
272 }
273 }
274
275 static int
vmmdev_alloc_memseg(vmm_softc_t * sc,struct vm_memseg * mseg)276 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
277 {
278 int error;
279 bool sysmem = true;
280
281 if (VM_MEMSEG_NAME(mseg)) {
282 sysmem = false;
283 }
284 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
285
286 if (error == 0) {
287 /*
288 * Rather than create a whole fresh device from which userspace
289 * can mmap this segment, instead make it available at an
290 * offset above where the main guest memory resides.
291 */
292 error = vmmdev_devmem_create(sc, mseg, mseg->name);
293 if (error != 0) {
294 vm_free_memseg(sc->vmm_vm, mseg->segid);
295 }
296 }
297 return (error);
298 }
299
300 /*
301 * Resource Locking and Exclusion
302 *
303 * Much of bhyve depends on key portions of VM state, such as the guest memory
304 * map, to remain unchanged while the guest is running. As ported from
305 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
306 * access to the instance vCPUs. Threads acting on a single vCPU, like those
307 * performing the work of actually running the guest in VMX/SVM, would lock
308 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide
309 * state, all of the vCPUs would be first locked, ensuring that the
310 * operation(s) could complete without any other threads stumbling into
311 * intermediate states.
312 *
313 * This approach is largely effective for bhyve. Common operations, such as
314 * running the vCPUs, steer clear of lock contention. The model begins to
315 * break down for operations which do not occur in the context of a specific
316 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker
317 * thread in the bhyve process. In order to properly protect those vCPU-less
318 * operations from encountering invalid states, additional locking is required.
319 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
320 * It does mean that class of operations will be serialized on locking the
321 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
322 * undue contention on the VM_MAXCPU-1 vCPU.
323 *
324 * In order to address the shortcomings of this model, the concept of a
325 * read/write lock has been added to bhyve. Operations which change
326 * fundamental aspects of a VM (such as the memory map) must acquire the write
327 * lock, which also implies locking all of the vCPUs and waiting for all read
328 * lock holders to release. While it increases the cost and waiting time for
329 * those few operations, it allows most hot-path operations on the VM (which
330 * depend on its configuration remaining stable) to occur with minimal locking.
331 *
332 * Consumers of the Driver API (see below) are a special case when it comes to
333 * this locking, since they may hold a read lock via the drv_lease mechanism
334 * for an extended period of time. Rather than forcing those consumers to
335 * continuously poll for a write lock attempt, the lease system forces them to
336 * provide a release callback to trigger their clean-up (and potential later
337 * reacquisition) of the read lock.
338 */
339
340 static void
vcpu_lock_one(vmm_softc_t * sc,int vcpu)341 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
342 {
343 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
344
345 /*
346 * Since this state transition is utilizing from_idle=true, it should
347 * not fail, but rather block until it can be successful.
348 */
349 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
350 }
351
352 static void
vcpu_unlock_one(vmm_softc_t * sc,int vcpu)353 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
354 {
355 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
356
357 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
358 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
359 }
360
361 static void
vmm_read_lock(vmm_softc_t * sc)362 vmm_read_lock(vmm_softc_t *sc)
363 {
364 rw_enter(&sc->vmm_rwlock, RW_READER);
365 }
366
367 static void
vmm_read_unlock(vmm_softc_t * sc)368 vmm_read_unlock(vmm_softc_t *sc)
369 {
370 rw_exit(&sc->vmm_rwlock);
371 }
372
373 static void
vmm_write_lock(vmm_softc_t * sc)374 vmm_write_lock(vmm_softc_t *sc)
375 {
376 int maxcpus;
377
378 /* First lock all the vCPUs */
379 maxcpus = vm_get_maxcpus(sc->vmm_vm);
380 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
381 vcpu_lock_one(sc, vcpu);
382 }
383
384 /*
385 * Block vmm_drv leases from being acquired or held while the VM write
386 * lock is held.
387 */
388 vmm_lease_block(sc);
389
390 rw_enter(&sc->vmm_rwlock, RW_WRITER);
391 /*
392 * For now, the 'maxcpus' value for an instance is fixed at the
393 * compile-time constant of VM_MAXCPU at creation. If this changes in
394 * the future, allowing for dynamic vCPU resource sizing, acquisition
395 * of the write lock will need to be wary of such changes.
396 */
397 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
398 }
399
400 static void
vmm_write_unlock(vmm_softc_t * sc)401 vmm_write_unlock(vmm_softc_t *sc)
402 {
403 int maxcpus;
404
405 /* Allow vmm_drv leases to be acquired once write lock is dropped */
406 vmm_lease_unblock(sc);
407
408 /*
409 * The VM write lock _must_ be released from the same thread it was
410 * acquired in, unlike the read lock.
411 */
412 VERIFY(rw_write_held(&sc->vmm_rwlock));
413 rw_exit(&sc->vmm_rwlock);
414
415 /* Unlock all the vCPUs */
416 maxcpus = vm_get_maxcpus(sc->vmm_vm);
417 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
418 vcpu_unlock_one(sc, vcpu);
419 }
420 }
421
422 static int
vmmdev_do_ioctl(vmm_softc_t * sc,int cmd,intptr_t arg,int md,cred_t * credp,int * rvalp)423 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
424 cred_t *credp, int *rvalp)
425 {
426 int error = 0, vcpu = -1;
427 void *datap = (void *)arg;
428 enum vm_lock_type {
429 LOCK_NONE = 0,
430 LOCK_VCPU,
431 LOCK_READ_HOLD,
432 LOCK_WRITE_HOLD
433 } lock_type = LOCK_NONE;
434
435 /* Acquire any exclusion resources needed for the operation. */
436 switch (cmd) {
437 case VM_RUN:
438 case VM_GET_REGISTER:
439 case VM_SET_REGISTER:
440 case VM_GET_SEGMENT_DESCRIPTOR:
441 case VM_SET_SEGMENT_DESCRIPTOR:
442 case VM_GET_REGISTER_SET:
443 case VM_SET_REGISTER_SET:
444 case VM_INJECT_EXCEPTION:
445 case VM_GET_CAPABILITY:
446 case VM_SET_CAPABILITY:
447 case VM_PPTDEV_MSI:
448 case VM_PPTDEV_MSIX:
449 case VM_SET_X2APIC_STATE:
450 case VM_GLA2GPA:
451 case VM_GLA2GPA_NOFAULT:
452 case VM_ACTIVATE_CPU:
453 case VM_SET_INTINFO:
454 case VM_GET_INTINFO:
455 case VM_RESTART_INSTRUCTION:
456 case VM_SET_KERNEMU_DEV:
457 case VM_GET_KERNEMU_DEV:
458 case VM_RESET_CPU:
459 case VM_GET_RUN_STATE:
460 case VM_SET_RUN_STATE:
461 case VM_GET_FPU:
462 case VM_SET_FPU:
463 case VM_GET_CPUID:
464 case VM_SET_CPUID:
465 case VM_LEGACY_CPUID:
466 /*
467 * Copy in the ID of the vCPU chosen for this operation.
468 * Since a nefarious caller could update their struct between
469 * this locking and when the rest of the ioctl data is copied
470 * in, it is _critical_ that this local 'vcpu' variable be used
471 * rather than the in-struct one when performing the ioctl.
472 */
473 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
474 return (EFAULT);
475 }
476 if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
477 return (EINVAL);
478 }
479 vcpu_lock_one(sc, vcpu);
480 lock_type = LOCK_VCPU;
481 break;
482
483 case VM_REINIT:
484 case VM_BIND_PPTDEV:
485 case VM_UNBIND_PPTDEV:
486 case VM_MAP_PPTDEV_MMIO:
487 case VM_UNMAP_PPTDEV_MMIO:
488 case VM_ALLOC_MEMSEG:
489 case VM_MMAP_MEMSEG:
490 case VM_MUNMAP_MEMSEG:
491 case VM_WRLOCK_CYCLE:
492 case VM_PMTMR_LOCATE:
493 case VM_PAUSE:
494 case VM_RESUME:
495 vmm_write_lock(sc);
496 lock_type = LOCK_WRITE_HOLD;
497 break;
498
499 case VM_GET_MEMSEG:
500 case VM_MMAP_GETNEXT:
501 case VM_LAPIC_IRQ:
502 case VM_INJECT_NMI:
503 case VM_IOAPIC_ASSERT_IRQ:
504 case VM_IOAPIC_DEASSERT_IRQ:
505 case VM_IOAPIC_PULSE_IRQ:
506 case VM_LAPIC_MSI:
507 case VM_LAPIC_LOCAL_IRQ:
508 case VM_GET_X2APIC_STATE:
509 case VM_RTC_READ:
510 case VM_RTC_WRITE:
511 case VM_RTC_SETTIME:
512 case VM_RTC_GETTIME:
513 case VM_PPTDEV_DISABLE_MSIX:
514 case VM_DEVMEM_GETOFFSET:
515 case VM_TRACK_DIRTY_PAGES:
516 case VM_NPT_OPERATION:
517 vmm_read_lock(sc);
518 lock_type = LOCK_READ_HOLD;
519 break;
520
521 case VM_DATA_READ:
522 case VM_DATA_WRITE:
523 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
524 return (EFAULT);
525 }
526 if (vcpu == -1) {
527 /* Access data for VM-wide devices */
528 vmm_write_lock(sc);
529 lock_type = LOCK_WRITE_HOLD;
530 } else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) {
531 /* Access data associated with a specific vCPU */
532 vcpu_lock_one(sc, vcpu);
533 lock_type = LOCK_VCPU;
534 } else {
535 return (EINVAL);
536 }
537 break;
538
539 case VM_GET_GPA_PMAP:
540 case VM_IOAPIC_PINCOUNT:
541 case VM_SUSPEND:
542 case VM_DESC_FPU_AREA:
543 case VM_SET_AUTODESTRUCT:
544 case VM_DESTROY_SELF:
545 case VM_DESTROY_PENDING:
546 case VM_VCPU_BARRIER:
547 default:
548 break;
549 }
550
551 /* Execute the primary logic for the ioctl. */
552 switch (cmd) {
553 case VM_RUN: {
554 struct vm_entry entry;
555
556 if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
557 error = EFAULT;
558 break;
559 }
560
561 if (!(curthread->t_schedflag & TS_VCPU))
562 smt_mark_as_vcpu();
563
564 error = vm_run(sc->vmm_vm, vcpu, &entry);
565
566 /*
567 * Unexpected states in vm_run() are expressed through positive
568 * errno-oriented return values. VM states which expect further
569 * processing in userspace (necessary context via exitinfo) are
570 * expressed through negative return values. For the time being
571 * a return value of 0 is not expected from vm_run().
572 */
573 ASSERT(error != 0);
574 if (error < 0) {
575 const struct vm_exit *vme;
576 void *outp = entry.exit_data;
577
578 error = 0;
579 vme = vm_exitinfo(sc->vmm_vm, vcpu);
580 if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
581 error = EFAULT;
582 }
583 }
584 break;
585 }
586 case VM_SUSPEND: {
587 struct vm_suspend vmsuspend;
588
589 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
590 error = EFAULT;
591 break;
592 }
593 error = vm_suspend(sc->vmm_vm, vmsuspend.how, vmsuspend.source);
594 break;
595 }
596 case VM_REINIT: {
597 struct vm_reinit reinit;
598
599 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
600 error = EFAULT;
601 break;
602 }
603 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
604 /*
605 * The VM instance should be free of driver-attached
606 * hooks during the reinitialization process.
607 */
608 break;
609 }
610 error = vm_reinit(sc->vmm_vm, reinit.flags);
611 (void) vmm_drv_block_hook(sc, B_FALSE);
612 break;
613 }
614 case VM_STAT_DESC: {
615 struct vm_stat_desc statdesc;
616
617 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
618 error = EFAULT;
619 break;
620 }
621 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
622 sizeof (statdesc.desc));
623 if (error == 0 &&
624 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
625 error = EFAULT;
626 break;
627 }
628 break;
629 }
630 case VM_STATS_IOC: {
631 struct vm_stats vmstats;
632
633 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
634 error = EFAULT;
635 break;
636 }
637 hrt2tv(gethrtime(), &vmstats.tv);
638 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index,
639 nitems(vmstats.statbuf),
640 &vmstats.num_entries, vmstats.statbuf);
641 if (error == 0 &&
642 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
643 error = EFAULT;
644 break;
645 }
646 break;
647 }
648
649 case VM_PPTDEV_MSI: {
650 struct vm_pptdev_msi pptmsi;
651
652 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
653 error = EFAULT;
654 break;
655 }
656 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
657 pptmsi.addr, pptmsi.msg, pptmsi.numvec);
658 break;
659 }
660 case VM_PPTDEV_MSIX: {
661 struct vm_pptdev_msix pptmsix;
662
663 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
664 error = EFAULT;
665 break;
666 }
667 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
668 pptmsix.idx, pptmsix.addr, pptmsix.msg,
669 pptmsix.vector_control);
670 break;
671 }
672 case VM_PPTDEV_DISABLE_MSIX: {
673 struct vm_pptdev pptdev;
674
675 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
676 error = EFAULT;
677 break;
678 }
679 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
680 break;
681 }
682 case VM_MAP_PPTDEV_MMIO: {
683 struct vm_pptdev_mmio pptmmio;
684
685 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
686 error = EFAULT;
687 break;
688 }
689 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
690 pptmmio.len, pptmmio.hpa);
691 break;
692 }
693 case VM_UNMAP_PPTDEV_MMIO: {
694 struct vm_pptdev_mmio pptmmio;
695
696 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
697 error = EFAULT;
698 break;
699 }
700 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
701 pptmmio.len);
702 break;
703 }
704 case VM_BIND_PPTDEV: {
705 struct vm_pptdev pptdev;
706
707 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
708 error = EFAULT;
709 break;
710 }
711 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
712 break;
713 }
714 case VM_UNBIND_PPTDEV: {
715 struct vm_pptdev pptdev;
716
717 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
718 error = EFAULT;
719 break;
720 }
721 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
722 break;
723 }
724 case VM_GET_PPTDEV_LIMITS: {
725 struct vm_pptdev_limits pptlimits;
726
727 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
728 error = EFAULT;
729 break;
730 }
731 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
732 &pptlimits.msi_limit, &pptlimits.msix_limit);
733 if (error == 0 &&
734 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
735 error = EFAULT;
736 break;
737 }
738 break;
739 }
740 case VM_INJECT_EXCEPTION: {
741 struct vm_exception vmexc;
742 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
743 error = EFAULT;
744 break;
745 }
746 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
747 vmexc.error_code_valid != 0, vmexc.error_code,
748 vmexc.restart_instruction != 0);
749 break;
750 }
751 case VM_INJECT_NMI: {
752 struct vm_nmi vmnmi;
753
754 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
755 error = EFAULT;
756 break;
757 }
758 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
759 break;
760 }
761 case VM_LAPIC_IRQ: {
762 struct vm_lapic_irq vmirq;
763
764 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
765 error = EFAULT;
766 break;
767 }
768 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
769 break;
770 }
771 case VM_LAPIC_LOCAL_IRQ: {
772 struct vm_lapic_irq vmirq;
773
774 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
775 error = EFAULT;
776 break;
777 }
778 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
779 vmirq.vector);
780 break;
781 }
782 case VM_LAPIC_MSI: {
783 struct vm_lapic_msi vmmsi;
784
785 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
786 error = EFAULT;
787 break;
788 }
789 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
790 break;
791 }
792
793 case VM_IOAPIC_ASSERT_IRQ: {
794 struct vm_ioapic_irq ioapic_irq;
795
796 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
797 error = EFAULT;
798 break;
799 }
800 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
801 break;
802 }
803 case VM_IOAPIC_DEASSERT_IRQ: {
804 struct vm_ioapic_irq ioapic_irq;
805
806 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
807 error = EFAULT;
808 break;
809 }
810 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
811 break;
812 }
813 case VM_IOAPIC_PULSE_IRQ: {
814 struct vm_ioapic_irq ioapic_irq;
815
816 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
817 error = EFAULT;
818 break;
819 }
820 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
821 break;
822 }
823 case VM_IOAPIC_PINCOUNT: {
824 int pincount;
825
826 pincount = vioapic_pincount(sc->vmm_vm);
827 if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
828 error = EFAULT;
829 break;
830 }
831 break;
832 }
833 case VM_DESC_FPU_AREA: {
834 struct vm_fpu_desc desc;
835 void *buf = NULL;
836
837 if (ddi_copyin(datap, &desc, sizeof (desc), md)) {
838 error = EFAULT;
839 break;
840 }
841 if (desc.vfd_num_entries > 64) {
842 error = EINVAL;
843 break;
844 }
845 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) *
846 desc.vfd_num_entries;
847 if (buf_sz != 0) {
848 buf = kmem_zalloc(buf_sz, KM_SLEEP);
849 }
850
851 /*
852 * For now, we are depending on vm_fpu_desc_entry and
853 * hma_xsave_state_desc_t having the same format.
854 */
855 CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
856 sizeof (hma_xsave_state_desc_t));
857
858 size_t req_size;
859 const uint_t max_entries = hma_fpu_describe_xsave_state(
860 (hma_xsave_state_desc_t *)buf,
861 desc.vfd_num_entries,
862 &req_size);
863
864 desc.vfd_req_size = req_size;
865 desc.vfd_num_entries = max_entries;
866 if (buf_sz != 0) {
867 if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
868 error = EFAULT;
869 }
870 kmem_free(buf, buf_sz);
871 }
872
873 if (error == 0) {
874 if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
875 error = EFAULT;
876 }
877 }
878 break;
879 }
880 case VM_SET_AUTODESTRUCT: {
881 /*
882 * Since this has to do with controlling the lifetime of the
883 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather
884 * than the vcpu-centric or rwlock exclusion mechanisms.
885 */
886 mutex_enter(&vmm_mtx);
887 if (arg != 0) {
888 sc->vmm_flags |= VMM_AUTODESTROY;
889 } else {
890 sc->vmm_flags &= ~VMM_AUTODESTROY;
891 }
892 mutex_exit(&vmm_mtx);
893 break;
894 }
895 case VM_DESTROY_SELF: {
896 bool hma_release = false;
897
898 /*
899 * Just like VMM_DESTROY_VM, but on the instance file descriptor
900 * itself, rather than having to perform a racy name lookup as
901 * part of the destroy process.
902 *
903 * Since vmm_destroy_locked() performs vCPU lock acquisition in
904 * order to kick the vCPUs out of guest context as part of any
905 * destruction, we do not need to worry about it ourself using
906 * the `lock_type` logic here.
907 */
908 mutex_enter(&vmm_mtx);
909 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
910 mutex_exit(&vmm_mtx);
911 if (hma_release) {
912 vmm_hma_release();
913 }
914 break;
915 }
916 case VM_DESTROY_PENDING: {
917 /*
918 * If we have made it this far, then destruction of the instance
919 * has not been initiated.
920 */
921 *rvalp = 0;
922 break;
923 }
924
925 case VM_ISA_ASSERT_IRQ: {
926 struct vm_isa_irq isa_irq;
927
928 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
929 error = EFAULT;
930 break;
931 }
932 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
933 if (error == 0 && isa_irq.ioapic_irq != -1) {
934 error = vioapic_assert_irq(sc->vmm_vm,
935 isa_irq.ioapic_irq);
936 }
937 break;
938 }
939 case VM_ISA_DEASSERT_IRQ: {
940 struct vm_isa_irq isa_irq;
941
942 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
943 error = EFAULT;
944 break;
945 }
946 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
947 if (error == 0 && isa_irq.ioapic_irq != -1) {
948 error = vioapic_deassert_irq(sc->vmm_vm,
949 isa_irq.ioapic_irq);
950 }
951 break;
952 }
953 case VM_ISA_PULSE_IRQ: {
954 struct vm_isa_irq isa_irq;
955
956 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
957 error = EFAULT;
958 break;
959 }
960 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
961 if (error == 0 && isa_irq.ioapic_irq != -1) {
962 error = vioapic_pulse_irq(sc->vmm_vm,
963 isa_irq.ioapic_irq);
964 }
965 break;
966 }
967 case VM_ISA_SET_IRQ_TRIGGER: {
968 struct vm_isa_irq_trigger isa_irq_trigger;
969
970 if (ddi_copyin(datap, &isa_irq_trigger,
971 sizeof (isa_irq_trigger), md)) {
972 error = EFAULT;
973 break;
974 }
975 error = vatpic_set_irq_trigger(sc->vmm_vm,
976 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
977 break;
978 }
979
980 case VM_MMAP_GETNEXT: {
981 struct vm_memmap mm;
982
983 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
984 error = EFAULT;
985 break;
986 }
987 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
988 (uintptr_t *)&mm.segoff, &mm.len, &mm.prot, &mm.flags);
989 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
990 error = EFAULT;
991 break;
992 }
993 break;
994 }
995 case VM_MMAP_MEMSEG: {
996 struct vm_memmap mm;
997
998 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
999 error = EFAULT;
1000 break;
1001 }
1002 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid,
1003 (uintptr_t)mm.segoff, mm.len, mm.prot, mm.flags);
1004 break;
1005 }
1006 case VM_MUNMAP_MEMSEG: {
1007 struct vm_munmap mu;
1008
1009 if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
1010 error = EFAULT;
1011 break;
1012 }
1013 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
1014 break;
1015 }
1016 case VM_ALLOC_MEMSEG: {
1017 struct vm_memseg vmseg;
1018
1019 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1020 error = EFAULT;
1021 break;
1022 }
1023 error = vmmdev_alloc_memseg(sc, &vmseg);
1024 break;
1025 }
1026 case VM_GET_MEMSEG: {
1027 struct vm_memseg vmseg;
1028
1029 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1030 error = EFAULT;
1031 break;
1032 }
1033 error = vmmdev_get_memseg(sc, &vmseg);
1034 if (error == 0 &&
1035 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
1036 error = EFAULT;
1037 break;
1038 }
1039 break;
1040 }
1041 case VM_GET_REGISTER: {
1042 struct vm_register vmreg;
1043
1044 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1045 error = EFAULT;
1046 break;
1047 }
1048 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
1049 &vmreg.regval);
1050 if (error == 0 &&
1051 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
1052 error = EFAULT;
1053 break;
1054 }
1055 break;
1056 }
1057 case VM_SET_REGISTER: {
1058 struct vm_register vmreg;
1059
1060 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1061 error = EFAULT;
1062 break;
1063 }
1064 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
1065 vmreg.regval);
1066 break;
1067 }
1068 case VM_SET_SEGMENT_DESCRIPTOR: {
1069 struct vm_seg_desc vmsegd;
1070
1071 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1072 error = EFAULT;
1073 break;
1074 }
1075 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1076 &vmsegd.desc);
1077 break;
1078 }
1079 case VM_GET_SEGMENT_DESCRIPTOR: {
1080 struct vm_seg_desc vmsegd;
1081
1082 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1083 error = EFAULT;
1084 break;
1085 }
1086 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1087 &vmsegd.desc);
1088 if (error == 0 &&
1089 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
1090 error = EFAULT;
1091 break;
1092 }
1093 break;
1094 }
1095 case VM_GET_REGISTER_SET: {
1096 struct vm_register_set vrs;
1097 int regnums[VM_REG_LAST];
1098 uint64_t regvals[VM_REG_LAST];
1099
1100 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1101 error = EFAULT;
1102 break;
1103 }
1104 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1105 error = EINVAL;
1106 break;
1107 }
1108 if (ddi_copyin(vrs.regnums, regnums,
1109 sizeof (int) * vrs.count, md)) {
1110 error = EFAULT;
1111 break;
1112 }
1113
1114 error = 0;
1115 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1116 if (regnums[i] < 0) {
1117 error = EINVAL;
1118 break;
1119 }
1120 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
1121 ®vals[i]);
1122 }
1123 if (error == 0 && ddi_copyout(regvals, vrs.regvals,
1124 sizeof (uint64_t) * vrs.count, md)) {
1125 error = EFAULT;
1126 }
1127 break;
1128 }
1129 case VM_SET_REGISTER_SET: {
1130 struct vm_register_set vrs;
1131 int regnums[VM_REG_LAST];
1132 uint64_t regvals[VM_REG_LAST];
1133
1134 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1135 error = EFAULT;
1136 break;
1137 }
1138 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1139 error = EINVAL;
1140 break;
1141 }
1142 if (ddi_copyin(vrs.regnums, regnums,
1143 sizeof (int) * vrs.count, md)) {
1144 error = EFAULT;
1145 break;
1146 }
1147 if (ddi_copyin(vrs.regvals, regvals,
1148 sizeof (uint64_t) * vrs.count, md)) {
1149 error = EFAULT;
1150 break;
1151 }
1152
1153 error = 0;
1154 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1155 /*
1156 * Setting registers in a set is not atomic, since a
1157 * failure in the middle of the set will cause a
1158 * bail-out and inconsistent register state. Callers
1159 * should be wary of this.
1160 */
1161 if (regnums[i] < 0) {
1162 error = EINVAL;
1163 break;
1164 }
1165 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1166 regvals[i]);
1167 }
1168 break;
1169 }
1170 case VM_RESET_CPU: {
1171 struct vm_vcpu_reset vvr;
1172
1173 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1174 error = EFAULT;
1175 break;
1176 }
1177 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1178 error = EINVAL;
1179 }
1180
1181 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1182 break;
1183 }
1184 case VM_GET_RUN_STATE: {
1185 struct vm_run_state vrs;
1186
1187 bzero(&vrs, sizeof (vrs));
1188 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1189 &vrs.sipi_vector);
1190 if (error == 0) {
1191 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1192 error = EFAULT;
1193 break;
1194 }
1195 }
1196 break;
1197 }
1198 case VM_SET_RUN_STATE: {
1199 struct vm_run_state vrs;
1200
1201 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1202 error = EFAULT;
1203 break;
1204 }
1205 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1206 vrs.sipi_vector);
1207 break;
1208 }
1209 case VM_GET_FPU: {
1210 struct vm_fpu_state req;
1211 const size_t max_len = (PAGESIZE * 2);
1212 void *kbuf;
1213
1214 if (ddi_copyin(datap, &req, sizeof (req), md)) {
1215 error = EFAULT;
1216 break;
1217 }
1218 if (req.len > max_len || req.len == 0) {
1219 error = EINVAL;
1220 break;
1221 }
1222 kbuf = kmem_zalloc(req.len, KM_SLEEP);
1223 error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1224 if (error == 0) {
1225 if (ddi_copyout(kbuf, req.buf, req.len, md)) {
1226 error = EFAULT;
1227 }
1228 }
1229 kmem_free(kbuf, req.len);
1230 break;
1231 }
1232 case VM_SET_FPU: {
1233 struct vm_fpu_state req;
1234 const size_t max_len = (PAGESIZE * 2);
1235 void *kbuf;
1236
1237 if (ddi_copyin(datap, &req, sizeof (req), md)) {
1238 error = EFAULT;
1239 break;
1240 }
1241 if (req.len > max_len || req.len == 0) {
1242 error = EINVAL;
1243 break;
1244 }
1245 kbuf = kmem_alloc(req.len, KM_SLEEP);
1246 if (ddi_copyin(req.buf, kbuf, req.len, md)) {
1247 error = EFAULT;
1248 } else {
1249 error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1250 }
1251 kmem_free(kbuf, req.len);
1252 break;
1253 }
1254 case VM_GET_CPUID: {
1255 struct vm_vcpu_cpuid_config cfg;
1256 struct vcpu_cpuid_entry *entries = NULL;
1257
1258 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1259 error = EFAULT;
1260 break;
1261 }
1262 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1263 error = EINVAL;
1264 break;
1265 }
1266
1267 const size_t entries_size =
1268 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1269 if (entries_size != 0) {
1270 entries = kmem_zalloc(entries_size, KM_SLEEP);
1271 }
1272
1273 vcpu_cpuid_config_t vm_cfg = {
1274 .vcc_nent = cfg.vvcc_nent,
1275 .vcc_entries = entries,
1276 };
1277 error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1278
1279 /*
1280 * Only attempt to copy out the resultant entries if we were
1281 * able to query them from the instance. The flags and number
1282 * of entries are emitted regardless.
1283 */
1284 cfg.vvcc_flags = vm_cfg.vcc_flags;
1285 cfg.vvcc_nent = vm_cfg.vcc_nent;
1286 if (entries != NULL) {
1287 if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries,
1288 entries_size, md) != 0) {
1289 error = EFAULT;
1290 }
1291
1292 kmem_free(entries, entries_size);
1293 }
1294
1295 if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) {
1296 error = EFAULT;
1297 }
1298 break;
1299 }
1300 case VM_SET_CPUID: {
1301 struct vm_vcpu_cpuid_config cfg;
1302 struct vcpu_cpuid_entry *entries = NULL;
1303 size_t entries_size = 0;
1304
1305 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1306 error = EFAULT;
1307 break;
1308 }
1309 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1310 error = EFBIG;
1311 break;
1312 }
1313 if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) {
1314 /*
1315 * If we are being instructed to use "legacy" handling,
1316 * then no entries should be provided, since the static
1317 * in-kernel masking will be used.
1318 */
1319 if (cfg.vvcc_nent != 0) {
1320 error = EINVAL;
1321 break;
1322 }
1323 } else if (cfg.vvcc_nent != 0) {
1324 entries_size =
1325 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1326 entries = kmem_alloc(entries_size, KM_SLEEP);
1327
1328 if (ddi_copyin(cfg.vvcc_entries, entries, entries_size,
1329 md) != 0) {
1330 error = EFAULT;
1331 kmem_free(entries, entries_size);
1332 break;
1333 }
1334 }
1335
1336 vcpu_cpuid_config_t vm_cfg = {
1337 .vcc_flags = cfg.vvcc_flags,
1338 .vcc_nent = cfg.vvcc_nent,
1339 .vcc_entries = entries,
1340 };
1341 error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1342
1343 if (entries != NULL) {
1344 kmem_free(entries, entries_size);
1345 }
1346 break;
1347 }
1348 case VM_LEGACY_CPUID: {
1349 struct vm_legacy_cpuid vlc;
1350 if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) {
1351 error = EFAULT;
1352 break;
1353 }
1354 vlc.vlc_vcpuid = vcpu;
1355
1356 legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax,
1357 &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx);
1358
1359 if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) {
1360 error = EFAULT;
1361 break;
1362 }
1363 break;
1364 }
1365
1366 case VM_SET_KERNEMU_DEV:
1367 case VM_GET_KERNEMU_DEV: {
1368 struct vm_readwrite_kernemu_device kemu;
1369 size_t size = 0;
1370
1371 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1372 error = EFAULT;
1373 break;
1374 }
1375
1376 if (kemu.access_width > 3) {
1377 error = EINVAL;
1378 break;
1379 }
1380 size = (1 << kemu.access_width);
1381 ASSERT(size >= 1 && size <= 8);
1382
1383 if (cmd == VM_SET_KERNEMU_DEV) {
1384 error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1385 kemu.gpa, kemu.value, size);
1386 } else {
1387 error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1388 kemu.gpa, &kemu.value, size);
1389 }
1390
1391 if (error == 0) {
1392 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1393 error = EFAULT;
1394 break;
1395 }
1396 }
1397 break;
1398 }
1399
1400 case VM_GET_CAPABILITY: {
1401 struct vm_capability vmcap;
1402
1403 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1404 error = EFAULT;
1405 break;
1406 }
1407 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1408 &vmcap.capval);
1409 if (error == 0 &&
1410 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1411 error = EFAULT;
1412 break;
1413 }
1414 break;
1415 }
1416 case VM_SET_CAPABILITY: {
1417 struct vm_capability vmcap;
1418
1419 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1420 error = EFAULT;
1421 break;
1422 }
1423 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1424 vmcap.capval);
1425 break;
1426 }
1427 case VM_SET_X2APIC_STATE: {
1428 struct vm_x2apic x2apic;
1429
1430 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1431 error = EFAULT;
1432 break;
1433 }
1434 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1435 break;
1436 }
1437 case VM_GET_X2APIC_STATE: {
1438 struct vm_x2apic x2apic;
1439
1440 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1441 error = EFAULT;
1442 break;
1443 }
1444 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1445 &x2apic.state);
1446 if (error == 0 &&
1447 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1448 error = EFAULT;
1449 break;
1450 }
1451 break;
1452 }
1453 case VM_GET_GPA_PMAP: {
1454 /*
1455 * Until there is a necessity to leak EPT/RVI PTE values to
1456 * userspace, this will remain unimplemented
1457 */
1458 error = EINVAL;
1459 break;
1460 }
1461 case VM_GET_HPET_CAPABILITIES: {
1462 struct vm_hpet_cap hpetcap;
1463
1464 error = vhpet_getcap(&hpetcap);
1465 if (error == 0 &&
1466 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1467 error = EFAULT;
1468 break;
1469 }
1470 break;
1471 }
1472 case VM_GLA2GPA: {
1473 struct vm_gla2gpa gg;
1474
1475 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1476 error = EFAULT;
1477 break;
1478 }
1479 gg.vcpuid = vcpu;
1480 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1481 gg.prot, &gg.gpa, &gg.fault);
1482 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1483 error = EFAULT;
1484 break;
1485 }
1486 break;
1487 }
1488 case VM_GLA2GPA_NOFAULT: {
1489 struct vm_gla2gpa gg;
1490
1491 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1492 error = EFAULT;
1493 break;
1494 }
1495 gg.vcpuid = vcpu;
1496 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1497 gg.gla, gg.prot, &gg.gpa, &gg.fault);
1498 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1499 error = EFAULT;
1500 break;
1501 }
1502 break;
1503 }
1504
1505 case VM_ACTIVATE_CPU:
1506 error = vm_activate_cpu(sc->vmm_vm, vcpu);
1507 break;
1508
1509 case VM_SUSPEND_CPU:
1510 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1511 error = EFAULT;
1512 } else {
1513 error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1514 }
1515 break;
1516
1517 case VM_RESUME_CPU:
1518 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1519 error = EFAULT;
1520 } else {
1521 error = vm_resume_cpu(sc->vmm_vm, vcpu);
1522 }
1523 break;
1524
1525 case VM_VCPU_BARRIER:
1526 vcpu = arg;
1527 error = vm_vcpu_barrier(sc->vmm_vm, vcpu);
1528 break;
1529
1530 case VM_GET_CPUS: {
1531 struct vm_cpuset vm_cpuset;
1532 cpuset_t tempset;
1533 void *srcp = &tempset;
1534 int size;
1535
1536 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1537 error = EFAULT;
1538 break;
1539 }
1540
1541 /* Be more generous about sizing since our cpuset_t is large. */
1542 size = vm_cpuset.cpusetsize;
1543 if (size <= 0 || size > sizeof (cpuset_t)) {
1544 error = ERANGE;
1545 }
1546 /*
1547 * If they want a ulong_t or less, make sure they receive the
1548 * low bits with all the useful information.
1549 */
1550 if (size <= sizeof (tempset.cpub[0])) {
1551 srcp = &tempset.cpub[0];
1552 }
1553
1554 if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1555 tempset = vm_active_cpus(sc->vmm_vm);
1556 } else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1557 tempset = vm_debug_cpus(sc->vmm_vm);
1558 } else {
1559 error = EINVAL;
1560 }
1561
1562 ASSERT(size > 0 && size <= sizeof (tempset));
1563 if (error == 0 &&
1564 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1565 error = EFAULT;
1566 break;
1567 }
1568 break;
1569 }
1570 case VM_SET_INTINFO: {
1571 struct vm_intinfo vmii;
1572
1573 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1574 error = EFAULT;
1575 break;
1576 }
1577 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1578 break;
1579 }
1580 case VM_GET_INTINFO: {
1581 struct vm_intinfo vmii;
1582
1583 vmii.vcpuid = vcpu;
1584 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1585 &vmii.info2);
1586 if (error == 0 &&
1587 ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1588 error = EFAULT;
1589 break;
1590 }
1591 break;
1592 }
1593 case VM_RTC_WRITE: {
1594 struct vm_rtc_data rtcdata;
1595
1596 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1597 error = EFAULT;
1598 break;
1599 }
1600 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1601 rtcdata.value);
1602 break;
1603 }
1604 case VM_RTC_READ: {
1605 struct vm_rtc_data rtcdata;
1606
1607 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1608 error = EFAULT;
1609 break;
1610 }
1611 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1612 &rtcdata.value);
1613 if (error == 0 &&
1614 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1615 error = EFAULT;
1616 break;
1617 }
1618 break;
1619 }
1620 case VM_RTC_SETTIME: {
1621 timespec_t ts;
1622
1623 if (ddi_copyin(datap, &ts, sizeof (ts), md)) {
1624 error = EFAULT;
1625 break;
1626 }
1627 error = vrtc_set_time(sc->vmm_vm, &ts);
1628 break;
1629 }
1630 case VM_RTC_GETTIME: {
1631 timespec_t ts;
1632
1633 vrtc_get_time(sc->vmm_vm, &ts);
1634 if (ddi_copyout(&ts, datap, sizeof (ts), md)) {
1635 error = EFAULT;
1636 break;
1637 }
1638 break;
1639 }
1640
1641 case VM_PMTMR_LOCATE: {
1642 uint16_t port = arg;
1643 error = vpmtmr_set_location(sc->vmm_vm, port);
1644 break;
1645 }
1646
1647 case VM_RESTART_INSTRUCTION:
1648 error = vm_restart_instruction(sc->vmm_vm, vcpu);
1649 break;
1650
1651 case VM_SET_TOPOLOGY: {
1652 struct vm_cpu_topology topo;
1653
1654 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1655 error = EFAULT;
1656 break;
1657 }
1658 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1659 topo.threads, topo.maxcpus);
1660 break;
1661 }
1662 case VM_GET_TOPOLOGY: {
1663 struct vm_cpu_topology topo;
1664
1665 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1666 &topo.threads, &topo.maxcpus);
1667 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1668 error = EFAULT;
1669 break;
1670 }
1671 break;
1672 }
1673 case VM_DEVMEM_GETOFFSET: {
1674 struct vm_devmem_offset vdo;
1675 vmm_devmem_entry_t *de;
1676
1677 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1678 error = EFAULT;
1679 break;
1680 }
1681
1682 de = vmmdev_devmem_find(sc, vdo.segid);
1683 if (de != NULL) {
1684 vdo.offset = de->vde_off;
1685 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1686 error = EFAULT;
1687 }
1688 } else {
1689 error = ENOENT;
1690 }
1691 break;
1692 }
1693 case VM_TRACK_DIRTY_PAGES: {
1694 const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
1695 struct vmm_dirty_tracker tracker;
1696 uint8_t *bitmap;
1697 size_t len;
1698
1699 if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
1700 error = EFAULT;
1701 break;
1702 }
1703 if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
1704 error = EINVAL;
1705 break;
1706 }
1707 if (tracker.vdt_len == 0) {
1708 break;
1709 }
1710 if ((tracker.vdt_len & PAGEOFFSET) != 0) {
1711 error = EINVAL;
1712 break;
1713 }
1714 if (tracker.vdt_len > max_track_region_len) {
1715 error = EINVAL;
1716 break;
1717 }
1718 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8;
1719 bitmap = kmem_zalloc(len, KM_SLEEP);
1720 error = vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa,
1721 tracker.vdt_len, bitmap);
1722 if (error == 0 &&
1723 ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) {
1724 error = EFAULT;
1725 }
1726 kmem_free(bitmap, len);
1727
1728 break;
1729 }
1730 case VM_NPT_OPERATION: {
1731 struct vm_npt_operation vno;
1732 uint8_t *bitmap = NULL;
1733 uint64_t bitmap_size = 0;
1734
1735 if (ddi_copyin(datap, &vno, sizeof (vno), md) != 0) {
1736 error = EFAULT;
1737 break;
1738 }
1739 if ((vno.vno_gpa & PAGEOFFSET) != 0 ||
1740 (vno.vno_len & PAGEOFFSET) != 0) {
1741 error = EINVAL;
1742 break;
1743 }
1744 if ((UINT64_MAX - vno.vno_len) < vno.vno_gpa) {
1745 error = EOVERFLOW;
1746 break;
1747 }
1748
1749 /*
1750 * Allocate a bitmap for the operation if it is specified as
1751 * part of the input or output.
1752 */
1753 if ((vno.vno_operation &
1754 (VNO_FLAG_BITMAP_IN | VNO_FLAG_BITMAP_OUT)) != 0) {
1755 /*
1756 * Operations expecting data to be copied in or out
1757 * should not have zero length.
1758 */
1759 if (vno.vno_len == 0) {
1760 error = EINVAL;
1761 break;
1762 }
1763
1764 /*
1765 * Maximum bitmap size of 8 pages results in 1 GiB of
1766 * coverage.
1767 */
1768 const uint64_t max_bitmap_size = 8 * PAGESIZE;
1769
1770 bitmap_size = roundup(vno.vno_len / PAGESIZE, 8) / 8;
1771 if (bitmap_size > max_bitmap_size) {
1772 error = E2BIG;
1773 break;
1774 }
1775 bitmap = kmem_zalloc(bitmap_size, KM_SLEEP);
1776 }
1777
1778 if ((vno.vno_operation & VNO_FLAG_BITMAP_IN) != 0) {
1779 ASSERT(bitmap != NULL);
1780 if (ddi_copyin(vno.vno_bitmap, bitmap, bitmap_size,
1781 md) != 0) {
1782 error = EFAULT;
1783 }
1784 }
1785
1786 if (error == 0) {
1787 error = vm_npt_do_operation(sc->vmm_vm, vno.vno_gpa,
1788 vno.vno_len, vno.vno_operation, bitmap, rvalp);
1789 }
1790
1791 if ((vno.vno_operation & VNO_FLAG_BITMAP_OUT) != 0 &&
1792 error == 0) {
1793 ASSERT(bitmap != NULL);
1794 if (ddi_copyout(bitmap, vno.vno_bitmap, bitmap_size,
1795 md) != 0) {
1796 error = EFAULT;
1797 }
1798 }
1799
1800 if (bitmap != NULL) {
1801 kmem_free(bitmap, bitmap_size);
1802 }
1803
1804 break;
1805 }
1806 case VM_WRLOCK_CYCLE: {
1807 /*
1808 * Present a test mechanism to acquire/release the write lock
1809 * on the VM without any other effects.
1810 */
1811 break;
1812 }
1813 case VM_DATA_READ: {
1814 struct vm_data_xfer vdx;
1815
1816 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1817 error = EFAULT;
1818 break;
1819 }
1820 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1821 error = EINVAL;
1822 break;
1823 }
1824 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1825 error = EFBIG;
1826 break;
1827 }
1828
1829 const size_t len = vdx.vdx_len;
1830 void *buf = NULL;
1831 if (len != 0) {
1832 const void *udata = vdx.vdx_data;
1833
1834 buf = kmem_alloc(len, KM_SLEEP);
1835 if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) == 0) {
1836 bzero(buf, len);
1837 } else if (ddi_copyin(udata, buf, len, md) != 0) {
1838 kmem_free(buf, len);
1839 error = EFAULT;
1840 break;
1841 }
1842 }
1843
1844 vdx.vdx_result_len = 0;
1845 vmm_data_req_t req = {
1846 .vdr_class = vdx.vdx_class,
1847 .vdr_version = vdx.vdx_version,
1848 .vdr_flags = vdx.vdx_flags,
1849 .vdr_len = len,
1850 .vdr_data = buf,
1851 .vdr_result_len = &vdx.vdx_result_len,
1852 .vdr_vcpuid = vdx.vdx_vcpuid,
1853 };
1854 error = vmm_data_read(sc->vmm_vm, &req);
1855
1856 if (error == 0 && buf != NULL) {
1857 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1858 error = EFAULT;
1859 }
1860 }
1861
1862 /*
1863 * Copy out the transfer request so that the value of
1864 * vdx_result_len can be made available, regardless of any
1865 * error(s) which may have occurred.
1866 */
1867 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1868 error = (error != 0) ? error : EFAULT;
1869 }
1870
1871 if (buf != NULL) {
1872 kmem_free(buf, len);
1873 }
1874 break;
1875 }
1876 case VM_DATA_WRITE: {
1877 struct vm_data_xfer vdx;
1878
1879 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1880 error = EFAULT;
1881 break;
1882 }
1883 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1884 error = EINVAL;
1885 break;
1886 }
1887 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1888 error = EFBIG;
1889 break;
1890 }
1891
1892 const size_t len = vdx.vdx_len;
1893 void *buf = NULL;
1894 if (len != 0) {
1895 buf = kmem_alloc(len, KM_SLEEP);
1896 if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
1897 kmem_free(buf, len);
1898 error = EFAULT;
1899 break;
1900 }
1901 }
1902
1903 vdx.vdx_result_len = 0;
1904 vmm_data_req_t req = {
1905 .vdr_class = vdx.vdx_class,
1906 .vdr_version = vdx.vdx_version,
1907 .vdr_flags = vdx.vdx_flags,
1908 .vdr_len = len,
1909 .vdr_data = buf,
1910 .vdr_result_len = &vdx.vdx_result_len,
1911 .vdr_vcpuid = vdx.vdx_vcpuid,
1912 };
1913 if (vmm_allow_state_writes != 0) {
1914 error = vmm_data_write(sc->vmm_vm, &req);
1915 } else {
1916 /*
1917 * Reject the write if somone has thrown the switch back
1918 * into the "disallow" position.
1919 */
1920 error = EPERM;
1921 }
1922
1923 if (error == 0 && buf != NULL &&
1924 (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
1925 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1926 error = EFAULT;
1927 }
1928 }
1929
1930 /*
1931 * Copy out the transfer request so that the value of
1932 * vdx_result_len can be made available, regardless of any
1933 * error(s) which may have occurred.
1934 */
1935 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1936 error = (error != 0) ? error : EFAULT;
1937 }
1938
1939 if (buf != NULL) {
1940 kmem_free(buf, len);
1941 }
1942 break;
1943 }
1944
1945 case VM_PAUSE: {
1946 error = vm_pause_instance(sc->vmm_vm);
1947 break;
1948 }
1949 case VM_RESUME: {
1950 error = vm_resume_instance(sc->vmm_vm);
1951 break;
1952 }
1953
1954 default:
1955 error = ENOTTY;
1956 break;
1957 }
1958
1959 /* Release exclusion resources */
1960 switch (lock_type) {
1961 case LOCK_NONE:
1962 break;
1963 case LOCK_VCPU:
1964 vcpu_unlock_one(sc, vcpu);
1965 break;
1966 case LOCK_READ_HOLD:
1967 vmm_read_unlock(sc);
1968 break;
1969 case LOCK_WRITE_HOLD:
1970 vmm_write_unlock(sc);
1971 break;
1972 default:
1973 panic("unexpected lock type");
1974 break;
1975 }
1976
1977 return (error);
1978 }
1979
1980 static vmm_softc_t *
vmm_lookup(const char * name)1981 vmm_lookup(const char *name)
1982 {
1983 list_t *vml = &vmm_list;
1984 vmm_softc_t *sc;
1985
1986 ASSERT(MUTEX_HELD(&vmm_mtx));
1987
1988 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1989 if (strcmp(sc->vmm_name, name) == 0) {
1990 break;
1991 }
1992 }
1993
1994 return (sc);
1995 }
1996
1997 /*
1998 * Acquire an HMA registration if not already held.
1999 */
2000 static boolean_t
vmm_hma_acquire(void)2001 vmm_hma_acquire(void)
2002 {
2003 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
2004
2005 mutex_enter(&vmmdev_mtx);
2006
2007 if (vmmdev_hma_reg == NULL) {
2008 VERIFY3U(vmmdev_hma_ref, ==, 0);
2009 vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
2010 if (vmmdev_hma_reg == NULL) {
2011 cmn_err(CE_WARN, "%s HMA registration failed.",
2012 vmmdev_hvm_name);
2013 mutex_exit(&vmmdev_mtx);
2014 return (B_FALSE);
2015 }
2016 }
2017
2018 vmmdev_hma_ref++;
2019
2020 mutex_exit(&vmmdev_mtx);
2021
2022 return (B_TRUE);
2023 }
2024
2025 /*
2026 * Release the HMA registration if held and there are no remaining VMs.
2027 */
2028 static void
vmm_hma_release(void)2029 vmm_hma_release(void)
2030 {
2031 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
2032
2033 mutex_enter(&vmmdev_mtx);
2034
2035 VERIFY3U(vmmdev_hma_ref, !=, 0);
2036
2037 vmmdev_hma_ref--;
2038
2039 if (vmmdev_hma_ref == 0) {
2040 VERIFY(vmmdev_hma_reg != NULL);
2041 hma_unregister(vmmdev_hma_reg);
2042 vmmdev_hma_reg = NULL;
2043 }
2044 mutex_exit(&vmmdev_mtx);
2045 }
2046
2047 static int
vmmdev_do_vm_create(const struct vm_create_req * req,cred_t * cr)2048 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
2049 {
2050 vmm_softc_t *sc = NULL;
2051 minor_t minor;
2052 int error = ENOMEM;
2053 size_t len;
2054 const char *name = req->name;
2055
2056 len = strnlen(name, VM_MAX_NAMELEN);
2057 if (len == 0) {
2058 return (EINVAL);
2059 }
2060 if (len >= VM_MAX_NAMELEN) {
2061 return (ENAMETOOLONG);
2062 }
2063 if (strchr(name, '/') != NULL) {
2064 return (EINVAL);
2065 }
2066
2067 if (!vmm_hma_acquire())
2068 return (ENXIO);
2069
2070 mutex_enter(&vmm_mtx);
2071
2072 /* Look for duplicate names */
2073 if (vmm_lookup(name) != NULL) {
2074 mutex_exit(&vmm_mtx);
2075 vmm_hma_release();
2076 return (EEXIST);
2077 }
2078
2079 /* Allow only one instance per non-global zone. */
2080 if (!INGLOBALZONE(curproc)) {
2081 for (sc = list_head(&vmm_list); sc != NULL;
2082 sc = list_next(&vmm_list, sc)) {
2083 if (sc->vmm_zone == curzone) {
2084 mutex_exit(&vmm_mtx);
2085 vmm_hma_release();
2086 return (EINVAL);
2087 }
2088 }
2089 }
2090
2091 minor = id_alloc(vmm_minors);
2092 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
2093 goto fail;
2094 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2095 ddi_soft_state_free(vmm_statep, minor);
2096 goto fail;
2097 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
2098 DDI_PSEUDO, 0) != DDI_SUCCESS) {
2099 goto fail;
2100 }
2101
2102 if (vmm_kstat_alloc(sc, minor, cr) != 0) {
2103 goto fail;
2104 }
2105
2106 error = vm_create(req->flags, &sc->vmm_vm);
2107 if (error == 0) {
2108 /* Complete VM intialization and report success. */
2109 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
2110 sc->vmm_minor = minor;
2111 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
2112 offsetof(vmm_devmem_entry_t, vde_node));
2113
2114 list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
2115 offsetof(vmm_hold_t, vmh_node));
2116 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
2117
2118 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
2119 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
2120 offsetof(vmm_lease_t, vml_node));
2121 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
2122 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
2123
2124 sc->vmm_zone = crgetzone(cr);
2125 zone_hold(sc->vmm_zone);
2126 vmm_zsd_add_vm(sc);
2127 vmm_kstat_init(sc);
2128
2129 list_insert_tail(&vmm_list, sc);
2130 mutex_exit(&vmm_mtx);
2131 return (0);
2132 }
2133
2134 vmm_kstat_fini(sc);
2135 ddi_remove_minor_node(vmmdev_dip, name);
2136 fail:
2137 id_free(vmm_minors, minor);
2138 if (sc != NULL) {
2139 ddi_soft_state_free(vmm_statep, minor);
2140 }
2141 mutex_exit(&vmm_mtx);
2142 vmm_hma_release();
2143
2144 return (error);
2145 }
2146
2147 /*
2148 * Bhyve 'Driver' Interface
2149 *
2150 * While many devices are emulated in the bhyve userspace process, there are
2151 * others with performance constraints which require that they run mostly or
2152 * entirely in-kernel. For those not integrated directly into bhyve, an API is
2153 * needed so they can query/manipulate the portions of VM state needed to
2154 * fulfill their purpose.
2155 *
2156 * This includes:
2157 * - Translating guest-physical addresses to host-virtual pointers
2158 * - Injecting MSIs
2159 * - Hooking IO port addresses
2160 *
2161 * The vmm_drv interface exists to provide that functionality to its consumers.
2162 * (At this time, 'viona' is the only user)
2163 */
2164 int
vmm_drv_hold(file_t * fp,cred_t * cr,vmm_hold_t ** holdp)2165 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
2166 {
2167 vnode_t *vp = fp->f_vnode;
2168 const dev_t dev = vp->v_rdev;
2169 vmm_softc_t *sc;
2170 vmm_hold_t *hold;
2171 int err = 0;
2172
2173 if (vp->v_type != VCHR) {
2174 return (ENXIO);
2175 }
2176 const major_t major = getmajor(dev);
2177 const minor_t minor = getminor(dev);
2178
2179 mutex_enter(&vmmdev_mtx);
2180 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
2181 mutex_exit(&vmmdev_mtx);
2182 return (ENOENT);
2183 }
2184 mutex_enter(&vmm_mtx);
2185 mutex_exit(&vmmdev_mtx);
2186
2187 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2188 err = ENOENT;
2189 goto out;
2190 }
2191 /* XXXJOY: check cred permissions against instance */
2192
2193 if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2194 err = EBUSY;
2195 goto out;
2196 }
2197
2198 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
2199 hold->vmh_sc = sc;
2200 hold->vmh_release_req = B_FALSE;
2201
2202 list_insert_tail(&sc->vmm_holds, hold);
2203 sc->vmm_flags |= VMM_HELD;
2204 *holdp = hold;
2205
2206 out:
2207 mutex_exit(&vmm_mtx);
2208 return (err);
2209 }
2210
2211 void
vmm_drv_rele(vmm_hold_t * hold)2212 vmm_drv_rele(vmm_hold_t *hold)
2213 {
2214 vmm_softc_t *sc;
2215 bool hma_release = false;
2216
2217 ASSERT(hold != NULL);
2218 ASSERT(hold->vmh_sc != NULL);
2219 VERIFY(hold->vmh_ioport_hook_cnt == 0);
2220 VERIFY(hold->vmh_mmio_hook_cnt == 0);
2221
2222 mutex_enter(&vmm_mtx);
2223 sc = hold->vmh_sc;
2224 list_remove(&sc->vmm_holds, hold);
2225 kmem_free(hold, sizeof (*hold));
2226
2227 if (list_is_empty(&sc->vmm_holds)) {
2228 sc->vmm_flags &= ~VMM_HELD;
2229
2230 /*
2231 * Since outstanding holds would prevent instance destruction
2232 * from completing, attempt to finish it now if it was already
2233 * set in motion.
2234 */
2235 if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2236 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT,
2237 &hma_release));
2238 }
2239 }
2240 mutex_exit(&vmm_mtx);
2241
2242 if (hma_release) {
2243 vmm_hma_release();
2244 }
2245 }
2246
2247 boolean_t
vmm_drv_release_reqd(vmm_hold_t * hold)2248 vmm_drv_release_reqd(vmm_hold_t *hold)
2249 {
2250 ASSERT(hold != NULL);
2251
2252 return (hold->vmh_release_req);
2253 }
2254
2255 vmm_lease_t *
vmm_drv_lease_sign(vmm_hold_t * hold,boolean_t (* expiref)(void *),void * arg)2256 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
2257 {
2258 vmm_softc_t *sc = hold->vmh_sc;
2259 vmm_lease_t *lease;
2260
2261 ASSERT3P(expiref, !=, NULL);
2262
2263 if (hold->vmh_release_req) {
2264 return (NULL);
2265 }
2266
2267 lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
2268 list_link_init(&lease->vml_node);
2269 lease->vml_expire_func = expiref;
2270 lease->vml_expire_arg = arg;
2271 lease->vml_expired = B_FALSE;
2272 lease->vml_break_deferred = B_FALSE;
2273 lease->vml_hold = hold;
2274 /* cache the VM pointer for one less pointer chase */
2275 lease->vml_vm = sc->vmm_vm;
2276 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
2277
2278 mutex_enter(&sc->vmm_lease_lock);
2279 while (sc->vmm_lease_blocker != 0) {
2280 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2281 }
2282 list_insert_tail(&sc->vmm_lease_list, lease);
2283 vmm_read_lock(sc);
2284 mutex_exit(&sc->vmm_lease_lock);
2285
2286 return (lease);
2287 }
2288
2289 static void
vmm_lease_break_locked(vmm_softc_t * sc,vmm_lease_t * lease)2290 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
2291 {
2292 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
2293
2294 list_remove(&sc->vmm_lease_list, lease);
2295 vmm_read_unlock(sc);
2296 vmc_destroy(lease->vml_vmclient);
2297 kmem_free(lease, sizeof (*lease));
2298 }
2299
2300 static void
vmm_lease_block(vmm_softc_t * sc)2301 vmm_lease_block(vmm_softc_t *sc)
2302 {
2303 mutex_enter(&sc->vmm_lease_lock);
2304 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
2305 sc->vmm_lease_blocker++;
2306 if (sc->vmm_lease_blocker == 1) {
2307 list_t *list = &sc->vmm_lease_list;
2308 vmm_lease_t *lease = list_head(list);
2309
2310 while (lease != NULL) {
2311 void *arg = lease->vml_expire_arg;
2312 boolean_t (*expiref)(void *) = lease->vml_expire_func;
2313 boolean_t sync_break = B_FALSE;
2314
2315 /*
2316 * Since the lease expiration notification may
2317 * need to take locks which would deadlock with
2318 * vmm_lease_lock, drop it across the call.
2319 *
2320 * We are the only one allowed to manipulate
2321 * vmm_lease_list right now, so it is safe to
2322 * continue iterating through it after
2323 * reacquiring the lock.
2324 */
2325 lease->vml_expired = B_TRUE;
2326 mutex_exit(&sc->vmm_lease_lock);
2327 sync_break = expiref(arg);
2328 mutex_enter(&sc->vmm_lease_lock);
2329
2330 if (sync_break) {
2331 vmm_lease_t *next;
2332
2333 /*
2334 * These leases which are synchronously broken
2335 * result in vmm_read_unlock() calls from a
2336 * different thread than the corresponding
2337 * vmm_read_lock(). This is acceptable, given
2338 * that the rwlock underpinning the whole
2339 * mechanism tolerates the behavior. This
2340 * flexibility is _only_ afforded to VM read
2341 * lock (RW_READER) holders.
2342 */
2343 next = list_next(list, lease);
2344 vmm_lease_break_locked(sc, lease);
2345 lease = next;
2346 } else {
2347 lease = list_next(list, lease);
2348 }
2349 }
2350
2351 /* Process leases which were not broken synchronously. */
2352 while (!list_is_empty(list)) {
2353 /*
2354 * Although the nested loops are quadratic, the number
2355 * of leases is small.
2356 */
2357 lease = list_head(list);
2358 while (lease != NULL) {
2359 vmm_lease_t *next = list_next(list, lease);
2360 if (lease->vml_break_deferred) {
2361 vmm_lease_break_locked(sc, lease);
2362 }
2363 lease = next;
2364 }
2365 if (list_is_empty(list)) {
2366 break;
2367 }
2368 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2369 }
2370 /* Wake anyone else waiting for the lease list to be empty */
2371 cv_broadcast(&sc->vmm_lease_cv);
2372 } else {
2373 list_t *list = &sc->vmm_lease_list;
2374
2375 /*
2376 * Some other thread beat us to the duty of lease cleanup.
2377 * Wait until that is complete.
2378 */
2379 while (!list_is_empty(list)) {
2380 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2381 }
2382 }
2383 mutex_exit(&sc->vmm_lease_lock);
2384 }
2385
2386 static void
vmm_lease_unblock(vmm_softc_t * sc)2387 vmm_lease_unblock(vmm_softc_t *sc)
2388 {
2389 mutex_enter(&sc->vmm_lease_lock);
2390 VERIFY3U(sc->vmm_lease_blocker, !=, 0);
2391 sc->vmm_lease_blocker--;
2392 if (sc->vmm_lease_blocker == 0) {
2393 cv_broadcast(&sc->vmm_lease_cv);
2394 }
2395 mutex_exit(&sc->vmm_lease_lock);
2396 }
2397
2398 void
vmm_drv_lease_break(vmm_hold_t * hold,vmm_lease_t * lease)2399 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
2400 {
2401 vmm_softc_t *sc = hold->vmh_sc;
2402
2403 VERIFY3P(hold, ==, lease->vml_hold);
2404 VERIFY(!lease->vml_break_deferred);
2405
2406 mutex_enter(&sc->vmm_lease_lock);
2407 if (sc->vmm_lease_blocker == 0) {
2408 vmm_lease_break_locked(sc, lease);
2409 } else {
2410 /*
2411 * Defer the lease-breaking to whichever thread is currently
2412 * cleaning up all leases as part of a vmm_lease_block() call.
2413 */
2414 lease->vml_break_deferred = B_TRUE;
2415 cv_broadcast(&sc->vmm_lease_cv);
2416 }
2417 mutex_exit(&sc->vmm_lease_lock);
2418 }
2419
2420 boolean_t
vmm_drv_lease_expired(vmm_lease_t * lease)2421 vmm_drv_lease_expired(vmm_lease_t *lease)
2422 {
2423 return (lease->vml_expired);
2424 }
2425
2426 vmm_page_t *
vmm_drv_page_hold(vmm_lease_t * lease,uintptr_t gpa,int prot)2427 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
2428 {
2429 ASSERT(lease != NULL);
2430 ASSERT0(gpa & PAGEOFFSET);
2431
2432 return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
2433 }
2434
2435
2436 /* Ensure that flags mirrored by vmm_drv interface properly match up */
2437 CTASSERT(VMPF_DEFER_DIRTY == VPF_DEFER_DIRTY);
2438
2439 vmm_page_t *
vmm_drv_page_hold_ext(vmm_lease_t * lease,uintptr_t gpa,int prot,int flags)2440 vmm_drv_page_hold_ext(vmm_lease_t *lease, uintptr_t gpa, int prot, int flags)
2441 {
2442 ASSERT(lease != NULL);
2443 ASSERT0(gpa & PAGEOFFSET);
2444
2445 vmm_page_t *page =
2446 (vmm_page_t *)vmc_hold_ext(lease->vml_vmclient, gpa, prot, flags);
2447 return (page);
2448 }
2449
2450 void
vmm_drv_page_release(vmm_page_t * vmmp)2451 vmm_drv_page_release(vmm_page_t *vmmp)
2452 {
2453 (void) vmp_release((vm_page_t *)vmmp);
2454 }
2455
2456 void
vmm_drv_page_release_chain(vmm_page_t * vmmp)2457 vmm_drv_page_release_chain(vmm_page_t *vmmp)
2458 {
2459 (void) vmp_release_chain((vm_page_t *)vmmp);
2460 }
2461
2462 const void *
vmm_drv_page_readable(const vmm_page_t * vmmp)2463 vmm_drv_page_readable(const vmm_page_t *vmmp)
2464 {
2465 return (vmp_get_readable((const vm_page_t *)vmmp));
2466 }
2467
2468 void *
vmm_drv_page_writable(const vmm_page_t * vmmp)2469 vmm_drv_page_writable(const vmm_page_t *vmmp)
2470 {
2471 return (vmp_get_writable((const vm_page_t *)vmmp));
2472 }
2473
2474 void
vmm_drv_page_mark_dirty(vmm_page_t * vmmp)2475 vmm_drv_page_mark_dirty(vmm_page_t *vmmp)
2476 {
2477 return (vmp_mark_dirty((vm_page_t *)vmmp));
2478 }
2479
2480 void
vmm_drv_page_chain(vmm_page_t * vmmp,vmm_page_t * to_chain)2481 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
2482 {
2483 vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
2484 }
2485
2486 vmm_page_t *
vmm_drv_page_next(const vmm_page_t * vmmp)2487 vmm_drv_page_next(const vmm_page_t *vmmp)
2488 {
2489 return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
2490 }
2491
2492 int
vmm_drv_msi(vmm_lease_t * lease,uint64_t addr,uint64_t msg)2493 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
2494 {
2495 ASSERT(lease != NULL);
2496
2497 return (lapic_intr_msi(lease->vml_vm, addr, msg));
2498 }
2499
2500 int
vmm_drv_ioport_hook(vmm_hold_t * hold,uint16_t ioport,vmm_drv_iop_cb_t func,void * arg,void ** cookie)2501 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
2502 void *arg, void **cookie)
2503 {
2504 vmm_softc_t *sc;
2505 int err;
2506
2507 ASSERT(hold != NULL);
2508 ASSERT(cookie != NULL);
2509
2510 sc = hold->vmh_sc;
2511 mutex_enter(&vmm_mtx);
2512 /* Confirm that hook installation is not blocked */
2513 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2514 mutex_exit(&vmm_mtx);
2515 return (EBUSY);
2516 }
2517 /*
2518 * Optimistically record an installed hook which will prevent a block
2519 * from being asserted while the mutex is dropped.
2520 */
2521 if (hold->vmh_ioport_hook_cnt == UINT_MAX) {
2522 mutex_exit(&vmm_mtx);
2523 return (ENOSPC);
2524 }
2525 hold->vmh_ioport_hook_cnt++;
2526 mutex_exit(&vmm_mtx);
2527
2528 vmm_write_lock(sc);
2529 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
2530 arg, cookie);
2531 vmm_write_unlock(sc);
2532
2533 if (err != 0) {
2534 mutex_enter(&vmm_mtx);
2535 /* Walk back optimism about the hook installation */
2536 hold->vmh_ioport_hook_cnt--;
2537 mutex_exit(&vmm_mtx);
2538 }
2539 return (err);
2540 }
2541
2542 void
vmm_drv_ioport_unhook(vmm_hold_t * hold,void ** cookie)2543 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
2544 {
2545 vmm_softc_t *sc;
2546
2547 ASSERT(hold != NULL);
2548 ASSERT(cookie != NULL);
2549 ASSERT(hold->vmh_ioport_hook_cnt != 0);
2550
2551 sc = hold->vmh_sc;
2552 vmm_write_lock(sc);
2553 vm_ioport_unhook(sc->vmm_vm, cookie);
2554 vmm_write_unlock(sc);
2555
2556 mutex_enter(&vmm_mtx);
2557 hold->vmh_ioport_hook_cnt--;
2558 mutex_exit(&vmm_mtx);
2559 }
2560
2561 int
vmm_drv_mmio_hook(vmm_hold_t * hold,uint64_t address,uint32_t size,vmm_drv_mmio_cb_t func,void * arg,void ** cookie)2562 vmm_drv_mmio_hook(vmm_hold_t *hold, uint64_t address, uint32_t size,
2563 vmm_drv_mmio_cb_t func, void *arg, void **cookie)
2564 {
2565 vmm_softc_t *sc;
2566 int err;
2567
2568 ASSERT(hold != NULL);
2569 ASSERT(cookie != NULL);
2570
2571 if (UINT64_MAX - size < address)
2572 return (EOVERFLOW);
2573
2574 sc = hold->vmh_sc;
2575 mutex_enter(&vmm_mtx);
2576 /* Confirm that hook installation is not blocked */
2577 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2578 mutex_exit(&vmm_mtx);
2579 return (EBUSY);
2580 }
2581 /*
2582 * Optimistically record an installed hook which will prevent a block
2583 * from being asserted while the mutex is dropped.
2584 */
2585 if (hold->vmh_mmio_hook_cnt == UINT_MAX) {
2586 mutex_exit(&vmm_mtx);
2587 return (ENOSPC);
2588 }
2589 hold->vmh_mmio_hook_cnt++;
2590 mutex_exit(&vmm_mtx);
2591
2592 vmm_write_lock(sc);
2593 err = vm_mmio_hook(sc->vmm_vm, address, size, (mmio_handler_t)func,
2594 arg, cookie);
2595 vmm_write_unlock(sc);
2596
2597 if (err != 0) {
2598 mutex_enter(&vmm_mtx);
2599 /* Walk back optimism about the hook installation */
2600 hold->vmh_mmio_hook_cnt--;
2601 mutex_exit(&vmm_mtx);
2602 }
2603 return (err);
2604 }
2605
2606 int
vmm_drv_mmio_unhook(vmm_hold_t * hold,void ** cookie)2607 vmm_drv_mmio_unhook(vmm_hold_t *hold, void **cookie)
2608 {
2609 vmm_softc_t *sc;
2610 int ret;
2611
2612 ASSERT(hold != NULL);
2613 ASSERT(cookie != NULL);
2614 ASSERT(hold->vmh_mmio_hook_cnt != 0);
2615
2616 sc = hold->vmh_sc;
2617 vmm_write_lock(sc);
2618 ret = vm_mmio_unhook(sc->vmm_vm, cookie);
2619 vmm_write_unlock(sc);
2620
2621 if (ret == 0) {
2622 mutex_enter(&vmm_mtx);
2623 hold->vmh_mmio_hook_cnt--;
2624 mutex_exit(&vmm_mtx);
2625 }
2626
2627 return (ret);
2628 }
2629
2630 static void
vmm_drv_purge(vmm_softc_t * sc)2631 vmm_drv_purge(vmm_softc_t *sc)
2632 {
2633 ASSERT(MUTEX_HELD(&vmm_mtx));
2634
2635 if ((sc->vmm_flags & VMM_HELD) != 0) {
2636 vmm_hold_t *hold;
2637
2638 for (hold = list_head(&sc->vmm_holds); hold != NULL;
2639 hold = list_next(&sc->vmm_holds, hold)) {
2640 hold->vmh_release_req = B_TRUE;
2641 }
2642
2643 /*
2644 * Require that all leases on the instance be broken, now that
2645 * all associated holds have been marked as needing release.
2646 *
2647 * Dropping vmm_mtx is not strictly necessary, but if any of the
2648 * lessees are slow to respond, it would be nice to leave it
2649 * available for other parties.
2650 */
2651 mutex_exit(&vmm_mtx);
2652 vmm_lease_block(sc);
2653 vmm_lease_unblock(sc);
2654 mutex_enter(&vmm_mtx);
2655 }
2656 }
2657
2658 static int
vmm_drv_block_hook(vmm_softc_t * sc,boolean_t enable_block)2659 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
2660 {
2661 int err = 0;
2662
2663 mutex_enter(&vmm_mtx);
2664 if (!enable_block) {
2665 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
2666
2667 sc->vmm_flags &= ~VMM_BLOCK_HOOK;
2668 goto done;
2669 }
2670
2671 /* If any holds have hooks installed, the block is a failure */
2672 if (!list_is_empty(&sc->vmm_holds)) {
2673 vmm_hold_t *hold;
2674
2675 for (hold = list_head(&sc->vmm_holds); hold != NULL;
2676 hold = list_next(&sc->vmm_holds, hold)) {
2677 if (hold->vmh_ioport_hook_cnt != 0 ||
2678 hold->vmh_mmio_hook_cnt != 0) {
2679 err = EBUSY;
2680 goto done;
2681 }
2682 }
2683 }
2684 sc->vmm_flags |= VMM_BLOCK_HOOK;
2685
2686 done:
2687 mutex_exit(&vmm_mtx);
2688 return (err);
2689 }
2690
2691
2692 static void
vmm_destroy_begin(vmm_softc_t * sc,vmm_destroy_opts_t opts)2693 vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts)
2694 {
2695 ASSERT(MUTEX_HELD(&vmm_mtx));
2696 ASSERT0(sc->vmm_flags & VMM_DESTROY);
2697
2698 sc->vmm_flags |= VMM_DESTROY;
2699
2700 /*
2701 * Lock and unlock all of the vCPUs to ensure that they are kicked out
2702 * of guest context, being unable to return now that the instance is
2703 * marked for destruction.
2704 */
2705 const int maxcpus = vm_get_maxcpus(sc->vmm_vm);
2706 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
2707 vcpu_lock_one(sc, vcpu);
2708 vcpu_unlock_one(sc, vcpu);
2709 }
2710
2711 vmmdev_devmem_purge(sc);
2712 if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
2713 /*
2714 * The ZSD should be cleaned up now, unless destruction of the
2715 * instance was initated by destruction of the containing zone,
2716 * in which case the ZSD has already been removed.
2717 */
2718 vmm_zsd_rem_vm(sc);
2719 }
2720 zone_rele(sc->vmm_zone);
2721
2722 vmm_drv_purge(sc);
2723 }
2724
2725 static bool
vmm_destroy_ready(vmm_softc_t * sc)2726 vmm_destroy_ready(vmm_softc_t *sc)
2727 {
2728 ASSERT(MUTEX_HELD(&vmm_mtx));
2729
2730 if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) {
2731 VERIFY(list_is_empty(&sc->vmm_holds));
2732 return (true);
2733 }
2734
2735 return (false);
2736 }
2737
2738 static void
vmm_destroy_finish(vmm_softc_t * sc)2739 vmm_destroy_finish(vmm_softc_t *sc)
2740 {
2741 ASSERT(MUTEX_HELD(&vmm_mtx));
2742 ASSERT(vmm_destroy_ready(sc));
2743
2744 list_remove(&vmm_list, sc);
2745 vmm_kstat_fini(sc);
2746 vm_destroy(sc->vmm_vm);
2747 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2748 (void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE);
2749
2750 const minor_t minor = sc->vmm_minor;
2751 ddi_soft_state_free(vmm_statep, minor);
2752 id_free(vmm_minors, minor);
2753 }
2754
2755 /*
2756 * Initiate or attempt to finish destruction of a VMM instance.
2757 *
2758 * This is called from several contexts:
2759 * - An explicit destroy ioctl is made
2760 * - A vmm_drv consumer releases its hold (being the last on the instance)
2761 * - The vmm device is closed, and auto-destruct is enabled
2762 */
2763 static int
vmm_destroy_locked(vmm_softc_t * sc,vmm_destroy_opts_t opts,bool * hma_release)2764 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
2765 bool *hma_release)
2766 {
2767 ASSERT(MUTEX_HELD(&vmm_mtx));
2768
2769 *hma_release = false;
2770
2771 /*
2772 * When instance destruction begins, it is so marked such that any
2773 * further requests to operate the instance will fail.
2774 */
2775 if ((sc->vmm_flags & VMM_DESTROY) == 0) {
2776 vmm_destroy_begin(sc, opts);
2777 }
2778
2779 if (vmm_destroy_ready(sc)) {
2780
2781 /*
2782 * Notify anyone waiting for the destruction to finish. They
2783 * must be clear before we can safely tear down the softc.
2784 */
2785 if (sc->vmm_destroy_waiters != 0) {
2786 cv_broadcast(&sc->vmm_cv);
2787 while (sc->vmm_destroy_waiters != 0) {
2788 cv_wait(&sc->vmm_cv, &vmm_mtx);
2789 }
2790 }
2791
2792 /*
2793 * Finish destruction of instance. After this point, the softc
2794 * is freed and cannot be accessed again.
2795 *
2796 * With destruction complete, the HMA hold can be released
2797 */
2798 vmm_destroy_finish(sc);
2799 *hma_release = true;
2800 return (0);
2801 } else if ((opts & VDO_ATTEMPT_WAIT) != 0) {
2802 int err = 0;
2803
2804 sc->vmm_destroy_waiters++;
2805 while (!vmm_destroy_ready(sc) && err == 0) {
2806 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
2807 err = EINTR;
2808 }
2809 }
2810 sc->vmm_destroy_waiters--;
2811
2812 if (sc->vmm_destroy_waiters == 0) {
2813 /*
2814 * If we were the last waiter, it could be that VM
2815 * destruction is waiting on _us_ to proceed with the
2816 * final clean-up.
2817 */
2818 cv_signal(&sc->vmm_cv);
2819 }
2820 return (err);
2821 } else {
2822 /*
2823 * Since the instance is not ready for destruction, and the
2824 * caller did not ask to wait, consider it a success for now.
2825 */
2826 return (0);
2827 }
2828 }
2829
2830 void
vmm_zone_vm_destroy(vmm_softc_t * sc)2831 vmm_zone_vm_destroy(vmm_softc_t *sc)
2832 {
2833 bool hma_release = false;
2834 int err;
2835
2836 mutex_enter(&vmm_mtx);
2837 err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
2838 mutex_exit(&vmm_mtx);
2839
2840 VERIFY0(err);
2841
2842 if (hma_release) {
2843 vmm_hma_release();
2844 }
2845 }
2846
2847 static int
vmmdev_do_vm_destroy(const struct vm_destroy_req * req,cred_t * cr)2848 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2849 {
2850 vmm_softc_t *sc;
2851 bool hma_release = false;
2852 int err;
2853
2854 if (crgetuid(cr) != 0) {
2855 return (EPERM);
2856 }
2857
2858 mutex_enter(&vmm_mtx);
2859 sc = vmm_lookup(req->name);
2860 if (sc == NULL) {
2861 mutex_exit(&vmm_mtx);
2862 return (ENOENT);
2863 }
2864 /*
2865 * We don't check this in vmm_lookup() since that function is also used
2866 * for validation during create and currently vmm names must be unique.
2867 */
2868 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2869 mutex_exit(&vmm_mtx);
2870 return (EPERM);
2871 }
2872
2873 err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release);
2874 mutex_exit(&vmm_mtx);
2875
2876 if (hma_release) {
2877 vmm_hma_release();
2878 }
2879
2880 return (err);
2881 }
2882
2883 #define VCPU_NAME_BUFLEN 32
2884
2885 static int
vmm_kstat_alloc(vmm_softc_t * sc,minor_t minor,const cred_t * cr)2886 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2887 {
2888 zoneid_t zid = crgetzoneid(cr);
2889 int instance = minor;
2890 kstat_t *ksp;
2891
2892 ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2893
2894 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2895 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2896 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2897
2898 if (ksp == NULL) {
2899 return (-1);
2900 }
2901 sc->vmm_kstat_vm = ksp;
2902
2903 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2904 char namebuf[VCPU_NAME_BUFLEN];
2905
2906 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2907
2908 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2909 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2910 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2911 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2912 0, zid);
2913 if (ksp == NULL) {
2914 goto fail;
2915 }
2916
2917 sc->vmm_kstat_vcpu[i] = ksp;
2918 }
2919
2920 /*
2921 * If this instance is associated with a non-global zone, make its
2922 * kstats visible from the GZ.
2923 */
2924 if (zid != GLOBAL_ZONEID) {
2925 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2926 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2927 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2928 }
2929 }
2930
2931 return (0);
2932
2933 fail:
2934 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2935 if (sc->vmm_kstat_vcpu[i] != NULL) {
2936 kstat_delete(sc->vmm_kstat_vcpu[i]);
2937 sc->vmm_kstat_vcpu[i] = NULL;
2938 } else {
2939 break;
2940 }
2941 }
2942 kstat_delete(sc->vmm_kstat_vm);
2943 sc->vmm_kstat_vm = NULL;
2944 return (-1);
2945 }
2946
2947 static void
vmm_kstat_init(vmm_softc_t * sc)2948 vmm_kstat_init(vmm_softc_t *sc)
2949 {
2950 kstat_t *ksp;
2951
2952 ASSERT3P(sc->vmm_vm, !=, NULL);
2953 ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2954
2955 ksp = sc->vmm_kstat_vm;
2956 vmm_kstats_t *vk = ksp->ks_data;
2957 ksp->ks_private = sc->vmm_vm;
2958 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2959 kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2960
2961 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2962 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2963
2964 ksp = sc->vmm_kstat_vcpu[i];
2965 vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2966
2967 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2968 vvk->vvk_vcpu.value.ui32 = i;
2969 kstat_named_init(&vvk->vvk_time_init, "time_init",
2970 KSTAT_DATA_UINT64);
2971 kstat_named_init(&vvk->vvk_time_run, "time_run",
2972 KSTAT_DATA_UINT64);
2973 kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2974 KSTAT_DATA_UINT64);
2975 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2976 KSTAT_DATA_UINT64);
2977 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2978 KSTAT_DATA_UINT64);
2979 kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2980 KSTAT_DATA_UINT64);
2981 ksp->ks_private = sc->vmm_vm;
2982 ksp->ks_update = vmm_kstat_update_vcpu;
2983 }
2984
2985 kstat_install(sc->vmm_kstat_vm);
2986 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2987 kstat_install(sc->vmm_kstat_vcpu[i]);
2988 }
2989 }
2990
2991 static void
vmm_kstat_fini(vmm_softc_t * sc)2992 vmm_kstat_fini(vmm_softc_t *sc)
2993 {
2994 ASSERT(sc->vmm_kstat_vm != NULL);
2995
2996 kstat_delete(sc->vmm_kstat_vm);
2997 sc->vmm_kstat_vm = NULL;
2998
2999 for (uint_t i = 0; i < VM_MAXCPU; i++) {
3000 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
3001
3002 kstat_delete(sc->vmm_kstat_vcpu[i]);
3003 sc->vmm_kstat_vcpu[i] = NULL;
3004 }
3005 }
3006
3007 static int
vmm_open(dev_t * devp,int flag,int otyp,cred_t * credp)3008 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
3009 {
3010 minor_t minor;
3011 vmm_softc_t *sc;
3012
3013 /*
3014 * Forbid running bhyve in a 32-bit process until it has been tested and
3015 * verified to be safe.
3016 */
3017 if (curproc->p_model != DATAMODEL_LP64) {
3018 return (EFBIG);
3019 }
3020
3021 minor = getminor(*devp);
3022 if (minor == VMM_CTL_MINOR) {
3023 /*
3024 * Master control device must be opened exclusively.
3025 */
3026 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
3027 return (EINVAL);
3028 }
3029
3030 return (0);
3031 }
3032
3033 mutex_enter(&vmm_mtx);
3034 sc = ddi_get_soft_state(vmm_statep, minor);
3035 if (sc == NULL) {
3036 mutex_exit(&vmm_mtx);
3037 return (ENXIO);
3038 }
3039
3040 sc->vmm_flags |= VMM_IS_OPEN;
3041 mutex_exit(&vmm_mtx);
3042
3043 return (0);
3044 }
3045
3046 static int
vmm_close(dev_t dev,int flag,int otyp,cred_t * credp)3047 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
3048 {
3049 const minor_t minor = getminor(dev);
3050 vmm_softc_t *sc;
3051 bool hma_release = false;
3052
3053 if (minor == VMM_CTL_MINOR) {
3054 return (0);
3055 }
3056
3057 mutex_enter(&vmm_mtx);
3058 sc = ddi_get_soft_state(vmm_statep, minor);
3059 if (sc == NULL) {
3060 mutex_exit(&vmm_mtx);
3061 return (ENXIO);
3062 }
3063
3064 VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0);
3065 sc->vmm_flags &= ~VMM_IS_OPEN;
3066
3067 /*
3068 * If instance was marked for auto-destruction begin that now. Instance
3069 * destruction may have been initated already, so try to make progress
3070 * in that case, since closure of the device is one of its requirements.
3071 */
3072 if ((sc->vmm_flags & VMM_DESTROY) != 0 ||
3073 (sc->vmm_flags & VMM_AUTODESTROY) != 0) {
3074 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
3075 }
3076 mutex_exit(&vmm_mtx);
3077
3078 if (hma_release) {
3079 vmm_hma_release();
3080 }
3081
3082 return (0);
3083 }
3084
3085 static int
vmm_is_supported(intptr_t arg)3086 vmm_is_supported(intptr_t arg)
3087 {
3088 int r;
3089 const char *msg;
3090
3091 if (vmm_is_intel()) {
3092 r = vmx_x86_supported(&msg);
3093 } else if (vmm_is_svm()) {
3094 /*
3095 * HMA already ensured that the features necessary for SVM
3096 * operation were present and online during vmm_attach().
3097 */
3098 r = 0;
3099 } else {
3100 r = ENXIO;
3101 msg = "Unsupported CPU vendor";
3102 }
3103
3104 if (r != 0 && arg != (intptr_t)NULL) {
3105 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
3106 return (EFAULT);
3107 }
3108 return (r);
3109 }
3110
3111 static int
vmm_ctl_ioctl(int cmd,intptr_t arg,int md,cred_t * cr,int * rvalp)3112 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
3113 {
3114 void *argp = (void *)arg;
3115
3116 switch (cmd) {
3117 case VMM_CREATE_VM: {
3118 struct vm_create_req req;
3119
3120 if ((md & FWRITE) == 0) {
3121 return (EPERM);
3122 }
3123 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
3124 return (EFAULT);
3125 }
3126 return (vmmdev_do_vm_create(&req, cr));
3127 }
3128 case VMM_DESTROY_VM: {
3129 struct vm_destroy_req req;
3130
3131 if ((md & FWRITE) == 0) {
3132 return (EPERM);
3133 }
3134 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
3135 return (EFAULT);
3136 }
3137 return (vmmdev_do_vm_destroy(&req, cr));
3138 }
3139 case VMM_VM_SUPPORTED:
3140 return (vmm_is_supported(arg));
3141 case VMM_CHECK_IOMMU:
3142 if (!vmm_check_iommu()) {
3143 return (ENXIO);
3144 }
3145 return (0);
3146 case VMM_RESV_QUERY:
3147 case VMM_RESV_SET_TARGET:
3148 return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
3149 default:
3150 break;
3151 }
3152 /* No other actions are legal on ctl device */
3153 return (ENOTTY);
3154 }
3155
3156 static int
vmm_ioctl(dev_t dev,int cmd,intptr_t arg,int mode,cred_t * credp,int * rvalp)3157 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
3158 int *rvalp)
3159 {
3160 vmm_softc_t *sc;
3161 minor_t minor;
3162
3163 /*
3164 * Forbid running bhyve in a 32-bit process until it has been tested and
3165 * verified to be safe.
3166 */
3167 if (curproc->p_model != DATAMODEL_LP64) {
3168 return (EFBIG);
3169 }
3170
3171 /* The structs in bhyve ioctls assume a 64-bit datamodel */
3172 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
3173 return (ENOTSUP);
3174 }
3175
3176 /*
3177 * Regardless of minor (vmmctl or instance), we respond to queries of
3178 * the interface version.
3179 */
3180 if (cmd == VMM_INTERFACE_VERSION) {
3181 *rvalp = VMM_CURRENT_INTERFACE_VERSION;
3182 return (0);
3183 }
3184
3185 minor = getminor(dev);
3186
3187 if (minor == VMM_CTL_MINOR) {
3188 return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
3189 }
3190
3191 sc = ddi_get_soft_state(vmm_statep, minor);
3192 ASSERT(sc != NULL);
3193
3194 /*
3195 * Turn away any ioctls against an instance when it is being destroyed.
3196 * (Except for the ioctl inquiring about that destroy-in-progress.)
3197 */
3198 if ((sc->vmm_flags & VMM_DESTROY) != 0) {
3199 if (cmd == VM_DESTROY_PENDING) {
3200 *rvalp = 1;
3201 return (0);
3202 }
3203 return (ENXIO);
3204 }
3205
3206 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
3207 }
3208
3209 static int
vmm_segmap(dev_t dev,off_t off,struct as * as,caddr_t * addrp,off_t len,unsigned int prot,unsigned int maxprot,unsigned int flags,cred_t * credp)3210 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
3211 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
3212 {
3213 vmm_softc_t *sc;
3214 const minor_t minor = getminor(dev);
3215 int err;
3216
3217 if (minor == VMM_CTL_MINOR) {
3218 return (ENODEV);
3219 }
3220 if (off < 0 || (off + len) <= 0) {
3221 return (EINVAL);
3222 }
3223 if ((prot & PROT_USER) == 0) {
3224 return (EACCES);
3225 }
3226
3227 sc = ddi_get_soft_state(vmm_statep, minor);
3228 ASSERT(sc);
3229
3230 if (sc->vmm_flags & VMM_DESTROY)
3231 return (ENXIO);
3232
3233 /* Grab read lock on the VM to prevent any changes to the memory map */
3234 vmm_read_lock(sc);
3235
3236 if (off >= VM_DEVMEM_START) {
3237 int segid;
3238 off_t segoff;
3239
3240 /* Mapping a devmem "device" */
3241 if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
3242 err = ENODEV;
3243 } else {
3244 err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
3245 addrp, prot, maxprot, flags);
3246 }
3247 } else {
3248 /* Mapping a part of the guest physical space */
3249 err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
3250 maxprot, flags);
3251 }
3252
3253 vmm_read_unlock(sc);
3254 return (err);
3255 }
3256
3257 static sdev_plugin_validate_t
vmm_sdev_validate(sdev_ctx_t ctx)3258 vmm_sdev_validate(sdev_ctx_t ctx)
3259 {
3260 const char *name = sdev_ctx_name(ctx);
3261 vmm_softc_t *sc;
3262 sdev_plugin_validate_t ret;
3263 minor_t minor;
3264
3265 if (sdev_ctx_vtype(ctx) != VCHR)
3266 return (SDEV_VTOR_INVALID);
3267
3268 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
3269
3270 mutex_enter(&vmm_mtx);
3271 if ((sc = vmm_lookup(name)) == NULL)
3272 ret = SDEV_VTOR_INVALID;
3273 else if (sc->vmm_minor != minor)
3274 ret = SDEV_VTOR_STALE;
3275 else
3276 ret = SDEV_VTOR_VALID;
3277 mutex_exit(&vmm_mtx);
3278
3279 return (ret);
3280 }
3281
3282 static int
vmm_sdev_filldir(sdev_ctx_t ctx)3283 vmm_sdev_filldir(sdev_ctx_t ctx)
3284 {
3285 vmm_softc_t *sc;
3286 int ret;
3287
3288 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
3289 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
3290 sdev_ctx_path(ctx), VMM_SDEV_ROOT);
3291 return (EINVAL);
3292 }
3293
3294 mutex_enter(&vmm_mtx);
3295 ASSERT(vmmdev_dip != NULL);
3296 for (sc = list_head(&vmm_list); sc != NULL;
3297 sc = list_next(&vmm_list, sc)) {
3298 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
3299 ret = sdev_plugin_mknod(ctx, sc->vmm_name,
3300 S_IFCHR | 0600,
3301 makedevice(ddi_driver_major(vmmdev_dip),
3302 sc->vmm_minor));
3303 } else {
3304 continue;
3305 }
3306 if (ret != 0 && ret != EEXIST)
3307 goto out;
3308 }
3309
3310 ret = 0;
3311
3312 out:
3313 mutex_exit(&vmm_mtx);
3314 return (ret);
3315 }
3316
3317 /* ARGSUSED */
3318 static void
vmm_sdev_inactive(sdev_ctx_t ctx)3319 vmm_sdev_inactive(sdev_ctx_t ctx)
3320 {
3321 }
3322
3323 static sdev_plugin_ops_t vmm_sdev_ops = {
3324 .spo_version = SDEV_PLUGIN_VERSION,
3325 .spo_flags = SDEV_PLUGIN_SUBDIR,
3326 .spo_validate = vmm_sdev_validate,
3327 .spo_filldir = vmm_sdev_filldir,
3328 .spo_inactive = vmm_sdev_inactive
3329 };
3330
3331 /* ARGSUSED */
3332 static int
vmm_info(dev_info_t * dip,ddi_info_cmd_t cmd,void * arg,void ** result)3333 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
3334 {
3335 int error;
3336
3337 switch (cmd) {
3338 case DDI_INFO_DEVT2DEVINFO:
3339 *result = (void *)vmmdev_dip;
3340 error = DDI_SUCCESS;
3341 break;
3342 case DDI_INFO_DEVT2INSTANCE:
3343 *result = (void *)0;
3344 error = DDI_SUCCESS;
3345 break;
3346 default:
3347 error = DDI_FAILURE;
3348 break;
3349 }
3350 return (error);
3351 }
3352
3353 static int
vmm_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)3354 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3355 {
3356 sdev_plugin_hdl_t sph;
3357 hma_reg_t *reg = NULL;
3358 boolean_t vmm_loaded = B_FALSE;
3359
3360 if (cmd != DDI_ATTACH) {
3361 return (DDI_FAILURE);
3362 }
3363
3364 mutex_enter(&vmmdev_mtx);
3365 /* Ensure we are not already attached. */
3366 if (vmmdev_dip != NULL) {
3367 mutex_exit(&vmmdev_mtx);
3368 return (DDI_FAILURE);
3369 }
3370
3371 vmm_sol_glue_init();
3372
3373 /*
3374 * Perform temporary HMA registration to determine if the system
3375 * is capable.
3376 */
3377 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
3378 goto fail;
3379 } else if (vmm_mod_load() != 0) {
3380 goto fail;
3381 }
3382 vmm_loaded = B_TRUE;
3383 hma_unregister(reg);
3384 reg = NULL;
3385
3386 /* Create control node. Other nodes will be created on demand. */
3387 if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
3388 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
3389 goto fail;
3390 }
3391
3392 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
3393 if (sph == (sdev_plugin_hdl_t)NULL) {
3394 ddi_remove_minor_node(dip, NULL);
3395 goto fail;
3396 }
3397
3398 ddi_report_dev(dip);
3399 vmmdev_sdev_hdl = sph;
3400 vmmdev_dip = dip;
3401 mutex_exit(&vmmdev_mtx);
3402 return (DDI_SUCCESS);
3403
3404 fail:
3405 if (vmm_loaded) {
3406 vmm_mod_unload();
3407 }
3408 if (reg != NULL) {
3409 hma_unregister(reg);
3410 }
3411 vmm_sol_glue_cleanup();
3412 mutex_exit(&vmmdev_mtx);
3413 return (DDI_FAILURE);
3414 }
3415
3416 static int
vmm_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)3417 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3418 {
3419 if (cmd != DDI_DETACH) {
3420 return (DDI_FAILURE);
3421 }
3422
3423 /*
3424 * Ensure that all resources have been cleaned up.
3425 *
3426 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
3427 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
3428 * devinfo locked as iommu_cleanup() tries to recursively lock each
3429 * devinfo, including our own, while holding vmmdev_mtx.
3430 */
3431 if (mutex_tryenter(&vmmdev_mtx) == 0)
3432 return (DDI_FAILURE);
3433
3434 mutex_enter(&vmm_mtx);
3435 if (!list_is_empty(&vmm_list)) {
3436 mutex_exit(&vmm_mtx);
3437 mutex_exit(&vmmdev_mtx);
3438 return (DDI_FAILURE);
3439 }
3440 mutex_exit(&vmm_mtx);
3441
3442 if (!vmmr_is_empty()) {
3443 mutex_exit(&vmmdev_mtx);
3444 return (DDI_FAILURE);
3445 }
3446
3447 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
3448 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
3449 mutex_exit(&vmmdev_mtx);
3450 return (DDI_FAILURE);
3451 }
3452 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
3453
3454 /* Remove the control node. */
3455 ddi_remove_minor_node(dip, "ctl");
3456 vmmdev_dip = NULL;
3457
3458 vmm_mod_unload();
3459 VERIFY3U(vmmdev_hma_reg, ==, NULL);
3460 vmm_sol_glue_cleanup();
3461
3462 mutex_exit(&vmmdev_mtx);
3463
3464 return (DDI_SUCCESS);
3465 }
3466
3467 static struct cb_ops vmm_cb_ops = {
3468 vmm_open,
3469 vmm_close,
3470 nodev, /* strategy */
3471 nodev, /* print */
3472 nodev, /* dump */
3473 nodev, /* read */
3474 nodev, /* write */
3475 vmm_ioctl,
3476 nodev, /* devmap */
3477 nodev, /* mmap */
3478 vmm_segmap,
3479 nochpoll, /* poll */
3480 ddi_prop_op,
3481 NULL,
3482 D_NEW | D_MP | D_DEVMAP
3483 };
3484
3485 static struct dev_ops vmm_ops = {
3486 DEVO_REV,
3487 0,
3488 vmm_info,
3489 nulldev, /* identify */
3490 nulldev, /* probe */
3491 vmm_attach,
3492 vmm_detach,
3493 nodev, /* reset */
3494 &vmm_cb_ops,
3495 (struct bus_ops *)NULL
3496 };
3497
3498 static struct modldrv modldrv = {
3499 &mod_driverops,
3500 "bhyve vmm",
3501 &vmm_ops
3502 };
3503
3504 static struct modlinkage modlinkage = {
3505 MODREV_1,
3506 &modldrv,
3507 NULL
3508 };
3509
3510 int
_init(void)3511 _init(void)
3512 {
3513 int error;
3514
3515 sysinit();
3516
3517 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
3518 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
3519 list_create(&vmm_list, sizeof (vmm_softc_t),
3520 offsetof(vmm_softc_t, vmm_node));
3521 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
3522
3523 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
3524 if (error) {
3525 return (error);
3526 }
3527
3528 error = vmmr_init();
3529 if (error) {
3530 ddi_soft_state_fini(&vmm_statep);
3531 return (error);
3532 }
3533
3534 vmm_zsd_init();
3535
3536 error = mod_install(&modlinkage);
3537 if (error) {
3538 ddi_soft_state_fini(&vmm_statep);
3539 vmm_zsd_fini();
3540 vmmr_fini();
3541 }
3542
3543 return (error);
3544 }
3545
3546 int
_fini(void)3547 _fini(void)
3548 {
3549 int error;
3550
3551 error = mod_remove(&modlinkage);
3552 if (error) {
3553 return (error);
3554 }
3555
3556 vmm_zsd_fini();
3557 vmmr_fini();
3558
3559 ddi_soft_state_fini(&vmm_statep);
3560
3561 return (0);
3562 }
3563
3564 int
_info(struct modinfo * modinfop)3565 _info(struct modinfo *modinfop)
3566 {
3567 return (mod_info(&modlinkage, modinfop));
3568 }
3569