xref: /freebsd/sys/dev/vmm/vmm_dev.c (revision 32cd3ee5901ea33d41ff550e5f40ce743c8d4165)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
 * All rights reserved.
 */

#include <sys/param.h>
#include <sys/conf.h>
#define	EXTERR_CATEGORY	EXTERR_CAT_VMM
#include <sys/exterrvar.h>
#include <sys/fcntl.h>
#include <sys/ioccom.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/ucred.h>
#include <sys/uio.h>

#include <machine/vmm.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <dev/vmm/vmm_dev.h>
#include <dev/vmm/vmm_mem.h>
#include <dev/vmm/vmm_stat.h>
#include <dev/vmm/vmm_vm.h>

#ifdef __amd64__
#ifdef COMPAT_FREEBSD12
struct vm_memseg_12 {
	int		segid;
	size_t		len;
	char		name[64];
};
_Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI");

#define	VM_ALLOC_MEMSEG_12	\
	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12)
#define	VM_GET_MEMSEG_12	\
	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12)
#endif /* COMPAT_FREEBSD12 */
#ifdef COMPAT_FREEBSD14
struct vm_memseg_14 {
	int		segid;
	size_t		len;
	char		name[VM_MAX_SUFFIXLEN + 1];
};
_Static_assert(sizeof(struct vm_memseg_14) == (VM_MAX_SUFFIXLEN + 1 + 16),
    "COMPAT_FREEBSD14 ABI");

#define	VM_ALLOC_MEMSEG_14	\
	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_14)
#define	VM_GET_MEMSEG_14	\
	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_14)
#endif /* COMPAT_FREEBSD14 */
#endif /* __amd64__ */

struct devmem_softc {
	int	segid;
	char	*name;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	struct ucred	*ucred;
	SLIST_ENTRY(vmmdev_softc) link;
	LIST_ENTRY(vmmdev_softc) priv_link;
	SLIST_HEAD(, devmem_softc) devmem;
	int		flags;
};

struct vmmctl_priv {
	LIST_HEAD(, vmmdev_softc) softcs;
};

static bool vmm_initialized = false;

static SLIST_HEAD(, vmmdev_softc) head;

static unsigned int pr_allow_vmm_flag, pr_allow_vmm_ppt_flag;
static struct sx vmmdev_mtx;
SX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex");

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

u_int vm_maxcpu;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &vm_maxcpu, 0, "Maximum number of vCPUs");

u_int vm_maxvmms;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxvmms, CTLFLAG_RWTUN,
    &vm_maxvmms, 0, "Maximum number of VMM instances per user");

static void devmem_destroy(void *arg);
static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem);
static void vmmdev_destroy(struct vmmdev_softc *sc);

static int
vmm_priv_check(struct ucred *ucred)
{
	if (jailed(ucred) &&
	    (ucred->cr_prison->pr_allow & pr_allow_vmm_flag) == 0)
		return (EPERM);

	return (0);
}
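
/*
 * vCPU state locking helpers.  An ioctl handler that needs a consistent view
 * of a vCPU freezes it (VCPU_FROZEN) so that it cannot re-enter the guest
 * while its state is inspected or modified, and thaws it back to VCPU_IDLE
 * when done.
 */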

static int
vcpu_lock_one(struct vcpu *vcpu)
{
	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
}

static void
vcpu_unlock_one(struct vcpu *vcpu)
{
	enum vcpu_state state;

	state = vcpu_get_state(vcpu, NULL);
	if (state != VCPU_FROZEN) {
		panic("vcpu %s(%d) has invalid state %d",
		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
	}

	vcpu_set_state(vcpu, VCPU_IDLE, false);
}

static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
	int error;

	/*
	 * Serialize vcpu_lock_all() callers.  Individual vCPUs are not locked
	 * in a consistent order so we need to serialize to avoid deadlocks.
	 */
	vm_lock_vcpus(sc->vm);
	error = vcpu_set_state_all(sc->vm, VCPU_FROZEN);
	if (error != 0)
		vm_unlock_vcpus(sc->vm);
	return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
	struct vcpu *vcpu;
	uint16_t i, maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(sc->vm, i);
		if (vcpu == NULL)
			continue;
		vcpu_unlock_one(vcpu);
	}
	vm_unlock_vcpus(sc->vm);
}

static struct vmmdev_softc *
vmmdev_lookup(const char *name, struct ucred *cred)
{
	struct vmmdev_softc *sc;

	sx_assert(&vmmdev_mtx, SA_XLOCKED);

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	if (sc == NULL)
		return (NULL);

	if (cr_cansee(cred, sc->ucred))
		return (NULL);

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{
	return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa, maxaddr;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	error = 0;
	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	maxaddr = vmm_sysmem_maxaddr(sc->vm);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
				error = uiomove(__DECONST(void *, zero_region),
				    c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}
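
/*
 * Illustrative userland use of the read interface above (a sketch; the VM
 * name "testvm" is an assumption): dumping the first page of guest physical
 * memory with dd(1), which, as noted in vmmdev_rw(), reads through holes
 * rather than seeking over them:
 *
 *	dd if=/dev/vmm/testvm bs=4k count=1 | hexdump -C
 */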

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);

static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		SLIST_FOREACH(dsc, &sc->devmem, link) {
			if (dsc->segid == mseg->segid)
				break;
		}
		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
		    __func__, mseg->segid));
		error = copystr(dsc->name, mseg->name, len, NULL);
	} else {
		bzero(mseg->name, len);
	}

	return (error);
}

static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len,
    struct domainset *domainset)
{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	/*
	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
	 * be stripped off when devfs processes the full string.
	 */
	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
		name = malloc(len, M_VMMDEV, M_WAITOK);
		error = copystr(mseg->name, name, len, NULL);
		if (error)
			goto done;
	}
	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem, domainset);
	if (error)
		goto done;

	if (VM_MEMSEG_NAME(mseg)) {
		error = devmem_create_cdev(sc, mseg->segid, name);
		if (error)
			vm_free_memseg(sc->vm, mseg->segid);
		else
			name = NULL;	/* freed when 'cdev' is destroyed */
	}
done:
	free(name, M_VMMDEV);
	return (error);
}

#if defined(__amd64__) && \
    (defined(COMPAT_FREEBSD14) || defined(COMPAT_FREEBSD12))
/*
 * Translate pre-15.0 memory segment identifiers into their 15.0 counterparts.
 */
static void
adjust_segid(struct vm_memseg *mseg)
{
	if (mseg->segid != VM_SYSMEM) {
		mseg->segid += (VM_BOOTROM - 1);
	}
}
#endif
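
/*
 * As an illustration of adjust_segid() (assuming the pre-15.0 numbering,
 * where device segments immediately followed VM_SYSMEM): a legacy device
 * segment id of 1 maps to the current VM_BOOTROM, and higher device segment
 * ids shift up by the same (VM_BOOTROM - 1) offset.
 */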

static int
vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_get_register(vcpu, regnum[i], &regval[i]);
		if (error)
			break;
	}
	return (error);
}

static int
vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_set_register(vcpu, regnum[i], regval[i]);
		if (error)
			break;
	}
	return (error);
}

static int
vmmdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	int error;

	/*
	 * A jail without vmm access shouldn't be able to access vmm device
	 * files at all, but check here just to be thorough.
	 */
	error = vmm_priv_check(td->td_ucred);
	if (error != 0)
		return (error);

	return (0);
}

static const struct vmmdev_ioctl vmmdev_ioctls[] = {
	VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_STAT_DESC, 0),

#ifdef __amd64__
#ifdef COMPAT_FREEBSD12
	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
#endif
#ifdef COMPAT_FREEBSD14
	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_14,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
#endif
#endif /* __amd64__ */
	VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
	VMMDEV_IOCTL(VM_MMAP_MEMSEG,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
	VMMDEV_IOCTL(VM_MUNMAP_MEMSEG,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
	VMMDEV_IOCTL(VM_REINIT,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),

#ifdef __amd64__
#if defined(COMPAT_FREEBSD12)
	VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
#endif
#ifdef COMPAT_FREEBSD14
	VMMDEV_IOCTL(VM_GET_MEMSEG_14, VMMDEV_IOCTL_SLOCK_MEMSEGS),
#endif
#endif /* __amd64__ */
	VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
	VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),

	VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
	VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),

	VMMDEV_IOCTL(VM_SUSPEND, 0),
	VMMDEV_IOCTL(VM_GET_CPUS, 0),
	VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0),
	VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0),
};
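
/*
 * Each table entry above pairs an ioctl command with the locking its handler
 * requires.  vmmdev_ioctl() consults these flags to take the memory segment
 * lock (shared or exclusive) and to freeze one or all vCPUs before
 * dispatching, so the individual handlers can assume the right locks are
 * already held.
 */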

static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{
	struct vmmdev_softc *sc;
	struct vcpu *vcpu;
	const struct vmmdev_ioctl *ioctl;
	struct vm_memseg *mseg;
	int error, vcpuid;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	ioctl = NULL;
	for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) {
		if (vmmdev_ioctls[i].cmd == cmd) {
			ioctl = &vmmdev_ioctls[i];
			break;
		}
	}
	if (ioctl == NULL) {
		for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) {
			if (vmmdev_machdep_ioctls[i].cmd == cmd) {
				ioctl = &vmmdev_machdep_ioctls[i];
				break;
			}
		}
	}
	if (ioctl == NULL)
		return (ENOTTY);

	if ((ioctl->flags & VMMDEV_IOCTL_PPT) != 0) {
		if (jailed(td->td_ucred) && (td->td_ucred->cr_prison->pr_allow &
		    pr_allow_vmm_ppt_flag) == 0)
			return (EPERM);
		error = priv_check(td, PRIV_VMM_PPTDEV);
		if (error != 0)
			return (error);
	}

	if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0)
		vm_xlock_memsegs(sc->vm);
	else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0)
		vm_slock_memsegs(sc->vm);

	vcpu = NULL;
	vcpuid = -1;
	if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU |
	    VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) {
		vcpuid = *(int *)data;
		if (vcpuid == -1) {
			if ((ioctl->flags &
			    VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) {
				error = EINVAL;
				goto lockfail;
			}
		} else {
			vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
			if (vcpu == NULL) {
				error = EINVAL;
				goto lockfail;
			}
			if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) {
				error = vcpu_lock_one(vcpu);
				if (error)
					goto lockfail;
			}
		}
	}
	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) {
		error = vcpu_lock_all(sc);
		if (error)
			goto lockfail;
	}

	switch (cmd) {
	case VM_SUSPEND: {
		struct vm_suspend *vmsuspend;

		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	}
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	case VM_STAT_DESC: {
		struct vm_stat_desc *statdesc;

		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index, statdesc->desc,
		    sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		struct vm_stats *vmstats;

		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(vcpu, vmstats->index,
		    nitems(vmstats->statbuf), &vmstats->num_entries,
		    vmstats->statbuf);
		break;
	}
	case VM_MMAP_GETNEXT: {
		struct vm_memmap *mm;

		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	}
	case VM_MMAP_MEMSEG: {
		struct vm_memmap *mm;

		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
	}
	case VM_MUNMAP_MEMSEG: {
		struct vm_munmap *mu;

		mu = (struct vm_munmap *)data;
		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
		break;
	}
#ifdef __amd64__
#ifdef COMPAT_FREEBSD12
	case VM_ALLOC_MEMSEG_12:
		mseg = (struct vm_memseg *)data;

		adjust_segid(mseg);
		error = alloc_memseg(sc, mseg,
		    sizeof(((struct vm_memseg_12 *)0)->name), NULL);
		break;
	case VM_GET_MEMSEG_12:
		mseg = (struct vm_memseg *)data;

		adjust_segid(mseg);
		error = get_memseg(sc, mseg,
		    sizeof(((struct vm_memseg_12 *)0)->name));
		break;
#endif /* COMPAT_FREEBSD12 */
#ifdef COMPAT_FREEBSD14
	case VM_ALLOC_MEMSEG_14:
		mseg = (struct vm_memseg *)data;

		adjust_segid(mseg);
		error = alloc_memseg(sc, mseg,
		    sizeof(((struct vm_memseg_14 *)0)->name), NULL);
		break;
	case VM_GET_MEMSEG_14:
		mseg = (struct vm_memseg *)data;

		adjust_segid(mseg);
		error = get_memseg(sc, mseg,
		    sizeof(((struct vm_memseg_14 *)0)->name));
		break;
#endif /* COMPAT_FREEBSD14 */
#endif /* __amd64__ */
	case VM_ALLOC_MEMSEG: {
		domainset_t *mask;
		struct domainset *domainset, domain;

		domainset = NULL;
		mseg = (struct vm_memseg *)data;
		if (mseg->ds_policy != DOMAINSET_POLICY_INVALID && mseg->ds_mask != NULL) {
			if (mseg->ds_mask_size < sizeof(domainset_t) ||
			    mseg->ds_mask_size > DOMAINSET_MAXSIZE / NBBY) {
				error = ERANGE;
				break;
			}
			memset(&domain, 0, sizeof(domain));
			mask = malloc(mseg->ds_mask_size, M_VMMDEV, M_WAITOK);
			error = copyin(mseg->ds_mask, mask, mseg->ds_mask_size);
			if (error) {
				free(mask, M_VMMDEV);
				break;
			}
			error = domainset_populate(&domain, mask, mseg->ds_policy,
			    mseg->ds_mask_size);
			free(mask, M_VMMDEV);
			if (error)
				break;
			domainset = domainset_create(&domain);
			if (domainset == NULL) {
				error = EINVAL;
				break;
			}
		}
		error = alloc_memseg(sc, mseg, sizeof(mseg->name), domainset);
		break;
	}
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
	case VM_GET_REGISTER: {
		struct vm_register *vmreg;

		vmreg = (struct vm_register *)data;
		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
		break;
	}
	case VM_SET_REGISTER: {
		struct vm_register *vmreg;

		vmreg = (struct vm_register *)data;
		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
		break;
	}
	case VM_GET_REGISTER_SET: {
		struct vm_register_set *vmregset;
		uint64_t *regvals;
		int *regnums;

		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = mallocarray(vmregset->count, sizeof(regvals[0]),
		    M_VMMDEV, M_WAITOK);
		regnums = mallocarray(vmregset->count, sizeof(regnums[0]),
		    M_VMMDEV, M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = vm_get_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		if (error == 0)
			error = copyout(regvals, vmregset->regvals,
			    sizeof(regvals[0]) * vmregset->count);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	}
	case VM_SET_REGISTER_SET: {
		struct vm_register_set *vmregset;
		uint64_t *regvals;
		int *regnums;

		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = mallocarray(vmregset->count, sizeof(regvals[0]),
		    M_VMMDEV, M_WAITOK);
		regnums = mallocarray(vmregset->count, sizeof(regnums[0]),
		    M_VMMDEV, M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = copyin(vmregset->regvals, regvals,
			    sizeof(regvals[0]) * vmregset->count);
		if (error == 0)
			error = vm_set_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	}
	case VM_GET_CAPABILITY: {
		struct vm_capability *vmcap;

		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval);
		break;
	}
	case VM_SET_CAPABILITY: {
		struct vm_capability *vmcap;

		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval);
		break;
	}
	case VM_ACTIVATE_CPU:
		error = vm_activate_cpu(vcpu);
		break;
	case VM_GET_CPUS: {
		struct vm_cpuset *vm_cpuset;
		cpuset_t *cpuset;
		int size;

		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
		    M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else if (vm_cpuset->which == VM_DEBUG_CPUS)
			*cpuset = vm_debug_cpus(sc->vm);
		else
			error = EINVAL;
		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
			error = ERANGE;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	}
	case VM_SUSPEND_CPU:
		error = vm_suspend_cpu(sc->vm, vcpu);
		break;
	case VM_RESUME_CPU:
		error = vm_resume_cpu(sc->vm, vcpu);
		break;
	case VM_SET_TOPOLOGY: {
		struct vm_cpu_topology *topology;

		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	}
	case VM_GET_TOPOLOGY: {
		struct vm_cpu_topology *topology;

		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
	}
	default:
		error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag,
		    td);
		break;
	}

	if ((ioctl->flags &
	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
		vm_unlock_memsegs(sc->vm);
	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0)
		vcpu_unlock_all(sc);
	else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0)
		vcpu_unlock_one(vcpu);

	/*
	 * Make sure that no handler returns a kernel-internal
	 * error value to userspace.
	 */
	KASSERT(error == ERESTART || error >= 0,
	    ("vmmdev_ioctl: invalid error return %d", error));
	return (error);

lockfail:
	if ((ioctl->flags &
	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
		vm_unlock_memsegs(sc->vm);
	return (error);
}

static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
	struct vmmdev_softc *sc;
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff, first, last;
	int error, found, segid;
	bool sysmem;

	first = *offset;
	last = first + mapsize;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		/* virtual machine is in the process of being created */
		return (EINVAL);
	}

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	gpa = 0;
	found = 0;
	while (!found) {
		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
		    NULL, NULL);
		if (error)
			break;

		if (first >= gpa && last <= gpa + len)
			found = 1;
		else
			gpa += len;
	}

	if (found) {
		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
		KASSERT(error == 0 && *objp != NULL,
		    ("%s: invalid memory segment %d", __func__, segid));
		if (sysmem) {
			vm_object_reference(*objp);
			*offset = segoff + (first - gpa);
		} else {
			error = EINVAL;
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}
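
/*
 * Illustrative userland counterpart of the mapping logic above (a sketch;
 * assumes a VM named "testvm" with a system memory segment mapped at guest
 * physical address 0, and omits error handling).  The offset passed to
 * mmap(2) is a guest physical address; PROT_EXEC mappings are rejected:
 *
 *	int fd = open("/dev/vmm/testvm", O_RDWR);
 *	char *gpa0 = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    fd, 0);
 */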

static void
vmmdev_destroy(struct vmmdev_softc *sc)
{
	struct devmem_softc *dsc;
	int error __diagused;

	KASSERT(sc->cdev == NULL, ("%s: cdev not free", __func__));
	KASSERT(sc->ucred != NULL, ("%s: missing ucred", __func__));

	/*
	 * Destroy all cdevs:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		devmem_destroy(dsc);
	}

	vm_disable_vcpu_creation(sc->vm);
	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
	vm_unlock_vcpus(sc->vm);

	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	vm_destroy(sc->vm);

	chgvmmcnt(sc->ucred->cr_ruidinfo, -1, 0);
	crfree(sc->ucred);

	sx_xlock(&vmmdev_mtx);
	SLIST_REMOVE(&head, sc, vmmdev_softc, link);
	if ((sc->flags & VMMCTL_CREATE_DESTROY_ON_CLOSE) != 0)
		LIST_REMOVE(sc, priv_link);
	sx_xunlock(&vmmdev_mtx);
	wakeup(sc);
	free(sc, M_VMMDEV);
}

static int
vmmdev_lookup_and_destroy(const char *name, struct ucred *cred)
{
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	int error;

	sx_xlock(&vmmdev_mtx);
	sc = vmmdev_lookup(name, cred);
	if (sc == NULL || sc->cdev == NULL) {
		sx_xunlock(&vmmdev_mtx);
		return (EINVAL);
	}

	/*
	 * Only the creator of a VM or a privileged user can destroy it.
	 */
	if ((cred->cr_uid != sc->ucred->cr_uid ||
	     cred->cr_prison != sc->ucred->cr_prison) &&
	    (error = priv_check_cred(cred, PRIV_VMM_DESTROY)) != 0) {
		sx_xunlock(&vmmdev_mtx);
		return (error);
	}

	/*
	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
	 * is scheduled for destruction.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	sx_xunlock(&vmmdev_mtx);

	(void)vm_suspend(sc->vm, VM_SUSPEND_DESTROY);
	destroy_dev(cdev);
	vmmdev_destroy(sc);

	return (0);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error == 0 && req->newptr != NULL)
		error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred);
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_destroy, "A",
    "Destroy a vmm(4) instance (legacy interface)");

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_open		= vmmdev_open,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

static struct vmmdev_softc *
vmmdev_alloc(struct vm *vm, struct ucred *cred)
{
	struct vmmdev_softc *sc;

	sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO);
	SLIST_INIT(&sc->devmem);
	sc->vm = vm;
	sc->ucred = crhold(cred);
	return (sc);
}

static int
vmmdev_create(const char *name, uint32_t flags, struct ucred *cred)
{
	struct make_dev_args mda;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	struct vmmctl_priv *priv;
	struct vm *vm;
	int error;

	if (name == NULL || strlen(name) > VM_MAX_NAMELEN)
		return (EINVAL);

	if ((flags & ~VMMCTL_FLAGS_MASK) != 0)
		return (EINVAL);
	error = devfs_get_cdevpriv((void **)&priv);
	if (error)
		return (error);

	sx_xlock(&vmmdev_mtx);
	sc = vmmdev_lookup(name, cred);
	if (sc != NULL) {
		sx_xunlock(&vmmdev_mtx);
		return (EEXIST);
	}

	/*
	 * Unprivileged users can only create VMs that will be automatically
	 * destroyed when the creating descriptor is closed.
	 */
	if ((flags & VMMCTL_CREATE_DESTROY_ON_CLOSE) == 0 &&
	    (error = priv_check_cred(cred, PRIV_VMM_CREATE)) != 0) {
		sx_xunlock(&vmmdev_mtx);
		return (EXTERROR(error,
		    "An unprivileged user must run VMs in monitor mode"));
	}

	if (!chgvmmcnt(cred->cr_ruidinfo, 1, vm_maxvmms)) {
		sx_xunlock(&vmmdev_mtx);
		return (ENOMEM);
	}

	error = vm_create(name, &vm);
	if (error != 0) {
		sx_xunlock(&vmmdev_mtx);
		(void)chgvmmcnt(cred->cr_ruidinfo, -1, 0);
		return (error);
	}
	sc = vmmdev_alloc(vm, cred);
	SLIST_INSERT_HEAD(&head, sc, link);
	sc->flags = flags;
	if ((flags & VMMCTL_CREATE_DESTROY_ON_CLOSE) != 0)
		LIST_INSERT_HEAD(&priv->softcs, sc, priv_link);

	make_dev_args_init(&mda);
	mda.mda_devsw = &vmmdevsw;
	mda.mda_cr = sc->ucred;
	mda.mda_uid = cred->cr_uid;
	mda.mda_gid = GID_VMM;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
	error = make_dev_s(&mda, &cdev, "vmm/%s", name);
	if (error != 0) {
		sx_xunlock(&vmmdev_mtx);
		vmmdev_destroy(sc);
		return (error);
	}
	sc->cdev = cdev;
	sx_xunlock(&vmmdev_mtx);
	return (0);
}

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	char *buf;
	int error, buflen;

	if (!vmm_initialized)
		return (ENXIO);

	error = vmm_priv_check(req->td->td_ucred);
	if (error != 0)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error == 0 && req->newptr != NULL)
		error = vmmdev_create(buf, 0, req->td->td_ucred);
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_create, "A",
    "Create a vmm(4) instance (legacy interface)");
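
/*
 * The legacy interfaces above are driven from userland with sysctl(8), e.g.
 * (the VM name "testvm" is arbitrary):
 *
 *	sysctl hw.vmm.create=testvm
 *	sysctl hw.vmm.destroy=testvm
 */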

static void
vmmctl_dtor(void *arg)
{
	struct cdev *sc_cdev;
	struct vmmdev_softc *sc;
	struct vmmctl_priv *priv = arg;

	/*
	 * Scan the softc list for any VMs associated with
	 * the current descriptor and destroy them.
	 */
	sx_xlock(&vmmdev_mtx);
	while (!LIST_EMPTY(&priv->softcs)) {
		sc = LIST_FIRST(&priv->softcs);
		sc_cdev = sc->cdev;
		if (sc_cdev != NULL) {
			sc->cdev = NULL;
		} else {
			/*
			 * Another thread has already
			 * started the removal process.
			 * Sleep until 'vmmdev_destroy' notifies us
			 * that the removal has finished.
			 */
			sx_sleep(sc, &vmmdev_mtx, 0, "vmmctl_dtor", 0);
			continue;
		}
		/*
		 * Temporarily drop the lock to allow vmmdev_destroy to run.
		 */
		sx_xunlock(&vmmdev_mtx);
		(void)vm_suspend(sc->vm, VM_SUSPEND_DESTROY);
		destroy_dev(sc_cdev);
		/* vmmdev_destroy will unlink the 'priv_link' entry. */
		vmmdev_destroy(sc);
		sx_xlock(&vmmdev_mtx);
	}
	sx_xunlock(&vmmdev_mtx);

	free(priv, M_VMMDEV);
}

static int
vmmctl_open(struct cdev *cdev, int flags, int fmt, struct thread *td)
{
	int error;
	struct vmmctl_priv *priv;

	error = vmm_priv_check(td->td_ucred);
	if (error != 0)
		return (error);

	if ((flags & FWRITE) == 0)
		return (EPERM);

	priv = malloc(sizeof(*priv), M_VMMDEV, M_WAITOK | M_ZERO);
	LIST_INIT(&priv->softcs);
	error = devfs_set_cdevpriv(priv, vmmctl_dtor);
	if (error != 0) {
		free(priv, M_VMMDEV);
		return (error);
	}

	return (0);
}

static int
vmmctl_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{
	int error;

	switch (cmd) {
	case VMMCTL_VM_CREATE: {
		struct vmmctl_vm_create *vmc;

		vmc = (struct vmmctl_vm_create *)data;
		vmc->name[VM_MAX_NAMELEN] = '\0';
		for (size_t i = 0; i < nitems(vmc->reserved); i++) {
			if (vmc->reserved[i] != 0) {
				error = EINVAL;
				return (error);
			}
		}

		error = vmmdev_create(vmc->name, vmc->flags, td->td_ucred);
		break;
	}
	case VMMCTL_VM_DESTROY: {
		struct vmmctl_vm_destroy *vmd;

		vmd = (struct vmmctl_vm_destroy *)data;
		vmd->name[VM_MAX_NAMELEN] = '\0';
		for (size_t i = 0; i < nitems(vmd->reserved); i++) {
			if (vmd->reserved[i] != 0) {
				error = EINVAL;
				return (error);
			}
		}

		error = vmmdev_lookup_and_destroy(vmd->name, td->td_ucred);
		break;
	}
	default:
		error = ENOTTY;
		break;
	}

	return (error);
}
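
/*
 * Illustrative userland use of the vmmctl interface (a sketch; the VM name
 * "testvm" is arbitrary and error handling is omitted).  Zero-initializing
 * the request satisfies the reserved-fields-must-be-zero check above, and
 * opening for writing satisfies the FWRITE check in vmmctl_open():
 *
 *	struct vmmctl_vm_create vmc = { 0 };
 *	int fd = open("/dev/vmmctl", O_RDWR);
 *	strlcpy(vmc.name, "testvm", sizeof(vmc.name));
 *	ioctl(fd, VMMCTL_VM_CREATE, &vmc);
 */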

static struct cdev *vmmctl_cdev;
static struct cdevsw vmmctlsw = {
	.d_name		= "vmmctl",
	.d_version	= D_VERSION,
	.d_open		= vmmctl_open,
	.d_ioctl	= vmmctl_ioctl,
};

static int
vmmdev_init(void)
{
	int error;

	sx_xlock(&vmmdev_mtx);
	error = make_dev_p(MAKEDEV_CHECKNAME, &vmmctl_cdev, &vmmctlsw, NULL,
	    UID_ROOT, GID_VMM, 0660, "vmmctl");
	if (error == 0) {
		pr_allow_vmm_flag = prison_add_allow(NULL, "vmm", NULL,
		    "Allow use of vmm in a jail");
		pr_allow_vmm_ppt_flag = prison_add_allow(NULL, "vmm_ppt", NULL,
		    "Allow use of vmm with ppt devices in a jail");
	}
	sx_xunlock(&vmmdev_mtx);

	return (error);
}

static int
vmmdev_cleanup(void)
{
	sx_xlock(&vmmdev_mtx);
	if (!SLIST_EMPTY(&head)) {
		sx_xunlock(&vmmdev_mtx);
		return (EBUSY);
	}
	if (vmmctl_cdev != NULL) {
		destroy_dev(vmmctl_cdev);
		vmmctl_cdev = NULL;
	}
	sx_xunlock(&vmmdev_mtx);

	return (0);
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		error = vmmdev_init();
		if (error != 0)
			break;

		vm_maxcpu = mp_ncpus;
		TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
		if (vm_maxcpu > VM_MAXCPU) {
			printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
			vm_maxcpu = VM_MAXCPU;
		}
		if (vm_maxcpu == 0)
			vm_maxcpu = 1;
		vm_maxvmms = 4 * mp_ncpus;
		error = vmm_modinit();
		if (error == 0)
			vmm_initialized = true;
		else {
			int error1 __diagused;

			error1 = vmmdev_cleanup();
			KASSERT(error1 == 0,
			    ("%s: vmmdev_cleanup failed: %d", __func__, error1));
		}
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0 && vmm_initialized) {
			error = vmm_modcleanup();
			if (error) {
				/*
				 * Something bad happened - prevent new
				 * VMs from being created
				 */
				vmm_initialized = false;
			}
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - Initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 * - vmm device initialization requires an initialized devfs.
 */
DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	vm_slock_memsegs(dsc->sc->vm);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	if (seglen >= last)
		vm_object_reference(*objp);
	else
		error = EINVAL;

	vm_unlock_memsegs(dsc->sc->vm);
	return (error);
}

static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};
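
/*
 * Each device memory segment gets its own cdev, named after the VM and the
 * segment: /dev/vmm.io/<vmname>.<segname> (for example, a "bootrom" segment
 * of a VM named "testvm" would appear as /dev/vmm.io/testvm.bootrom).
 */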

static int
devmem_create_cdev(struct vmmdev_softc *sc, int segid, char *devname)
{
	struct make_dev_args mda;
	struct devmem_softc *dsc;
	int error;

	sx_xlock(&vmmdev_mtx);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	dsc->segid = segid;
	dsc->name = devname;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);

	make_dev_args_init(&mda);
	mda.mda_devsw = &devmemsw;
	mda.mda_cr = sc->ucred;
	mda.mda_uid = sc->ucred->cr_uid;
	mda.mda_gid = GID_VMM;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = dsc;
	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
	error = make_dev_s(&mda, &dsc->cdev, "vmm.io/%s.%s", vm_name(sc->vm),
	    devname);
	if (error != 0) {
		SLIST_REMOVE(&sc->devmem, dsc, devmem_softc, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	sx_xunlock(&vmmdev_mtx);

	return (error);
}

static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	destroy_dev(dsc->cdev);
	dsc->cdev = NULL;
	dsc->sc = NULL;
}
1390