xref: /freebsd/sys/dev/vmm/vmm_dev.c (revision 4008758105a6da9eaa0b96b81dfb3042a33259be)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
6  * All rights reserved.
7  */
8 
9 #include <sys/param.h>
10 #include <sys/conf.h>
11 #include <sys/ioccom.h>
12 #include <sys/jail.h>
13 #include <sys/kernel.h>
14 #include <sys/malloc.h>
15 #include <sys/mman.h>
16 #include <sys/mutex.h>
17 #include <sys/proc.h>
18 #include <sys/queue.h>
19 #include <sys/sysctl.h>
20 #include <sys/ucred.h>
21 #include <sys/uio.h>
22 
23 #include <machine/vmm.h>
24 
25 #include <vm/vm.h>
26 #include <vm/vm_object.h>
27 
28 #include <dev/vmm/vmm_dev.h>
29 #include <dev/vmm/vmm_stat.h>
30 
31 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
32 struct vm_memseg_12 {
33 	int		segid;
34 	size_t		len;
35 	char		name[64];
36 };
37 _Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI");
38 
39 #define	VM_ALLOC_MEMSEG_12	\
40 	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12)
41 #define	VM_GET_MEMSEG_12	\
42 	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12)
43 #endif
44 
45 struct devmem_softc {
46 	int	segid;
47 	char	*name;
48 	struct cdev *cdev;
49 	struct vmmdev_softc *sc;
50 	SLIST_ENTRY(devmem_softc) link;
51 };
52 
53 struct vmmdev_softc {
54 	struct vm	*vm;		/* vm instance cookie */
55 	struct cdev	*cdev;
56 	struct ucred	*ucred;
57 	SLIST_ENTRY(vmmdev_softc) link;
58 	SLIST_HEAD(, devmem_softc) devmem;
59 	int		flags;
60 };
61 #define	VSC_LINKED		0x01
62 
63 static SLIST_HEAD(, vmmdev_softc) head;
64 
65 static unsigned pr_allow_flag;
66 static struct mtx vmmdev_mtx;
67 MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF);
68 
69 static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
70 
71 SYSCTL_DECL(_hw_vmm);
72 
73 static void devmem_destroy(void *arg);
74 static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem);
75 
76 static int
77 vmm_priv_check(struct ucred *ucred)
78 {
79 	if (jailed(ucred) &&
80 	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
81 		return (EPERM);
82 
83 	return (0);
84 }
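
/*
 * Added note (not in the original source): the permission checked above is
 * the "allow.vmm" jail parameter registered in vmmdev_init().  A jail that
 * is meant to run vmm(4) consumers would typically be created with roughly:
 *
 *	jail -c name=vmmjail path=/ allow.vmm persist
 *
 * This is only a sketch; the jail name and remaining parameters are up to
 * the administrator.
 */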
85 
86 static int
87 vcpu_lock_one(struct vcpu *vcpu)
88 {
89 	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
90 }
91 
92 static void
93 vcpu_unlock_one(struct vcpu *vcpu)
94 {
95 	enum vcpu_state state;
96 
97 	state = vcpu_get_state(vcpu, NULL);
98 	if (state != VCPU_FROZEN) {
99 		panic("vcpu %s(%d) has invalid state %d",
100 		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
101 	}
102 
103 	vcpu_set_state(vcpu, VCPU_IDLE, false);
104 }
105 
106 static int
107 vcpu_lock_all(struct vmmdev_softc *sc)
108 {
109 	struct vcpu *vcpu;
110 	int error;
111 	uint16_t i, j, maxcpus;
112 
113 	error = 0;
114 	vm_slock_vcpus(sc->vm);
115 	maxcpus = vm_get_maxcpus(sc->vm);
116 	for (i = 0; i < maxcpus; i++) {
117 		vcpu = vm_vcpu(sc->vm, i);
118 		if (vcpu == NULL)
119 			continue;
120 		error = vcpu_lock_one(vcpu);
121 		if (error)
122 			break;
123 	}
124 
125 	if (error) {
126 		for (j = 0; j < i; j++) {
127 			vcpu = vm_vcpu(sc->vm, j);
128 			if (vcpu == NULL)
129 				continue;
130 			vcpu_unlock_one(vcpu);
131 		}
132 		vm_unlock_vcpus(sc->vm);
133 	}
134 
135 	return (error);
136 }
137 
138 static void
139 vcpu_unlock_all(struct vmmdev_softc *sc)
140 {
141 	struct vcpu *vcpu;
142 	uint16_t i, maxcpus;
143 
144 	maxcpus = vm_get_maxcpus(sc->vm);
145 	for (i = 0; i < maxcpus; i++) {
146 		vcpu = vm_vcpu(sc->vm, i);
147 		if (vcpu == NULL)
148 			continue;
149 		vcpu_unlock_one(vcpu);
150 	}
151 	vm_unlock_vcpus(sc->vm);
152 }
153 
154 static struct vmmdev_softc *
155 vmmdev_lookup(const char *name, struct ucred *cred)
156 {
157 	struct vmmdev_softc *sc;
158 
159 	mtx_assert(&vmmdev_mtx, MA_OWNED);
160 
161 	SLIST_FOREACH(sc, &head, link) {
162 		if (strcmp(name, vm_name(sc->vm)) == 0)
163 			break;
164 	}
165 
166 	if (sc == NULL)
167 		return (NULL);
168 
169 	if (cr_cansee(cred, sc->ucred))
170 		return (NULL);
171 
172 	return (sc);
173 }
174 
175 static struct vmmdev_softc *
176 vmmdev_lookup2(struct cdev *cdev)
177 {
178 	return (cdev->si_drv1);
179 }
180 
181 static int
182 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
183 {
184 	int error, off, c, prot;
185 	vm_paddr_t gpa, maxaddr;
186 	void *hpa, *cookie;
187 	struct vmmdev_softc *sc;
188 
189 	sc = vmmdev_lookup2(cdev);
190 	if (sc == NULL)
191 		return (ENXIO);
192 
193 	/*
194 	 * Get a read lock on the guest memory map.
195 	 */
196 	vm_slock_memsegs(sc->vm);
197 	error = 0;
198 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
199 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
200 	while (uio->uio_resid > 0 && error == 0) {
201 		gpa = uio->uio_offset;
202 		off = gpa & PAGE_MASK;
203 		c = min(uio->uio_resid, PAGE_SIZE - off);
204 
205 		/*
206 		 * The VM has a hole in its physical memory map. If we want to
207 		 * use 'dd' to inspect memory beyond the hole we need to
208 		 * provide bogus data for memory that lies in the hole.
209 		 *
210 		 * Since this device does not support lseek(2), dd(1) will
211 		 * read(2) blocks of data to simulate the lseek(2).
212 		 */
213 		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
214 		if (hpa == NULL) {
215 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
216 				error = uiomove(__DECONST(void *, zero_region),
217 				    c, uio);
218 			else
219 				error = EFAULT;
220 		} else {
221 			error = uiomove(hpa, c, uio);
222 			vm_gpa_release(cookie);
223 		}
224 	}
225 	vm_unlock_memsegs(sc->vm);
226 	return (error);
227 }
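
/*
 * Usage sketch (an assumption, not part of the original source): since the
 * cdev only implements d_read/d_write, dd(1) reaches a given guest-physical
 * offset by reading blocks up to it, as the comment above describes.  For
 * example, something like
 *
 *	dd if=/dev/vmm/myvm bs=4k skip=256 count=1 | hexdump -C
 *
 * would dump the guest page at physical address 1 MB, with pages that fall
 * into holes below the top of guest system memory reading back as zeroes.
 */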
228 
229 CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);
230 
231 static int
232 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
233 {
234 	struct devmem_softc *dsc;
235 	int error;
236 	bool sysmem;
237 
238 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
239 	if (error || mseg->len == 0)
240 		return (error);
241 
242 	if (!sysmem) {
243 		SLIST_FOREACH(dsc, &sc->devmem, link) {
244 			if (dsc->segid == mseg->segid)
245 				break;
246 		}
247 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
248 		    __func__, mseg->segid));
249 		error = copystr(dsc->name, mseg->name, len, NULL);
250 	} else {
251 		bzero(mseg->name, len);
252 	}
253 
254 	return (error);
255 }
256 
257 static int
258 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
259 {
260 	char *name;
261 	int error;
262 	bool sysmem;
263 
264 	error = 0;
265 	name = NULL;
266 	sysmem = true;
267 
268 	/*
269 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
270 	 * be stripped off when devfs processes the full string.
271 	 */
272 	if (VM_MEMSEG_NAME(mseg)) {
273 		sysmem = false;
274 		name = malloc(len, M_VMMDEV, M_WAITOK);
275 		error = copystr(mseg->name, name, len, NULL);
276 		if (error)
277 			goto done;
278 	}
279 
280 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
281 	if (error)
282 		goto done;
283 
284 	if (VM_MEMSEG_NAME(mseg)) {
285 		error = devmem_create_cdev(sc, mseg->segid, name);
286 		if (error)
287 			vm_free_memseg(sc->vm, mseg->segid);
288 		else
289 			name = NULL;	/* freed when 'cdev' is destroyed */
290 	}
291 done:
292 	free(name, M_VMMDEV);
293 	return (error);
294 }
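
/*
 * Illustrative sketch (assumption): a named, non-system segment allocated
 * here is what backs a devmem node.  From userspace the request could look
 * roughly like this ('vmfd' is a hypothetical descriptor for /dev/vmm/<name>
 * and the segment id is just an example value):
 *
 *	struct vm_memseg memseg = {
 *		.segid = 1,
 *		.len = 1024 * 1024,
 *		.name = "bootrom",
 *	};
 *	(void)ioctl(vmfd, VM_ALLOC_MEMSEG, &memseg);
 *
 * after which the backing object is reachable via
 * /dev/vmm.io/<vmname>.bootrom (see devmem_create_cdev() below).  Real
 * consumers normally go through libvmmapi rather than issuing the ioctl
 * directly.
 */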
295 
296 static int
297 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
298     uint64_t *regval)
299 {
300 	int error, i;
301 
302 	error = 0;
303 	for (i = 0; i < count; i++) {
304 		error = vm_get_register(vcpu, regnum[i], &regval[i]);
305 		if (error)
306 			break;
307 	}
308 	return (error);
309 }
310 
311 static int
312 vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
313     uint64_t *regval)
314 {
315 	int error, i;
316 
317 	error = 0;
318 	for (i = 0; i < count; i++) {
319 		error = vm_set_register(vcpu, regnum[i], regval[i]);
320 		if (error)
321 			break;
322 	}
323 	return (error);
324 }
325 
326 static int
327 vmmdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
328 {
329 	struct vmmdev_softc *sc;
330 	int error;
331 
332 	sc = vmmdev_lookup2(dev);
333 	KASSERT(sc != NULL, ("%s: device not found", __func__));
334 
335 	/*
336 	 * A user can only access VMs that they themselves have created.
337 	 */
338 	if (td->td_ucred != sc->ucred)
339 		return (EPERM);
340 
341 	/*
342 	 * A jail without vmm access shouldn't be able to access vmm device
343 	 * files at all, but check here just to be thorough.
344 	 */
345 	error = vmm_priv_check(td->td_ucred);
346 	if (error != 0)
347 		return (error);
348 
349 	return (0);
350 }
351 
352 static const struct vmmdev_ioctl vmmdev_ioctls[] = {
353 	VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
354 	VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
355 	VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
356 	VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
357 	VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
358 	VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
359 	VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU),
360 	VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU),
361 	VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
362 
363 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
364 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12,
365 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
366 #endif
367 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
368 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
369 	VMMDEV_IOCTL(VM_MMAP_MEMSEG,
370 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
371 	VMMDEV_IOCTL(VM_MUNMAP_MEMSEG,
372 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
373 	VMMDEV_IOCTL(VM_REINIT,
374 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
375 
376 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
377 	VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
378 #endif
379 	VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
380 	VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),
381 
382 	VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
383 	VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
384 
385 	VMMDEV_IOCTL(VM_SUSPEND, 0),
386 	VMMDEV_IOCTL(VM_GET_CPUS, 0),
387 	VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0),
388 	VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0),
389 };
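
/*
 * Added commentary: each entry above pairs an ioctl command with the locking
 * that vmmdev_ioctl() must establish before dispatching it.  SLOCK_MEMSEGS
 * and XLOCK_MEMSEGS take the guest memory-segment lock shared or exclusive,
 * LOCK_ONE_VCPU freezes the vcpu named by the argument structure,
 * LOCK_ALL_VCPUS freezes every vcpu, and the (MAYBE_)ALLOC_VCPU flags look up
 * the target vcpu, creating it on first use; MAYBE_ALLOC_VCPU additionally
 * accepts a vcpuid of -1, which individual ioctls interpret (e.g. to mean
 * all vcpus).
 */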
390 
391 static int
392 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
393     struct thread *td)
394 {
395 	struct vmmdev_softc *sc;
396 	struct vcpu *vcpu;
397 	const struct vmmdev_ioctl *ioctl;
398 	int error, vcpuid;
399 
400 	sc = vmmdev_lookup2(cdev);
401 	if (sc == NULL)
402 		return (ENXIO);
403 
404 	ioctl = NULL;
405 	for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) {
406 		if (vmmdev_ioctls[i].cmd == cmd) {
407 			ioctl = &vmmdev_ioctls[i];
408 			break;
409 		}
410 	}
411 	if (ioctl == NULL) {
412 		for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) {
413 			if (vmmdev_machdep_ioctls[i].cmd == cmd) {
414 				ioctl = &vmmdev_machdep_ioctls[i];
415 				break;
416 			}
417 		}
418 	}
419 	if (ioctl == NULL)
420 		return (ENOTTY);
421 
422 	if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0)
423 		vm_xlock_memsegs(sc->vm);
424 	else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0)
425 		vm_slock_memsegs(sc->vm);
426 
427 	vcpu = NULL;
428 	vcpuid = -1;
429 	if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU |
430 	    VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) {
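		/*
		 * Added note: this dereference relies on the convention that
		 * every vcpu-scoped ioctl argument structure begins with an
		 * int holding the target vcpu id (e.g. the cpuid field of
		 * struct vm_register), which lets this generic code pick the
		 * vcpu before the command-specific handler runs.
		 */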
431 		vcpuid = *(int *)data;
432 		if (vcpuid == -1) {
433 			if ((ioctl->flags &
434 			    VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) {
435 				error = EINVAL;
436 				goto lockfail;
437 			}
438 		} else {
439 			vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
440 			if (vcpu == NULL) {
441 				error = EINVAL;
442 				goto lockfail;
443 			}
444 			if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) {
445 				error = vcpu_lock_one(vcpu);
446 				if (error)
447 					goto lockfail;
448 			}
449 		}
450 	}
451 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) {
452 		error = vcpu_lock_all(sc);
453 		if (error)
454 			goto lockfail;
455 	}
456 
457 	switch (cmd) {
458 	case VM_SUSPEND: {
459 		struct vm_suspend *vmsuspend;
460 
461 		vmsuspend = (struct vm_suspend *)data;
462 		error = vm_suspend(sc->vm, vmsuspend->how);
463 		break;
464 	}
465 	case VM_REINIT:
466 		error = vm_reinit(sc->vm);
467 		break;
468 	case VM_STAT_DESC: {
469 		struct vm_stat_desc *statdesc;
470 
471 		statdesc = (struct vm_stat_desc *)data;
472 		error = vmm_stat_desc_copy(statdesc->index, statdesc->desc,
473 		    sizeof(statdesc->desc));
474 		break;
475 	}
476 	case VM_STATS: {
477 		struct vm_stats *vmstats;
478 
479 		vmstats = (struct vm_stats *)data;
480 		getmicrotime(&vmstats->tv);
481 		error = vmm_stat_copy(vcpu, vmstats->index,
482 		    nitems(vmstats->statbuf), &vmstats->num_entries,
483 		    vmstats->statbuf);
484 		break;
485 	}
486 	case VM_MMAP_GETNEXT: {
487 		struct vm_memmap *mm;
488 
489 		mm = (struct vm_memmap *)data;
490 		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
491 		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
492 		break;
493 	}
494 	case VM_MMAP_MEMSEG: {
495 		struct vm_memmap *mm;
496 
497 		mm = (struct vm_memmap *)data;
498 		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
499 		    mm->len, mm->prot, mm->flags);
500 		break;
501 	}
502 	case VM_MUNMAP_MEMSEG: {
503 		struct vm_munmap *mu;
504 
505 		mu = (struct vm_munmap *)data;
506 		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
507 		break;
508 	}
509 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
510 	case VM_ALLOC_MEMSEG_12:
511 		error = alloc_memseg(sc, (struct vm_memseg *)data,
512 		    sizeof(((struct vm_memseg_12 *)0)->name));
513 		break;
514 	case VM_GET_MEMSEG_12:
515 		error = get_memseg(sc, (struct vm_memseg *)data,
516 		    sizeof(((struct vm_memseg_12 *)0)->name));
517 		break;
518 #endif
519 	case VM_ALLOC_MEMSEG:
520 		error = alloc_memseg(sc, (struct vm_memseg *)data,
521 		    sizeof(((struct vm_memseg *)0)->name));
522 		break;
523 	case VM_GET_MEMSEG:
524 		error = get_memseg(sc, (struct vm_memseg *)data,
525 		    sizeof(((struct vm_memseg *)0)->name));
526 		break;
527 	case VM_GET_REGISTER: {
528 		struct vm_register *vmreg;
529 
530 		vmreg = (struct vm_register *)data;
531 		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
532 		break;
533 	}
534 	case VM_SET_REGISTER: {
535 		struct vm_register *vmreg;
536 
537 		vmreg = (struct vm_register *)data;
538 		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
539 		break;
540 	}
541 	case VM_GET_REGISTER_SET: {
542 		struct vm_register_set *vmregset;
543 		uint64_t *regvals;
544 		int *regnums;
545 
546 		vmregset = (struct vm_register_set *)data;
547 		if (vmregset->count > VM_REG_LAST) {
548 			error = EINVAL;
549 			break;
550 		}
551 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
552 		    M_WAITOK);
553 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
554 		    M_WAITOK);
555 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
556 		    vmregset->count);
557 		if (error == 0)
558 			error = vm_get_register_set(vcpu,
559 			    vmregset->count, regnums, regvals);
560 		if (error == 0)
561 			error = copyout(regvals, vmregset->regvals,
562 			    sizeof(regvals[0]) * vmregset->count);
563 		free(regvals, M_VMMDEV);
564 		free(regnums, M_VMMDEV);
565 		break;
566 	}
567 	case VM_SET_REGISTER_SET: {
568 		struct vm_register_set *vmregset;
569 		uint64_t *regvals;
570 		int *regnums;
571 
572 		vmregset = (struct vm_register_set *)data;
573 		if (vmregset->count > VM_REG_LAST) {
574 			error = EINVAL;
575 			break;
576 		}
577 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
578 		    M_WAITOK);
579 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
580 		    M_WAITOK);
581 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
582 		    vmregset->count);
583 		if (error == 0)
584 			error = copyin(vmregset->regvals, regvals,
585 			    sizeof(regvals[0]) * vmregset->count);
586 		if (error == 0)
587 			error = vm_set_register_set(vcpu,
588 			    vmregset->count, regnums, regvals);
589 		free(regvals, M_VMMDEV);
590 		free(regnums, M_VMMDEV);
591 		break;
592 	}
593 	case VM_GET_CAPABILITY: {
594 		struct vm_capability *vmcap;
595 
596 		vmcap = (struct vm_capability *)data;
597 		error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval);
598 		break;
599 	}
600 	case VM_SET_CAPABILITY: {
601 		struct vm_capability *vmcap;
602 
603 		vmcap = (struct vm_capability *)data;
604 		error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval);
605 		break;
606 	}
607 	case VM_ACTIVATE_CPU:
608 		error = vm_activate_cpu(vcpu);
609 		break;
610 	case VM_GET_CPUS: {
611 		struct vm_cpuset *vm_cpuset;
612 		cpuset_t *cpuset;
613 		int size;
614 
615 		error = 0;
616 		vm_cpuset = (struct vm_cpuset *)data;
617 		size = vm_cpuset->cpusetsize;
618 		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
619 			error = ERANGE;
620 			break;
621 		}
622 		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
623 		    M_WAITOK | M_ZERO);
624 		if (vm_cpuset->which == VM_ACTIVE_CPUS)
625 			*cpuset = vm_active_cpus(sc->vm);
626 		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
627 			*cpuset = vm_suspended_cpus(sc->vm);
628 		else if (vm_cpuset->which == VM_DEBUG_CPUS)
629 			*cpuset = vm_debug_cpus(sc->vm);
630 		else
631 			error = EINVAL;
632 		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
633 			error = ERANGE;
634 		if (error == 0)
635 			error = copyout(cpuset, vm_cpuset->cpus, size);
636 		free(cpuset, M_TEMP);
637 		break;
638 	}
639 	case VM_SUSPEND_CPU:
640 		error = vm_suspend_cpu(sc->vm, vcpu);
641 		break;
642 	case VM_RESUME_CPU:
643 		error = vm_resume_cpu(sc->vm, vcpu);
644 		break;
645 	case VM_SET_TOPOLOGY: {
646 		struct vm_cpu_topology *topology;
647 
648 		topology = (struct vm_cpu_topology *)data;
649 		error = vm_set_topology(sc->vm, topology->sockets,
650 		    topology->cores, topology->threads, topology->maxcpus);
651 		break;
652 	}
653 	case VM_GET_TOPOLOGY: {
654 		struct vm_cpu_topology *topology;
655 
656 		topology = (struct vm_cpu_topology *)data;
657 		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
658 		    &topology->threads, &topology->maxcpus);
659 		error = 0;
660 		break;
661 	}
662 	default:
663 		error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag,
664 		    td);
665 		break;
666 	}
667 
668 	if ((ioctl->flags &
669 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
670 		vm_unlock_memsegs(sc->vm);
671 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0)
672 		vcpu_unlock_all(sc);
673 	else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0)
674 		vcpu_unlock_one(vcpu);
675 
676 	/*
677 	 * Make sure that no handler returns a kernel-internal
678 	 * error value to userspace.
679 	 */
680 	KASSERT(error == ERESTART || error >= 0,
681 	    ("vmmdev_ioctl: invalid error return %d", error));
682 	return (error);
683 
684 lockfail:
685 	if ((ioctl->flags &
686 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
687 		vm_unlock_memsegs(sc->vm);
688 	return (error);
689 }
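
/*
 * Userspace sketch (an assumption, not part of this file): with a descriptor
 * obtained by opening /dev/vmm/<name>, a single-register read goes through
 * the VM_GET_REGISTER case above roughly as follows ('vmfd' is hypothetical,
 * the register constant shown is the amd64 instruction pointer, and error
 * handling is omitted):
 *
 *	struct vm_register vmreg;
 *
 *	vmreg.cpuid = 0;
 *	vmreg.regnum = VM_REG_GUEST_RIP;
 *	if (ioctl(vmfd, VM_GET_REGISTER, &vmreg) == 0)
 *		printf("rip = %#lx\n", vmreg.regval);
 *
 * Because the table entry for VM_GET_REGISTER carries
 * VMMDEV_IOCTL_LOCK_ONE_VCPU, vcpu 0 is frozen for the duration of the call.
 */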
690 
691 static int
692 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
693     struct vm_object **objp, int nprot)
694 {
695 	struct vmmdev_softc *sc;
696 	vm_paddr_t gpa;
697 	size_t len;
698 	vm_ooffset_t segoff, first, last;
699 	int error, found, segid;
700 	bool sysmem;
701 
702 	first = *offset;
703 	last = first + mapsize;
704 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
705 		return (EINVAL);
706 
707 	sc = vmmdev_lookup2(cdev);
708 	if (sc == NULL) {
709 		/* virtual machine is in the process of being created */
710 		return (EINVAL);
711 	}
712 
713 	/*
714 	 * Get a read lock on the guest memory map.
715 	 */
716 	vm_slock_memsegs(sc->vm);
717 
718 	gpa = 0;
719 	found = 0;
720 	while (!found) {
721 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
722 		    NULL, NULL);
723 		if (error)
724 			break;
725 
726 		if (first >= gpa && last <= gpa + len)
727 			found = 1;
728 		else
729 			gpa += len;
730 	}
731 
732 	if (found) {
733 		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
734 		KASSERT(error == 0 && *objp != NULL,
735 		    ("%s: invalid memory segment %d", __func__, segid));
736 		if (sysmem) {
737 			vm_object_reference(*objp);
738 			*offset = segoff + (first - gpa);
739 		} else {
740 			error = EINVAL;
741 		}
742 	}
743 	vm_unlock_memsegs(sc->vm);
744 	return (error);
745 }
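
/*
 * Usage sketch (assumption): the handler above is what serves mmap(2) on the
 * VM cdev itself, with the file offset interpreted as a guest-physical
 * address that must fall entirely within one sysmem mapping, e.g. roughly:
 *
 *	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    vmfd, 0x100000);
 *
 * to map the guest page at GPA 1 MB into the calling process ('vmfd' is a
 * hypothetical descriptor for /dev/vmm/<name>).
 */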
746 
747 static void
748 vmmdev_destroy(struct vmmdev_softc *sc)
749 {
750 	struct devmem_softc *dsc;
751 	int error __diagused;
752 
753 	/*
754 	 * Destroy all cdevs:
755 	 *
756 	 * - any new operations on the 'cdev' will return an error (ENXIO).
757 	 *
758 	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
759 	 */
760 	SLIST_FOREACH(dsc, &sc->devmem, link) {
761 		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
762 		destroy_dev(dsc->cdev);
763 		devmem_destroy(dsc);
764 	}
765 
766 	vm_disable_vcpu_creation(sc->vm);
767 	error = vcpu_lock_all(sc);
768 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
769 	vm_unlock_vcpus(sc->vm);
770 
771 	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
772 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
773 		SLIST_REMOVE_HEAD(&sc->devmem, link);
774 		free(dsc->name, M_VMMDEV);
775 		free(dsc, M_VMMDEV);
776 	}
777 
778 	if (sc->cdev != NULL)
779 		destroy_dev(sc->cdev);
780 
781 	if (sc->vm != NULL)
782 		vm_destroy(sc->vm);
783 
784 	if (sc->ucred != NULL)
785 		crfree(sc->ucred);
786 
787 	if ((sc->flags & VSC_LINKED) != 0) {
788 		mtx_lock(&vmmdev_mtx);
789 		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
790 		mtx_unlock(&vmmdev_mtx);
791 	}
792 
793 	free(sc, M_VMMDEV);
794 }
795 
796 static int
797 vmmdev_lookup_and_destroy(const char *name, struct ucred *cred)
798 {
799 	struct cdev *cdev;
800 	struct vmmdev_softc *sc;
801 
802 	mtx_lock(&vmmdev_mtx);
803 	sc = vmmdev_lookup(name, cred);
804 	if (sc == NULL || sc->cdev == NULL) {
805 		mtx_unlock(&vmmdev_mtx);
806 		return (EINVAL);
807 	}
808 
809 	/*
810 	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
811 	 * is scheduled for destruction.
812 	 */
813 	cdev = sc->cdev;
814 	sc->cdev = NULL;
815 	mtx_unlock(&vmmdev_mtx);
816 
817 	destroy_dev(cdev);
818 	vmmdev_destroy(sc);
819 
820 	return (0);
821 }
822 
823 static int
824 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
825 {
826 	char *buf;
827 	int error, buflen;
828 
829 	error = vmm_priv_check(req->td->td_ucred);
830 	if (error)
831 		return (error);
832 
833 	buflen = VM_MAX_NAMELEN + 1;
834 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
835 	strlcpy(buf, "beavis", buflen);
836 	error = sysctl_handle_string(oidp, buf, buflen, req);
837 	if (error == 0 && req->newptr != NULL)
838 		error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred);
839 	free(buf, M_VMMDEV);
840 	return (error);
841 }
842 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
843     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
844     NULL, 0, sysctl_vmm_destroy, "A",
845     NULL);
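
/*
 * Usage sketch (assumption): an existing VM can be torn down by writing its
 * name to this node, subject to the same jail/credential checks as creation:
 *
 *	sysctl hw.vmm.destroy=myvm
 */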
846 
847 static struct cdevsw vmmdevsw = {
848 	.d_name		= "vmmdev",
849 	.d_version	= D_VERSION,
850 	.d_open		= vmmdev_open,
851 	.d_ioctl	= vmmdev_ioctl,
852 	.d_mmap_single	= vmmdev_mmap_single,
853 	.d_read		= vmmdev_rw,
854 	.d_write	= vmmdev_rw,
855 };
856 
857 static struct vmmdev_softc *
858 vmmdev_alloc(struct vm *vm, struct ucred *cred)
859 {
860 	struct vmmdev_softc *sc;
861 
862 	sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO);
863 	SLIST_INIT(&sc->devmem);
864 	sc->vm = vm;
865 	sc->ucred = crhold(cred);
866 	return (sc);
867 }
868 
869 static int
870 vmmdev_create(const char *name, struct ucred *cred)
871 {
872 	struct cdev *cdev;
873 	struct vmmdev_softc *sc, *sc2;
874 	struct vm *vm;
875 	int error;
876 
877 	mtx_lock(&vmmdev_mtx);
878 	sc = vmmdev_lookup(name, cred);
879 	mtx_unlock(&vmmdev_mtx);
880 	if (sc != NULL)
881 		return (EEXIST);
882 
883 	error = vm_create(name, &vm);
884 	if (error != 0)
885 		return (error);
886 
887 	sc = vmmdev_alloc(vm, cred);
888 
889 	/*
890 	 * Look up the name again just in case somebody sneaked in when we
891 	 * dropped the lock.
892 	 */
893 	mtx_lock(&vmmdev_mtx);
894 	sc2 = vmmdev_lookup(name, cred);
895 	if (sc2 != NULL) {
896 		mtx_unlock(&vmmdev_mtx);
897 		vmmdev_destroy(sc);
898 		return (EEXIST);
899 	}
900 	sc->flags |= VSC_LINKED;
901 	SLIST_INSERT_HEAD(&head, sc, link);
902 	mtx_unlock(&vmmdev_mtx);
903 
904 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred,
905 	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", name);
906 	if (error != 0) {
907 		vmmdev_destroy(sc);
908 		return (error);
909 	}
910 
911 	mtx_lock(&vmmdev_mtx);
912 	sc->cdev = cdev;
913 	sc->cdev->si_drv1 = sc;
914 	mtx_unlock(&vmmdev_mtx);
915 
916 	return (0);
917 }
918 
919 static int
920 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
921 {
922 	char *buf;
923 	int error, buflen;
924 
925 	error = vmm_priv_check(req->td->td_ucred);
926 	if (error != 0)
927 		return (error);
928 
929 	buflen = VM_MAX_NAMELEN + 1;
930 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
931 	strlcpy(buf, "beavis", buflen);
932 	error = sysctl_handle_string(oidp, buf, buflen, req);
933 	if (error == 0 && req->newptr != NULL)
934 		error = vmmdev_create(buf, req->td->td_ucred);
935 	free(buf, M_VMMDEV);
936 	return (error);
937 }
938 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
939     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
940     NULL, 0, sysctl_vmm_create, "A",
941     NULL);
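
/*
 * Usage sketch (assumption): writing a name here creates the VM and its
 * /dev/vmm/<name> node, for example:
 *
 *	sysctl hw.vmm.create=myvm
 *
 * libvmmapi's vm_create() is believed to drive this sysctl on behalf of
 * bhyve(8), so it is rarely invoked by hand.
 */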
942 
943 void
944 vmmdev_init(void)
945 {
946 	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
947 	    "Allow use of vmm in a jail.");
948 }
949 
950 int
951 vmmdev_cleanup(void)
952 {
953 	int error;
954 
955 	if (SLIST_EMPTY(&head))
956 		error = 0;
957 	else
958 		error = EBUSY;
959 
960 	return (error);
961 }
962 
963 static int
964 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
965     struct vm_object **objp, int nprot)
966 {
967 	struct devmem_softc *dsc;
968 	vm_ooffset_t first, last;
969 	size_t seglen;
970 	int error;
971 	bool sysmem;
972 
973 	dsc = cdev->si_drv1;
974 	if (dsc == NULL) {
975 		/* 'cdev' has been created but is not ready for use */
976 		return (ENXIO);
977 	}
978 
979 	first = *offset;
980 	last = *offset + len;
981 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
982 		return (EINVAL);
983 
984 	vm_slock_memsegs(dsc->sc->vm);
985 
986 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
987 	KASSERT(error == 0 && !sysmem && *objp != NULL,
988 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
989 
990 	if (seglen >= last)
991 		vm_object_reference(*objp);
992 	else
993 		error = EINVAL;
994 
995 	vm_unlock_memsegs(dsc->sc->vm);
996 	return (error);
997 }
998 
999 static struct cdevsw devmemsw = {
1000 	.d_name		= "devmem",
1001 	.d_version	= D_VERSION,
1002 	.d_mmap_single	= devmem_mmap_single,
1003 };
1004 
1005 static int
1006 devmem_create_cdev(struct vmmdev_softc *sc, int segid, char *devname)
1007 {
1008 	struct devmem_softc *dsc;
1009 	struct cdev *cdev;
1010 	const char *vmname;
1011 	int error;
1012 
1013 	vmname = vm_name(sc->vm);
1014 
1015 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, sc->ucred,
1016 	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
1017 	if (error)
1018 		return (error);
1019 
1020 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1021 
1022 	mtx_lock(&vmmdev_mtx);
1023 	if (sc->cdev == NULL) {
1024 		/* virtual machine is being created or destroyed */
1025 		mtx_unlock(&vmmdev_mtx);
1026 		free(dsc, M_VMMDEV);
1027 		destroy_dev_sched_cb(cdev, NULL, 0);
1028 		return (ENODEV);
1029 	}
1030 
1031 	dsc->segid = segid;
1032 	dsc->name = devname;
1033 	dsc->cdev = cdev;
1034 	dsc->sc = sc;
1035 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
1036 	mtx_unlock(&vmmdev_mtx);
1037 
1038 	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
1039 	cdev->si_drv1 = dsc;
1040 	return (0);
1041 }
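
/*
 * Added commentary: for a VM named "guest" with a devmem segment named
 * "bootrom", the format string above yields the node
 * /dev/vmm.io/guest.bootrom; mmap(2) of that node is served by
 * devmem_mmap_single() above, with the file offset taken relative to the
 * start of the segment.
 */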
1042 
1043 static void
1044 devmem_destroy(void *arg)
1045 {
1046 	struct devmem_softc *dsc = arg;
1047 
1048 	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
1049 	dsc->cdev = NULL;
1050 	dsc->sc = NULL;
1051 }
1052