xref: /freebsd/sys/dev/vmm/vmm_dev.c (revision 1b9cfd6a625dc82611846cb9a53c1886f7af3758)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
6  * All rights reserved.
7  */
8 
9 #include <sys/param.h>
10 #include <sys/conf.h>
11 #include <sys/ioccom.h>
12 #include <sys/jail.h>
13 #include <sys/kernel.h>
14 #include <sys/malloc.h>
15 #include <sys/mman.h>
16 #include <sys/proc.h>
17 #include <sys/queue.h>
18 #include <sys/sx.h>
19 #include <sys/sysctl.h>
20 #include <sys/ucred.h>
21 #include <sys/uio.h>
22 
23 #include <machine/vmm.h>
24 
25 #include <vm/vm.h>
26 #include <vm/vm_object.h>
27 
28 #include <dev/vmm/vmm_dev.h>
29 #include <dev/vmm/vmm_stat.h>
30 
31 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
32 struct vm_memseg_12 {
33 	int		segid;
34 	size_t		len;
35 	char		name[64];
36 };
37 _Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI");
38 
39 #define	VM_ALLOC_MEMSEG_12	\
40 	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12)
41 #define	VM_GET_MEMSEG_12	\
42 	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12)
43 #endif
44 
45 struct devmem_softc {
46 	int	segid;
47 	char	*name;
48 	struct cdev *cdev;
49 	struct vmmdev_softc *sc;
50 	SLIST_ENTRY(devmem_softc) link;
51 };
52 
53 struct vmmdev_softc {
54 	struct vm	*vm;		/* vm instance cookie */
55 	struct cdev	*cdev;
56 	struct ucred	*ucred;
57 	SLIST_ENTRY(vmmdev_softc) link;
58 	SLIST_HEAD(, devmem_softc) devmem;
59 	int		flags;
60 };
61 
62 static SLIST_HEAD(, vmmdev_softc) head;
63 
64 static unsigned pr_allow_flag;
65 static struct sx vmmdev_mtx;
66 SX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex");
67 
68 static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
69 
70 SYSCTL_DECL(_hw_vmm);
71 
72 static void devmem_destroy(void *arg);
73 static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem);
74 
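/*
 * Illustrative note (not part of the original source): the pr_allow_flag
 * tested here is registered in vmmdev_init() as the "vmm" jail allow
 * permission, so a jail needs e.g.
 *
 *	# jail -m name=myjail allow.vmm=1
 *
 * before its credentials pass this check; otherwise EPERM is returned.
 */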
75 static int
76 vmm_priv_check(struct ucred *ucred)
77 {
78 	if (jailed(ucred) &&
79 	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
80 		return (EPERM);
81 
82 	return (0);
83 }
84 
85 static int
86 vcpu_lock_one(struct vcpu *vcpu)
87 {
88 	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
89 }
90 
91 static void
92 vcpu_unlock_one(struct vcpu *vcpu)
93 {
94 	enum vcpu_state state;
95 
96 	state = vcpu_get_state(vcpu, NULL);
97 	if (state != VCPU_FROZEN) {
98 		panic("vcpu %s(%d) has invalid state %d",
99 		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
100 	}
101 
102 	vcpu_set_state(vcpu, VCPU_IDLE, false);
103 }
104 
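/*
 * Freeze all vCPUs of the VM.  The vCPU list lock is held shared across
 * the operation; on failure every vCPU frozen so far is returned to
 * VCPU_IDLE and the list lock is dropped, so callers never observe a
 * partially frozen VM.  On success the lock remains held until
 * vcpu_unlock_all() is called.
 */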
105 static int
106 vcpu_lock_all(struct vmmdev_softc *sc)
107 {
108 	struct vcpu *vcpu;
109 	int error;
110 	uint16_t i, j, maxcpus;
111 
112 	error = 0;
113 	vm_slock_vcpus(sc->vm);
114 	maxcpus = vm_get_maxcpus(sc->vm);
115 	for (i = 0; i < maxcpus; i++) {
116 		vcpu = vm_vcpu(sc->vm, i);
117 		if (vcpu == NULL)
118 			continue;
119 		error = vcpu_lock_one(vcpu);
120 		if (error)
121 			break;
122 	}
123 
124 	if (error) {
125 		for (j = 0; j < i; j++) {
126 			vcpu = vm_vcpu(sc->vm, j);
127 			if (vcpu == NULL)
128 				continue;
129 			vcpu_unlock_one(vcpu);
130 		}
131 		vm_unlock_vcpus(sc->vm);
132 	}
133 
134 	return (error);
135 }
136 
137 static void
138 vcpu_unlock_all(struct vmmdev_softc *sc)
139 {
140 	struct vcpu *vcpu;
141 	uint16_t i, maxcpus;
142 
143 	maxcpus = vm_get_maxcpus(sc->vm);
144 	for (i = 0; i < maxcpus; i++) {
145 		vcpu = vm_vcpu(sc->vm, i);
146 		if (vcpu == NULL)
147 			continue;
148 		vcpu_unlock_one(vcpu);
149 	}
150 	vm_unlock_vcpus(sc->vm);
151 }
152 
153 static struct vmmdev_softc *
154 vmmdev_lookup(const char *name, struct ucred *cred)
155 {
156 	struct vmmdev_softc *sc;
157 
158 	sx_assert(&vmmdev_mtx, SA_XLOCKED);
159 
160 	SLIST_FOREACH(sc, &head, link) {
161 		if (strcmp(name, vm_name(sc->vm)) == 0)
162 			break;
163 	}
164 
165 	if (sc == NULL)
166 		return (NULL);
167 
168 	if (cr_cansee(cred, sc->ucred))
169 		return (NULL);
170 
171 	return (sc);
172 }
173 
174 static struct vmmdev_softc *
175 vmmdev_lookup2(struct cdev *cdev)
176 {
177 	return (cdev->si_drv1);
178 }
179 
180 static int
181 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
182 {
183 	int error, off, c, prot;
184 	vm_paddr_t gpa, maxaddr;
185 	void *hpa, *cookie;
186 	struct vmmdev_softc *sc;
187 
188 	sc = vmmdev_lookup2(cdev);
189 	if (sc == NULL)
190 		return (ENXIO);
191 
192 	/*
193 	 * Get a read lock on the guest memory map.
194 	 */
195 	vm_slock_memsegs(sc->vm);
196 
	error = 0;
197 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
198 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
199 	while (uio->uio_resid > 0 && error == 0) {
200 		gpa = uio->uio_offset;
201 		off = gpa & PAGE_MASK;
202 		c = min(uio->uio_resid, PAGE_SIZE - off);
203 
204 		/*
205 		 * The VM has a hole in its physical memory map. If we want to
206 		 * use 'dd' to inspect memory beyond the hole we need to
207 		 * provide bogus data for memory that lies in the hole.
208 		 *
209 		 * Since this device does not support lseek(2), dd(1) will
210 		 * read(2) blocks of data to simulate the lseek(2).
211 		 */
212 		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
213 		if (hpa == NULL) {
214 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
215 				error = uiomove(__DECONST(void *, zero_region),
216 				    c, uio);
217 			else
218 				error = EFAULT;
219 		} else {
220 			error = uiomove(hpa, c, uio);
221 			vm_gpa_release(cookie);
222 		}
223 	}
224 	vm_unlock_memsegs(sc->vm);
225 	return (error);
226 }
227 
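/*
 * Usage sketch (illustrative, not part of the original source): since
 * d_read and d_write are backed by vmmdev_rw(), guest-physical memory can
 * be dumped from the host with dd(1) as the comment above suggests, e.g.
 * one page at GPA 0x100000 of a VM named "testvm":
 *
 *	# dd if=/dev/vmm/testvm bs=4096 skip=256 count=1 | hexdump -C
 *
 * The VM name and address are assumptions made for the example.
 */
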
228 CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);
229 
230 static int
231 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
232 {
233 	struct devmem_softc *dsc;
234 	int error;
235 	bool sysmem;
236 
237 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
238 	if (error || mseg->len == 0)
239 		return (error);
240 
241 	if (!sysmem) {
242 		SLIST_FOREACH(dsc, &sc->devmem, link) {
243 			if (dsc->segid == mseg->segid)
244 				break;
245 		}
246 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
247 		    __func__, mseg->segid));
248 		error = copystr(dsc->name, mseg->name, len, NULL);
249 	} else {
250 		bzero(mseg->name, len);
251 	}
252 
253 	return (error);
254 }
255 
256 static int
257 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
258 {
259 	char *name;
260 	int error;
261 	bool sysmem;
262 
263 	error = 0;
264 	name = NULL;
265 	sysmem = true;
266 
267 	/*
268 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
269 	 * be stripped off when devfs processes the full string.
270 	 */
271 	if (VM_MEMSEG_NAME(mseg)) {
272 		sysmem = false;
273 		name = malloc(len, M_VMMDEV, M_WAITOK);
274 		error = copystr(mseg->name, name, len, NULL);
275 		if (error)
276 			goto done;
277 	}
278 
279 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
280 	if (error)
281 		goto done;
282 
283 	if (VM_MEMSEG_NAME(mseg)) {
284 		error = devmem_create_cdev(sc, mseg->segid, name);
285 		if (error)
286 			vm_free_memseg(sc->vm, mseg->segid);
287 		else
288 			name = NULL;	/* freed when 'cdev' is destroyed */
289 	}
290 done:
291 	free(name, M_VMMDEV);
292 	return (error);
293 }
294 
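/*
 * Userland sketch (illustrative assumption; VM memory is normally set up
 * through libvmmapi rather than raw ioctls): allocating an unnamed
 * (system memory) segment with alloc_memseg() above and mapping it at
 * guest-physical address 0 could look roughly like
 *
 *	struct vm_memseg seg = { .segid = 0, .len = 128 * 1024 * 1024 };
 *	struct vm_memmap map = { .gpa = 0, .segid = 0, .segoff = 0,
 *	    .len = seg.len, .prot = VM_PROT_READ | VM_PROT_WRITE, .flags = 0 };
 *	ioctl(vmfd, VM_ALLOC_MEMSEG, &seg);
 *	ioctl(vmfd, VM_MMAP_MEMSEG, &map);
 *
 * where "vmfd" is an open descriptor for /dev/vmm/<name>; the names and
 * sizes here are examples only.
 */
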
295 static int
296 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
297     uint64_t *regval)
298 {
299 	int error, i;
300 
301 	error = 0;
302 	for (i = 0; i < count; i++) {
303 		error = vm_get_register(vcpu, regnum[i], &regval[i]);
304 		if (error)
305 			break;
306 	}
307 	return (error);
308 }
309 
310 static int
311 vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
312     uint64_t *regval)
313 {
314 	int error, i;
315 
316 	error = 0;
317 	for (i = 0; i < count; i++) {
318 		error = vm_set_register(vcpu, regnum[i], regval[i]);
319 		if (error)
320 			break;
321 	}
322 	return (error);
323 }
324 
325 static int
326 vmmdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
327 {
328 	struct vmmdev_softc *sc;
329 	int error;
330 
331 	sc = vmmdev_lookup2(dev);
332 	KASSERT(sc != NULL, ("%s: device not found", __func__));
333 
334 	/*
335 	 * A user can only access VMs that they themselves have created.
336 	 */
337 	if (td->td_ucred != sc->ucred)
338 		return (EPERM);
339 
340 	/*
341 	 * A jail without vmm access shouldn't be able to access vmm device
342 	 * files at all, but check here just to be thorough.
343 	 */
344 	error = vmm_priv_check(td->td_ucred);
345 	if (error != 0)
346 		return (error);
347 
348 	return (0);
349 }
350 
351 static const struct vmmdev_ioctl vmmdev_ioctls[] = {
352 	VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
353 	VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
354 	VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
355 	VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
356 	VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
357 	VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
358 	VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU),
359 	VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU),
360 	VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
361 
362 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
363 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12,
364 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
365 #endif
366 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
367 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
368 	VMMDEV_IOCTL(VM_MMAP_MEMSEG,
369 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
370 	VMMDEV_IOCTL(VM_MUNMAP_MEMSEG,
371 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
372 	VMMDEV_IOCTL(VM_REINIT,
373 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
374 
375 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
376 	VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
377 #endif
378 	VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
379 	VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),
380 
381 	VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
382 	VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
383 
384 	VMMDEV_IOCTL(VM_SUSPEND, 0),
385 	VMMDEV_IOCTL(VM_GET_CPUS, 0),
386 	VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0),
387 	VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0),
388 };
389 
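/*
 * Dispatch note: the flags in the table above tell vmmdev_ioctl() which
 * locks to acquire before calling the handler.  VMMDEV_IOCTL_LOCK_ONE_VCPU
 * freezes the vCPU named by the argument's vcpuid,
 * VMMDEV_IOCTL_LOCK_ALL_VCPUS freezes every vCPU,
 * VMMDEV_IOCTL_XLOCK_MEMSEGS/VMMDEV_IOCTL_SLOCK_MEMSEGS take the memory
 * segment lock exclusive or shared, and VMMDEV_IOCTL_MAYBE_ALLOC_VCPU
 * accepts a vcpuid of -1, in which case no vCPU is allocated and a NULL
 * vcpu is passed to the handler.
 */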
390 static int
391 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
392     struct thread *td)
393 {
394 	struct vmmdev_softc *sc;
395 	struct vcpu *vcpu;
396 	const struct vmmdev_ioctl *ioctl;
397 	int error, vcpuid;
398 
399 	sc = vmmdev_lookup2(cdev);
400 	if (sc == NULL)
401 		return (ENXIO);
402 
403 	ioctl = NULL;
404 	for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) {
405 		if (vmmdev_ioctls[i].cmd == cmd) {
406 			ioctl = &vmmdev_ioctls[i];
407 			break;
408 		}
409 	}
410 	if (ioctl == NULL) {
411 		for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) {
412 			if (vmmdev_machdep_ioctls[i].cmd == cmd) {
413 				ioctl = &vmmdev_machdep_ioctls[i];
414 				break;
415 			}
416 		}
417 	}
418 	if (ioctl == NULL)
419 		return (ENOTTY);
420 
421 	if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0)
422 		vm_xlock_memsegs(sc->vm);
423 	else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0)
424 		vm_slock_memsegs(sc->vm);
425 
426 	vcpu = NULL;
427 	vcpuid = -1;
428 	if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU |
429 	    VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) {
430 		vcpuid = *(int *)data;
431 		if (vcpuid == -1) {
432 			if ((ioctl->flags &
433 			    VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) {
434 				error = EINVAL;
435 				goto lockfail;
436 			}
437 		} else {
438 			vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
439 			if (vcpu == NULL) {
440 				error = EINVAL;
441 				goto lockfail;
442 			}
443 			if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) {
444 				error = vcpu_lock_one(vcpu);
445 				if (error)
446 					goto lockfail;
447 			}
448 		}
449 	}
450 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) {
451 		error = vcpu_lock_all(sc);
452 		if (error)
453 			goto lockfail;
454 	}
455 
456 	switch (cmd) {
457 	case VM_SUSPEND: {
458 		struct vm_suspend *vmsuspend;
459 
460 		vmsuspend = (struct vm_suspend *)data;
461 		error = vm_suspend(sc->vm, vmsuspend->how);
462 		break;
463 	}
464 	case VM_REINIT:
465 		error = vm_reinit(sc->vm);
466 		break;
467 	case VM_STAT_DESC: {
468 		struct vm_stat_desc *statdesc;
469 
470 		statdesc = (struct vm_stat_desc *)data;
471 		error = vmm_stat_desc_copy(statdesc->index, statdesc->desc,
472 		    sizeof(statdesc->desc));
473 		break;
474 	}
475 	case VM_STATS: {
476 		struct vm_stats *vmstats;
477 
478 		vmstats = (struct vm_stats *)data;
479 		getmicrotime(&vmstats->tv);
480 		error = vmm_stat_copy(vcpu, vmstats->index,
481 		    nitems(vmstats->statbuf), &vmstats->num_entries,
482 		    vmstats->statbuf);
483 		break;
484 	}
485 	case VM_MMAP_GETNEXT: {
486 		struct vm_memmap *mm;
487 
488 		mm = (struct vm_memmap *)data;
489 		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
490 		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
491 		break;
492 	}
493 	case VM_MMAP_MEMSEG: {
494 		struct vm_memmap *mm;
495 
496 		mm = (struct vm_memmap *)data;
497 		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
498 		    mm->len, mm->prot, mm->flags);
499 		break;
500 	}
501 	case VM_MUNMAP_MEMSEG: {
502 		struct vm_munmap *mu;
503 
504 		mu = (struct vm_munmap *)data;
505 		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
506 		break;
507 	}
508 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
509 	case VM_ALLOC_MEMSEG_12:
510 		error = alloc_memseg(sc, (struct vm_memseg *)data,
511 		    sizeof(((struct vm_memseg_12 *)0)->name));
512 		break;
513 	case VM_GET_MEMSEG_12:
514 		error = get_memseg(sc, (struct vm_memseg *)data,
515 		    sizeof(((struct vm_memseg_12 *)0)->name));
516 		break;
517 #endif
518 	case VM_ALLOC_MEMSEG:
519 		error = alloc_memseg(sc, (struct vm_memseg *)data,
520 		    sizeof(((struct vm_memseg *)0)->name));
521 		break;
522 	case VM_GET_MEMSEG:
523 		error = get_memseg(sc, (struct vm_memseg *)data,
524 		    sizeof(((struct vm_memseg *)0)->name));
525 		break;
526 	case VM_GET_REGISTER: {
527 		struct vm_register *vmreg;
528 
529 		vmreg = (struct vm_register *)data;
530 		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
531 		break;
532 	}
533 	case VM_SET_REGISTER: {
534 		struct vm_register *vmreg;
535 
536 		vmreg = (struct vm_register *)data;
537 		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
538 		break;
539 	}
540 	case VM_GET_REGISTER_SET: {
541 		struct vm_register_set *vmregset;
542 		uint64_t *regvals;
543 		int *regnums;
544 
545 		vmregset = (struct vm_register_set *)data;
546 		if (vmregset->count > VM_REG_LAST) {
547 			error = EINVAL;
548 			break;
549 		}
550 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
551 		    M_WAITOK);
552 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
553 		    M_WAITOK);
554 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
555 		    vmregset->count);
556 		if (error == 0)
557 			error = vm_get_register_set(vcpu,
558 			    vmregset->count, regnums, regvals);
559 		if (error == 0)
560 			error = copyout(regvals, vmregset->regvals,
561 			    sizeof(regvals[0]) * vmregset->count);
562 		free(regvals, M_VMMDEV);
563 		free(regnums, M_VMMDEV);
564 		break;
565 	}
566 	case VM_SET_REGISTER_SET: {
567 		struct vm_register_set *vmregset;
568 		uint64_t *regvals;
569 		int *regnums;
570 
571 		vmregset = (struct vm_register_set *)data;
572 		if (vmregset->count > VM_REG_LAST) {
573 			error = EINVAL;
574 			break;
575 		}
576 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
577 		    M_WAITOK);
578 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
579 		    M_WAITOK);
580 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
581 		    vmregset->count);
582 		if (error == 0)
583 			error = copyin(vmregset->regvals, regvals,
584 			    sizeof(regvals[0]) * vmregset->count);
585 		if (error == 0)
586 			error = vm_set_register_set(vcpu,
587 			    vmregset->count, regnums, regvals);
588 		free(regvals, M_VMMDEV);
589 		free(regnums, M_VMMDEV);
590 		break;
591 	}
592 	case VM_GET_CAPABILITY: {
593 		struct vm_capability *vmcap;
594 
595 		vmcap = (struct vm_capability *)data;
596 		error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval);
597 		break;
598 	}
599 	case VM_SET_CAPABILITY: {
600 		struct vm_capability *vmcap;
601 
602 		vmcap = (struct vm_capability *)data;
603 		error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval);
604 		break;
605 	}
606 	case VM_ACTIVATE_CPU:
607 		error = vm_activate_cpu(vcpu);
608 		break;
609 	case VM_GET_CPUS: {
610 		struct vm_cpuset *vm_cpuset;
611 		cpuset_t *cpuset;
612 		int size;
613 
614 		error = 0;
615 		vm_cpuset = (struct vm_cpuset *)data;
616 		size = vm_cpuset->cpusetsize;
617 		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
618 			error = ERANGE;
619 			break;
620 		}
621 		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
622 		    M_WAITOK | M_ZERO);
623 		if (vm_cpuset->which == VM_ACTIVE_CPUS)
624 			*cpuset = vm_active_cpus(sc->vm);
625 		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
626 			*cpuset = vm_suspended_cpus(sc->vm);
627 		else if (vm_cpuset->which == VM_DEBUG_CPUS)
628 			*cpuset = vm_debug_cpus(sc->vm);
629 		else
630 			error = EINVAL;
631 		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
632 			error = ERANGE;
633 		if (error == 0)
634 			error = copyout(cpuset, vm_cpuset->cpus, size);
635 		free(cpuset, M_TEMP);
636 		break;
637 	}
638 	case VM_SUSPEND_CPU:
639 		error = vm_suspend_cpu(sc->vm, vcpu);
640 		break;
641 	case VM_RESUME_CPU:
642 		error = vm_resume_cpu(sc->vm, vcpu);
643 		break;
644 	case VM_SET_TOPOLOGY: {
645 		struct vm_cpu_topology *topology;
646 
647 		topology = (struct vm_cpu_topology *)data;
648 		error = vm_set_topology(sc->vm, topology->sockets,
649 		    topology->cores, topology->threads, topology->maxcpus);
650 		break;
651 	}
652 	case VM_GET_TOPOLOGY: {
653 		struct vm_cpu_topology *topology;
654 
655 		topology = (struct vm_cpu_topology *)data;
656 		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
657 		    &topology->threads, &topology->maxcpus);
658 		error = 0;
659 		break;
660 	}
661 	default:
662 		error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag,
663 		    td);
664 		break;
665 	}
666 
667 	if ((ioctl->flags &
668 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
669 		vm_unlock_memsegs(sc->vm);
670 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0)
671 		vcpu_unlock_all(sc);
672 	else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0)
673 		vcpu_unlock_one(vcpu);
674 
675 	/*
676 	 * Make sure that no handler returns a kernel-internal
677 	 * error value to userspace.
678 	 */
679 	KASSERT(error == ERESTART || error >= 0,
680 	    ("vmmdev_ioctl: invalid error return %d", error));
681 	return (error);
682 
683 lockfail:
684 	if ((ioctl->flags &
685 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
686 		vm_unlock_memsegs(sc->vm);
687 	return (error);
688 }
689 
690 static int
691 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
692     struct vm_object **objp, int nprot)
693 {
694 	struct vmmdev_softc *sc;
695 	vm_paddr_t gpa;
696 	size_t len;
697 	vm_ooffset_t segoff, first, last;
698 	int error, found, segid;
699 	bool sysmem;
700 
701 	first = *offset;
702 	last = first + mapsize;
703 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
704 		return (EINVAL);
705 
706 	sc = vmmdev_lookup2(cdev);
707 	if (sc == NULL) {
708 		/* virtual machine is in the process of being created */
709 		return (EINVAL);
710 	}
711 
712 	/*
713 	 * Get a read lock on the guest memory map.
714 	 */
715 	vm_slock_memsegs(sc->vm);
716 
717 	gpa = 0;
718 	found = 0;
719 	while (!found) {
720 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
721 		    NULL, NULL);
722 		if (error)
723 			break;
724 
725 		if (first >= gpa && last <= gpa + len)
726 			found = 1;
727 		else
728 			gpa += len;
729 	}
730 
731 	if (found) {
732 		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
733 		KASSERT(error == 0 && *objp != NULL,
734 		    ("%s: invalid memory segment %d", __func__, segid));
735 		if (sysmem) {
736 			vm_object_reference(*objp);
737 			*offset = segoff + (first - gpa);
738 		} else {
739 			error = EINVAL;
740 		}
741 	}
742 	vm_unlock_memsegs(sc->vm);
743 	return (error);
744 }
745 
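/*
 * Usage sketch (illustrative, not part of the original source): with
 * d_mmap_single backed by the function above, the VM owner can map guest
 * system memory by mmap()ing the VM cdev at a guest-physical offset:
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    vmfd, gpa);
 *
 * PROT_EXEC is rejected, and [gpa, gpa + len) must lie entirely within a
 * single sysmem mapping; "vmfd", "len" and "gpa" are assumed variables.
 */
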
746 static void
747 vmmdev_destroy(struct vmmdev_softc *sc)
748 {
749 	struct devmem_softc *dsc;
750 	int error __diagused;
751 
752 	KASSERT(sc->cdev == NULL, ("%s: cdev not free", __func__));
753 
754 	/*
755 	 * Destroy all cdevs:
756 	 *
757 	 * - any new operations on the 'cdev' will return an error (ENXIO).
758 	 *
759 	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
760 	 */
761 	SLIST_FOREACH(dsc, &sc->devmem, link) {
762 		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
763 		devmem_destroy(dsc);
764 	}
765 
766 	vm_disable_vcpu_creation(sc->vm);
767 	error = vcpu_lock_all(sc);
768 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
769 	vm_unlock_vcpus(sc->vm);
770 
771 	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
772 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
773 		SLIST_REMOVE_HEAD(&sc->devmem, link);
774 		free(dsc->name, M_VMMDEV);
775 		free(dsc, M_VMMDEV);
776 	}
777 
778 	if (sc->vm != NULL)
779 		vm_destroy(sc->vm);
780 
781 	if (sc->ucred != NULL)
782 		crfree(sc->ucred);
783 
784 	sx_xlock(&vmmdev_mtx);
785 	SLIST_REMOVE(&head, sc, vmmdev_softc, link);
786 	sx_xunlock(&vmmdev_mtx);
787 	free(sc, M_VMMDEV);
788 }
789 
790 static int
791 vmmdev_lookup_and_destroy(const char *name, struct ucred *cred)
792 {
793 	struct cdev *cdev;
794 	struct vmmdev_softc *sc;
795 
796 	sx_xlock(&vmmdev_mtx);
797 	sc = vmmdev_lookup(name, cred);
798 	if (sc == NULL || sc->cdev == NULL) {
799 		sx_xunlock(&vmmdev_mtx);
800 		return (EINVAL);
801 	}
802 
803 	/*
804 	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
805 	 * is scheduled for destruction.
806 	 */
807 	cdev = sc->cdev;
808 	sc->cdev = NULL;
809 	sx_xunlock(&vmmdev_mtx);
810 
811 	destroy_dev(cdev);
812 	vmmdev_destroy(sc);
813 
814 	return (0);
815 }
816 
817 static int
818 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
819 {
820 	char *buf;
821 	int error, buflen;
822 
823 	error = vmm_priv_check(req->td->td_ucred);
824 	if (error)
825 		return (error);
826 
827 	buflen = VM_MAX_NAMELEN + 1;
828 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
829 	strlcpy(buf, "beavis", buflen);
830 	error = sysctl_handle_string(oidp, buf, buflen, req);
831 	if (error == 0 && req->newptr != NULL)
832 		error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred);
833 	free(buf, M_VMMDEV);
834 	return (error);
835 }
836 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
837     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
838     NULL, 0, sysctl_vmm_destroy, "A",
839     NULL);
840 
841 static struct cdevsw vmmdevsw = {
842 	.d_name		= "vmmdev",
843 	.d_version	= D_VERSION,
844 	.d_open		= vmmdev_open,
845 	.d_ioctl	= vmmdev_ioctl,
846 	.d_mmap_single	= vmmdev_mmap_single,
847 	.d_read		= vmmdev_rw,
848 	.d_write	= vmmdev_rw,
849 };
850 
851 static struct vmmdev_softc *
852 vmmdev_alloc(struct vm *vm, struct ucred *cred)
853 {
854 	struct vmmdev_softc *sc;
855 
856 	sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO);
857 	SLIST_INIT(&sc->devmem);
858 	sc->vm = vm;
859 	sc->ucred = crhold(cred);
860 	return (sc);
861 }
862 
863 static int
864 vmmdev_create(const char *name, struct ucred *cred)
865 {
866 	struct make_dev_args mda;
867 	struct cdev *cdev;
868 	struct vmmdev_softc *sc;
869 	struct vm *vm;
870 	int error;
871 
872 	sx_xlock(&vmmdev_mtx);
873 	sc = vmmdev_lookup(name, cred);
874 	if (sc != NULL) {
875 		sx_xunlock(&vmmdev_mtx);
876 		return (EEXIST);
877 	}
878 
879 	error = vm_create(name, &vm);
880 	if (error != 0) {
881 		sx_xunlock(&vmmdev_mtx);
882 		return (error);
883 	}
884 	sc = vmmdev_alloc(vm, cred);
885 	SLIST_INSERT_HEAD(&head, sc, link);
886 
887 	make_dev_args_init(&mda);
888 	mda.mda_devsw = &vmmdevsw;
889 	mda.mda_cr = sc->ucred;
890 	mda.mda_uid = UID_ROOT;
891 	mda.mda_gid = GID_WHEEL;
892 	mda.mda_mode = 0600;
893 	mda.mda_si_drv1 = sc;
894 	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
895 	error = make_dev_s(&mda, &cdev, "vmm/%s", name);
896 	if (error != 0) {
897 		sx_xunlock(&vmmdev_mtx);
898 		vmmdev_destroy(sc);
899 		return (error);
900 	}
901 	sc->cdev = cdev;
902 	sx_xunlock(&vmmdev_mtx);
903 	return (0);
904 }
905 
906 static int
907 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
908 {
909 	char *buf;
910 	int error, buflen;
911 
912 	error = vmm_priv_check(req->td->td_ucred);
913 	if (error != 0)
914 		return (error);
915 
916 	buflen = VM_MAX_NAMELEN + 1;
917 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
918 	strlcpy(buf, "beavis", buflen);
919 	error = sysctl_handle_string(oidp, buf, buflen, req);
920 	if (error == 0 && req->newptr != NULL)
921 		error = vmmdev_create(buf, req->td->td_ucred);
922 	free(buf, M_VMMDEV);
923 	return (error);
924 }
925 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
926     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
927     NULL, 0, sysctl_vmm_create, "A",
928     NULL);
929 
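/*
 * Usage sketch (illustrative, not part of the original source): VMs are
 * created and destroyed through the sysctls defined above, e.g.:
 *
 *	# sysctl hw.vmm.create=testvm	(creates /dev/vmm/testvm)
 *	# sysctl hw.vmm.destroy=testvm
 *
 * The VM name "testvm" is an arbitrary example; bhyve(8) and libvmmapi
 * normally drive these sysctls on the user's behalf.
 */
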
930 void
931 vmmdev_init(void)
932 {
933 	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
934 	    "Allow use of vmm in a jail.");
935 }
936 
937 int
938 vmmdev_cleanup(void)
939 {
940 	int error;
941 
942 	if (SLIST_EMPTY(&head))
943 		error = 0;
944 	else
945 		error = EBUSY;
946 
947 	return (error);
948 }
949 
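/*
 * devmem segments (memsegs created through VM_ALLOC_MEMSEG with a name)
 * are exposed as separate cdevs named /dev/vmm.io/<vmname>.<segname> by
 * devmem_create_cdev() below, and can only be accessed by mmap()ing that
 * cdev; offsets are relative to the start of the segment.
 */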
950 static int
951 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
952     struct vm_object **objp, int nprot)
953 {
954 	struct devmem_softc *dsc;
955 	vm_ooffset_t first, last;
956 	size_t seglen;
957 	int error;
958 	bool sysmem;
959 
960 	dsc = cdev->si_drv1;
961 	if (dsc == NULL) {
962 		/* 'cdev' has been created but is not ready for use */
963 		return (ENXIO);
964 	}
965 
966 	first = *offset;
967 	last = *offset + len;
968 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
969 		return (EINVAL);
970 
971 	vm_slock_memsegs(dsc->sc->vm);
972 
973 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
974 	KASSERT(error == 0 && !sysmem && *objp != NULL,
975 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
976 
977 	if (seglen >= last)
978 		vm_object_reference(*objp);
979 	else
980 		error = EINVAL;
981 
982 	vm_unlock_memsegs(dsc->sc->vm);
983 	return (error);
984 }
985 
986 static struct cdevsw devmemsw = {
987 	.d_name		= "devmem",
988 	.d_version	= D_VERSION,
989 	.d_mmap_single	= devmem_mmap_single,
990 };
991 
992 static int
993 devmem_create_cdev(struct vmmdev_softc *sc, int segid, char *devname)
994 {
995 	struct make_dev_args mda;
996 	struct devmem_softc *dsc;
997 	int error;
998 
999 	sx_xlock(&vmmdev_mtx);
1000 
1001 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1002 	dsc->segid = segid;
1003 	dsc->name = devname;
1004 	dsc->sc = sc;
1005 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
1006 
1007 	make_dev_args_init(&mda);
1008 	mda.mda_devsw = &devmemsw;
1009 	mda.mda_cr = sc->ucred;
1010 	mda.mda_uid = UID_ROOT;
1011 	mda.mda_gid = GID_WHEEL;
1012 	mda.mda_mode = 0600;
1013 	mda.mda_si_drv1 = dsc;
1014 	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1015 	error = make_dev_s(&mda, &dsc->cdev, "vmm.io/%s.%s", vm_name(sc->vm),
1016 	    devname);
1017 	if (error != 0) {
1018 		SLIST_REMOVE(&sc->devmem, dsc, devmem_softc, link);
1019 		free(dsc->name, M_VMMDEV);
1020 		free(dsc, M_VMMDEV);
1021 	}
1022 
1023 	sx_xunlock(&vmmdev_mtx);
1024 
1025 	return (error);
1026 }
1027 
1028 static void
1029 devmem_destroy(void *arg)
1030 {
1031 	struct devmem_softc *dsc = arg;
1032 
1033 	destroy_dev(dsc->cdev);
1034 	dsc->cdev = NULL;
1035 	dsc->sc = NULL;
1036 }
1037