xref: /freebsd/sys/dev/vmm/vmm_dev.c (revision 5036d9652a5701d00e9e40ea942c278e9f77d33d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
6  * All rights reserved.
7  */
8 
9 #include <sys/param.h>
10 #include <sys/conf.h>
11 #include <sys/fcntl.h>
12 #include <sys/ioccom.h>
13 #include <sys/jail.h>
14 #include <sys/kernel.h>
15 #include <sys/malloc.h>
16 #include <sys/mman.h>
17 #include <sys/proc.h>
18 #include <sys/queue.h>
19 #include <sys/sx.h>
20 #include <sys/sysctl.h>
21 #include <sys/ucred.h>
22 #include <sys/uio.h>
23 
24 #include <machine/vmm.h>
25 
26 #include <vm/vm.h>
27 #include <vm/vm_object.h>
28 
29 #include <dev/vmm/vmm_dev.h>
30 #include <dev/vmm/vmm_stat.h>
31 
32 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
33 struct vm_memseg_12 {
34 	int		segid;
35 	size_t		len;
36 	char		name[64];
37 };
38 _Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI");
39 
40 #define	VM_ALLOC_MEMSEG_12	\
41 	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12)
42 #define	VM_GET_MEMSEG_12	\
43 	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12)
44 #endif
45 
46 struct devmem_softc {
47 	int	segid;
48 	char	*name;
49 	struct cdev *cdev;
50 	struct vmmdev_softc *sc;
51 	SLIST_ENTRY(devmem_softc) link;
52 };
53 
54 struct vmmdev_softc {
55 	struct vm	*vm;		/* vm instance cookie */
56 	struct cdev	*cdev;
57 	struct ucred	*ucred;
58 	SLIST_ENTRY(vmmdev_softc) link;
59 	SLIST_HEAD(, devmem_softc) devmem;
60 	int		flags;
61 };
62 
63 static SLIST_HEAD(, vmmdev_softc) head;
64 
65 static unsigned pr_allow_flag;
66 static struct sx vmmdev_mtx;
67 SX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex");
68 
69 static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
70 
71 SYSCTL_DECL(_hw_vmm);
72 
73 static void devmem_destroy(void *arg);
74 static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem);
75 
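/*
 * vmm may be used from within a jail only if the jail's allow.vmm
 * permission is set (see the prison_add_allow() call in vmmdev_init()).
 */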
76 static int
77 vmm_priv_check(struct ucred *ucred)
78 {
79 	if (jailed(ucred) &&
80 	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
81 		return (EPERM);
82 
83 	return (0);
84 }
85 
86 static int
87 vcpu_lock_one(struct vcpu *vcpu)
88 {
89 	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
90 }
91 
92 static void
93 vcpu_unlock_one(struct vcpu *vcpu)
94 {
95 	enum vcpu_state state;
96 
97 	state = vcpu_get_state(vcpu, NULL);
98 	if (state != VCPU_FROZEN) {
99 		panic("vcpu %s(%d) has invalid state %d",
100 		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
101 	}
102 
103 	vcpu_set_state(vcpu, VCPU_IDLE, false);
104 }
105 
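/*
 * Freeze all vCPUs that have been created so far.  The shared vCPU list
 * lock is held on success; on failure the vCPUs frozen up to that point
 * are thawed again and the lock is dropped.
 */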
106 static int
107 vcpu_lock_all(struct vmmdev_softc *sc)
108 {
109 	struct vcpu *vcpu;
110 	int error;
111 	uint16_t i, j, maxcpus;
112 
113 	error = 0;
114 	vm_slock_vcpus(sc->vm);
115 	maxcpus = vm_get_maxcpus(sc->vm);
116 	for (i = 0; i < maxcpus; i++) {
117 		vcpu = vm_vcpu(sc->vm, i);
118 		if (vcpu == NULL)
119 			continue;
120 		error = vcpu_lock_one(vcpu);
121 		if (error)
122 			break;
123 	}
124 
125 	if (error) {
126 		for (j = 0; j < i; j++) {
127 			vcpu = vm_vcpu(sc->vm, j);
128 			if (vcpu == NULL)
129 				continue;
130 			vcpu_unlock_one(vcpu);
131 		}
132 		vm_unlock_vcpus(sc->vm);
133 	}
134 
135 	return (error);
136 }
137 
138 static void
139 vcpu_unlock_all(struct vmmdev_softc *sc)
140 {
141 	struct vcpu *vcpu;
142 	uint16_t i, maxcpus;
143 
144 	maxcpus = vm_get_maxcpus(sc->vm);
145 	for (i = 0; i < maxcpus; i++) {
146 		vcpu = vm_vcpu(sc->vm, i);
147 		if (vcpu == NULL)
148 			continue;
149 		vcpu_unlock_one(vcpu);
150 	}
151 	vm_unlock_vcpus(sc->vm);
152 }
153 
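/*
 * Look up a VM by name.  The caller must hold vmmdev_mtx exclusively,
 * and the softc is only returned if cr_cansee() allows the caller's
 * credentials to see those of the VM's creator.
 */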
154 static struct vmmdev_softc *
155 vmmdev_lookup(const char *name, struct ucred *cred)
156 {
157 	struct vmmdev_softc *sc;
158 
159 	sx_assert(&vmmdev_mtx, SA_XLOCKED);
160 
161 	SLIST_FOREACH(sc, &head, link) {
162 		if (strcmp(name, vm_name(sc->vm)) == 0)
163 			break;
164 	}
165 
166 	if (sc == NULL)
167 		return (NULL);
168 
169 	if (cr_cansee(cred, sc->ucred))
170 		return (NULL);
171 
172 	return (sc);
173 }
174 
175 static struct vmmdev_softc *
176 vmmdev_lookup2(struct cdev *cdev)
177 {
178 	return (cdev->si_drv1);
179 }
180 
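/*
 * read(2)/write(2) handler for /dev/vmm/<vmname>: copies data between
 * the caller and guest physical memory one page at a time.  Reads of
 * unbacked addresses below the top of system memory return zeros so
 * that dd(1) can step over holes in the guest address space.
 */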
181 static int
182 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
183 {
184 	int error, off, c, prot;
185 	vm_paddr_t gpa, maxaddr;
186 	void *hpa, *cookie;
187 	struct vmmdev_softc *sc;
188 
189 	sc = vmmdev_lookup2(cdev);
190 	if (sc == NULL)
191 		return (ENXIO);
192 
193 	/*
194 	 * Get a read lock on the guest memory map.
195 	 */
196 	vm_slock_memsegs(sc->vm);
197 
	error = 0;
198 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
199 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
200 	while (uio->uio_resid > 0 && error == 0) {
201 		gpa = uio->uio_offset;
202 		off = gpa & PAGE_MASK;
203 		c = min(uio->uio_resid, PAGE_SIZE - off);
204 
205 		/*
206 		 * The VM has a hole in its physical memory map. If we want to
207 		 * use 'dd' to inspect memory beyond the hole we need to
208 		 * provide bogus data for memory that lies in the hole.
209 		 *
210 		 * Since this device does not support lseek(2), dd(1) will
211 		 * read(2) blocks of data to simulate the lseek(2).
212 		 */
213 		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
214 		if (hpa == NULL) {
215 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
216 				error = uiomove(__DECONST(void *, zero_region),
217 				    c, uio);
218 			else
219 				error = EFAULT;
220 		} else {
221 			error = uiomove(hpa, c, uio);
222 			vm_gpa_release(cookie);
223 		}
224 	}
225 	vm_unlock_memsegs(sc->vm);
226 	return (error);
227 }
228 
229 CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);
230 
231 static int
232 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
233 {
234 	struct devmem_softc *dsc;
235 	int error;
236 	bool sysmem;
237 
238 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
239 	if (error || mseg->len == 0)
240 		return (error);
241 
242 	if (!sysmem) {
243 		SLIST_FOREACH(dsc, &sc->devmem, link) {
244 			if (dsc->segid == mseg->segid)
245 				break;
246 		}
247 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
248 		    __func__, mseg->segid));
249 		error = copystr(dsc->name, mseg->name, len, NULL);
250 	} else {
251 		bzero(mseg->name, len);
252 	}
253 
254 	return (error);
255 }
256 
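/*
 * Allocate a memory segment.  A named segment is device memory and gets
 * its own cdev under /dev/vmm.io/; an unnamed segment is guest system
 * memory.
 */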
257 static int
258 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
259 {
260 	char *name;
261 	int error;
262 	bool sysmem;
263 
264 	error = 0;
265 	name = NULL;
266 	sysmem = true;
267 
268 	/*
269 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
270 	 * be stripped off when devfs processes the full string.
271 	 */
272 	if (VM_MEMSEG_NAME(mseg)) {
273 		sysmem = false;
274 		name = malloc(len, M_VMMDEV, M_WAITOK);
275 		error = copystr(mseg->name, name, len, NULL);
276 		if (error)
277 			goto done;
278 	}
279 
280 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
281 	if (error)
282 		goto done;
283 
284 	if (VM_MEMSEG_NAME(mseg)) {
285 		error = devmem_create_cdev(sc, mseg->segid, name);
286 		if (error)
287 			vm_free_memseg(sc->vm, mseg->segid);
288 		else
289 			name = NULL;	/* freed when 'cdev' is destroyed */
290 	}
291 done:
292 	free(name, M_VMMDEV);
293 	return (error);
294 }
295 
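/*
 * Helpers for the VM_GET_REGISTER_SET and VM_SET_REGISTER_SET ioctls:
 * fetch or store a batch of registers, stopping at the first failure.
 */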
296 static int
297 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
298     uint64_t *regval)
299 {
300 	int error, i;
301 
302 	error = 0;
303 	for (i = 0; i < count; i++) {
304 		error = vm_get_register(vcpu, regnum[i], &regval[i]);
305 		if (error)
306 			break;
307 	}
308 	return (error);
309 }
310 
311 static int
312 vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
313     uint64_t *regval)
314 {
315 	int error, i;
316 
317 	error = 0;
318 	for (i = 0; i < count; i++) {
319 		error = vm_set_register(vcpu, regnum[i], regval[i]);
320 		if (error)
321 			break;
322 	}
323 	return (error);
324 }
325 
326 static int
327 vmmdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
328 {
329 	int error;
330 
331 	/*
332 	 * A jail without vmm access shouldn't be able to access vmm device
333 	 * files at all, but check here just to be thorough.
334 	 */
335 	error = vmm_priv_check(td->td_ucred);
336 	if (error != 0)
337 		return (error);
338 
339 	return (0);
340 }
341 
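/*
 * Table of machine-independent ioctls and the locking each one requires.
 * vmmdev_ioctl() searches this table, and then the machine-dependent
 * vmmdev_machdep_ioctls table, to decide which locks to acquire before
 * dispatching a request.
 */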
342 static const struct vmmdev_ioctl vmmdev_ioctls[] = {
343 	VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
344 	VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
345 	VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
346 	VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
347 	VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
348 	VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
349 	VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU),
350 	VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU),
351 	VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
352 
353 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
354 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12,
355 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
356 #endif
357 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
358 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
359 	VMMDEV_IOCTL(VM_MMAP_MEMSEG,
360 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
361 	VMMDEV_IOCTL(VM_MUNMAP_MEMSEG,
362 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
363 	VMMDEV_IOCTL(VM_REINIT,
364 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
365 
366 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
367 	VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
368 #endif
369 	VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
370 	VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),
371 
372 	VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
373 	VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
374 
375 	VMMDEV_IOCTL(VM_SUSPEND, 0),
376 	VMMDEV_IOCTL(VM_GET_CPUS, 0),
377 	VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0),
378 	VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0),
379 };
380 
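/*
 * Main ioctl handler for /dev/vmm/<vmname>.  The memory segment lock is
 * taken before any vCPU locks, as dictated by the flags in the ioctl
 * tables, and both are dropped once the command has been handled.
 */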
381 static int
382 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
383     struct thread *td)
384 {
385 	struct vmmdev_softc *sc;
386 	struct vcpu *vcpu;
387 	const struct vmmdev_ioctl *ioctl;
388 	int error, vcpuid;
389 
390 	sc = vmmdev_lookup2(cdev);
391 	if (sc == NULL)
392 		return (ENXIO);
393 
394 	ioctl = NULL;
395 	for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) {
396 		if (vmmdev_ioctls[i].cmd == cmd) {
397 			ioctl = &vmmdev_ioctls[i];
398 			break;
399 		}
400 	}
401 	if (ioctl == NULL) {
402 		for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) {
403 			if (vmmdev_machdep_ioctls[i].cmd == cmd) {
404 				ioctl = &vmmdev_machdep_ioctls[i];
405 				break;
406 			}
407 		}
408 	}
409 	if (ioctl == NULL)
410 		return (ENOTTY);
411 
412 	if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0)
413 		vm_xlock_memsegs(sc->vm);
414 	else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0)
415 		vm_slock_memsegs(sc->vm);
416 
417 	vcpu = NULL;
418 	vcpuid = -1;
419 	if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU |
420 	    VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) {
421 		vcpuid = *(int *)data;
422 		if (vcpuid == -1) {
423 			if ((ioctl->flags &
424 			    VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) {
425 				error = EINVAL;
426 				goto lockfail;
427 			}
428 		} else {
429 			vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
430 			if (vcpu == NULL) {
431 				error = EINVAL;
432 				goto lockfail;
433 			}
434 			if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) {
435 				error = vcpu_lock_one(vcpu);
436 				if (error)
437 					goto lockfail;
438 			}
439 		}
440 	}
441 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) {
442 		error = vcpu_lock_all(sc);
443 		if (error)
444 			goto lockfail;
445 	}
446 
447 	switch (cmd) {
448 	case VM_SUSPEND: {
449 		struct vm_suspend *vmsuspend;
450 
451 		vmsuspend = (struct vm_suspend *)data;
452 		error = vm_suspend(sc->vm, vmsuspend->how);
453 		break;
454 	}
455 	case VM_REINIT:
456 		error = vm_reinit(sc->vm);
457 		break;
458 	case VM_STAT_DESC: {
459 		struct vm_stat_desc *statdesc;
460 
461 		statdesc = (struct vm_stat_desc *)data;
462 		error = vmm_stat_desc_copy(statdesc->index, statdesc->desc,
463 		    sizeof(statdesc->desc));
464 		break;
465 	}
466 	case VM_STATS: {
467 		struct vm_stats *vmstats;
468 
469 		vmstats = (struct vm_stats *)data;
470 		getmicrotime(&vmstats->tv);
471 		error = vmm_stat_copy(vcpu, vmstats->index,
472 		    nitems(vmstats->statbuf), &vmstats->num_entries,
473 		    vmstats->statbuf);
474 		break;
475 	}
476 	case VM_MMAP_GETNEXT: {
477 		struct vm_memmap *mm;
478 
479 		mm = (struct vm_memmap *)data;
480 		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
481 		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
482 		break;
483 	}
484 	case VM_MMAP_MEMSEG: {
485 		struct vm_memmap *mm;
486 
487 		mm = (struct vm_memmap *)data;
488 		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
489 		    mm->len, mm->prot, mm->flags);
490 		break;
491 	}
492 	case VM_MUNMAP_MEMSEG: {
493 		struct vm_munmap *mu;
494 
495 		mu = (struct vm_munmap *)data;
496 		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
497 		break;
498 	}
499 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
500 	case VM_ALLOC_MEMSEG_12:
501 		error = alloc_memseg(sc, (struct vm_memseg *)data,
502 		    sizeof(((struct vm_memseg_12 *)0)->name));
503 		break;
504 	case VM_GET_MEMSEG_12:
505 		error = get_memseg(sc, (struct vm_memseg *)data,
506 		    sizeof(((struct vm_memseg_12 *)0)->name));
507 		break;
508 #endif
509 	case VM_ALLOC_MEMSEG:
510 		error = alloc_memseg(sc, (struct vm_memseg *)data,
511 		    sizeof(((struct vm_memseg *)0)->name));
512 		break;
513 	case VM_GET_MEMSEG:
514 		error = get_memseg(sc, (struct vm_memseg *)data,
515 		    sizeof(((struct vm_memseg *)0)->name));
516 		break;
517 	case VM_GET_REGISTER: {
518 		struct vm_register *vmreg;
519 
520 		vmreg = (struct vm_register *)data;
521 		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
522 		break;
523 	}
524 	case VM_SET_REGISTER: {
525 		struct vm_register *vmreg;
526 
527 		vmreg = (struct vm_register *)data;
528 		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
529 		break;
530 	}
531 	case VM_GET_REGISTER_SET: {
532 		struct vm_register_set *vmregset;
533 		uint64_t *regvals;
534 		int *regnums;
535 
536 		vmregset = (struct vm_register_set *)data;
537 		if (vmregset->count > VM_REG_LAST) {
538 			error = EINVAL;
539 			break;
540 		}
541 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
542 		    M_WAITOK);
543 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
544 		    M_WAITOK);
545 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
546 		    vmregset->count);
547 		if (error == 0)
548 			error = vm_get_register_set(vcpu,
549 			    vmregset->count, regnums, regvals);
550 		if (error == 0)
551 			error = copyout(regvals, vmregset->regvals,
552 			    sizeof(regvals[0]) * vmregset->count);
553 		free(regvals, M_VMMDEV);
554 		free(regnums, M_VMMDEV);
555 		break;
556 	}
557 	case VM_SET_REGISTER_SET: {
558 		struct vm_register_set *vmregset;
559 		uint64_t *regvals;
560 		int *regnums;
561 
562 		vmregset = (struct vm_register_set *)data;
563 		if (vmregset->count > VM_REG_LAST) {
564 			error = EINVAL;
565 			break;
566 		}
567 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
568 		    M_WAITOK);
569 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
570 		    M_WAITOK);
571 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
572 		    vmregset->count);
573 		if (error == 0)
574 			error = copyin(vmregset->regvals, regvals,
575 			    sizeof(regvals[0]) * vmregset->count);
576 		if (error == 0)
577 			error = vm_set_register_set(vcpu,
578 			    vmregset->count, regnums, regvals);
579 		free(regvals, M_VMMDEV);
580 		free(regnums, M_VMMDEV);
581 		break;
582 	}
583 	case VM_GET_CAPABILITY: {
584 		struct vm_capability *vmcap;
585 
586 		vmcap = (struct vm_capability *)data;
587 		error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval);
588 		break;
589 	}
590 	case VM_SET_CAPABILITY: {
591 		struct vm_capability *vmcap;
592 
593 		vmcap = (struct vm_capability *)data;
594 		error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval);
595 		break;
596 	}
597 	case VM_ACTIVATE_CPU:
598 		error = vm_activate_cpu(vcpu);
599 		break;
600 	case VM_GET_CPUS: {
601 		struct vm_cpuset *vm_cpuset;
602 		cpuset_t *cpuset;
603 		int size;
604 
605 		error = 0;
606 		vm_cpuset = (struct vm_cpuset *)data;
607 		size = vm_cpuset->cpusetsize;
608 		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
609 			error = ERANGE;
610 			break;
611 		}
612 		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
613 		    M_WAITOK | M_ZERO);
614 		if (vm_cpuset->which == VM_ACTIVE_CPUS)
615 			*cpuset = vm_active_cpus(sc->vm);
616 		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
617 			*cpuset = vm_suspended_cpus(sc->vm);
618 		else if (vm_cpuset->which == VM_DEBUG_CPUS)
619 			*cpuset = vm_debug_cpus(sc->vm);
620 		else
621 			error = EINVAL;
622 		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
623 			error = ERANGE;
624 		if (error == 0)
625 			error = copyout(cpuset, vm_cpuset->cpus, size);
626 		free(cpuset, M_TEMP);
627 		break;
628 	}
629 	case VM_SUSPEND_CPU:
630 		error = vm_suspend_cpu(sc->vm, vcpu);
631 		break;
632 	case VM_RESUME_CPU:
633 		error = vm_resume_cpu(sc->vm, vcpu);
634 		break;
635 	case VM_SET_TOPOLOGY: {
636 		struct vm_cpu_topology *topology;
637 
638 		topology = (struct vm_cpu_topology *)data;
639 		error = vm_set_topology(sc->vm, topology->sockets,
640 		    topology->cores, topology->threads, topology->maxcpus);
641 		break;
642 	}
643 	case VM_GET_TOPOLOGY: {
644 		struct vm_cpu_topology *topology;
645 
646 		topology = (struct vm_cpu_topology *)data;
647 		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
648 		    &topology->threads, &topology->maxcpus);
649 		error = 0;
650 		break;
651 	}
652 	default:
653 		error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag,
654 		    td);
655 		break;
656 	}
657 
658 	if ((ioctl->flags &
659 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
660 		vm_unlock_memsegs(sc->vm);
661 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0)
662 		vcpu_unlock_all(sc);
663 	else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0)
664 		vcpu_unlock_one(vcpu);
665 
666 	/*
667 	 * Make sure that no handler returns a kernel-internal
668 	 * error value to userspace.
669 	 */
670 	KASSERT(error == ERESTART || error >= 0,
671 	    ("vmmdev_ioctl: invalid error return %d", error));
672 	return (error);
673 
674 lockfail:
675 	if ((ioctl->flags &
676 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
677 		vm_unlock_memsegs(sc->vm);
678 	return (error);
679 }
680 
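/*
 * mmap(2) handler for /dev/vmm/<vmname>: translates a guest-physical
 * offset into the VM object backing the system memory segment mapped at
 * that address.  Device memory segments must instead be mapped through
 * their own devmem cdev (see devmem_mmap_single()).
 */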
681 static int
682 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
683     struct vm_object **objp, int nprot)
684 {
685 	struct vmmdev_softc *sc;
686 	vm_paddr_t gpa;
687 	size_t len;
688 	vm_ooffset_t segoff, first, last;
689 	int error, found, segid;
690 	bool sysmem;
691 
692 	first = *offset;
693 	last = first + mapsize;
694 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
695 		return (EINVAL);
696 
697 	sc = vmmdev_lookup2(cdev);
698 	if (sc == NULL) {
699 		/* virtual machine is in the process of being created */
700 		return (EINVAL);
701 	}
702 
703 	/*
704 	 * Get a read lock on the guest memory map.
705 	 */
706 	vm_slock_memsegs(sc->vm);
707 
708 	gpa = 0;
709 	found = 0;
710 	while (!found) {
711 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
712 		    NULL, NULL);
713 		if (error)
714 			break;
715 
716 		if (first >= gpa && last <= gpa + len)
717 			found = 1;
718 		else
719 			gpa += len;
720 	}
721 
722 	if (found) {
723 		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
724 		KASSERT(error == 0 && *objp != NULL,
725 		    ("%s: invalid memory segment %d", __func__, segid));
726 		if (sysmem) {
727 			vm_object_reference(*objp);
728 			*offset = segoff + (first - gpa);
729 		} else {
730 			error = EINVAL;
731 		}
732 	}
733 	vm_unlock_memsegs(sc->vm);
734 	return (error);
735 }
736 
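/*
 * Tear down a VM: destroy its devmem cdevs, freeze the vCPUs, free the
 * devmem bookkeeping, destroy the VM itself, and finally unlink and free
 * the softc.  The VM's own cdev must already have been destroyed.
 */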
737 static void
738 vmmdev_destroy(struct vmmdev_softc *sc)
739 {
740 	struct devmem_softc *dsc;
741 	int error __diagused;
742 
743 	KASSERT(sc->cdev == NULL, ("%s: cdev not free", __func__));
744 
745 	/*
746 	 * Destroy all cdevs:
747 	 *
748 	 * - any new operations on the 'cdev' will return an error (ENXIO).
749 	 *
750 	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
751 	 */
752 	SLIST_FOREACH(dsc, &sc->devmem, link) {
753 		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
754 		devmem_destroy(dsc);
755 	}
756 
757 	vm_disable_vcpu_creation(sc->vm);
758 	error = vcpu_lock_all(sc);
759 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
760 	vm_unlock_vcpus(sc->vm);
761 
762 	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
763 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
764 		SLIST_REMOVE_HEAD(&sc->devmem, link);
765 		free(dsc->name, M_VMMDEV);
766 		free(dsc, M_VMMDEV);
767 	}
768 
769 	if (sc->vm != NULL)
770 		vm_destroy(sc->vm);
771 
772 	if (sc->ucred != NULL)
773 		crfree(sc->ucred);
774 
775 	sx_xlock(&vmmdev_mtx);
776 	SLIST_REMOVE(&head, sc, vmmdev_softc, link);
777 	sx_xunlock(&vmmdev_mtx);
778 	free(sc, M_VMMDEV);
779 }
780 
781 static int
782 vmmdev_lookup_and_destroy(const char *name, struct ucred *cred)
783 {
784 	struct cdev *cdev;
785 	struct vmmdev_softc *sc;
786 
787 	sx_xlock(&vmmdev_mtx);
788 	sc = vmmdev_lookup(name, cred);
789 	if (sc == NULL || sc->cdev == NULL) {
790 		sx_xunlock(&vmmdev_mtx);
791 		return (EINVAL);
792 	}
793 
794 	/*
795 	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
796 	 * is scheduled for destruction.
797 	 */
798 	cdev = sc->cdev;
799 	sc->cdev = NULL;
800 	sx_xunlock(&vmmdev_mtx);
801 
802 	destroy_dev(cdev);
803 	vmmdev_destroy(sc);
804 
805 	return (0);
806 }
807 
808 static int
809 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
810 {
811 	char *buf;
812 	int error, buflen;
813 
814 	error = vmm_priv_check(req->td->td_ucred);
815 	if (error)
816 		return (error);
817 
818 	buflen = VM_MAX_NAMELEN + 1;
819 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
820 	strlcpy(buf, "beavis", buflen);
821 	error = sysctl_handle_string(oidp, buf, buflen, req);
822 	if (error == 0 && req->newptr != NULL)
823 		error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred);
824 	free(buf, M_VMMDEV);
825 	return (error);
826 }
827 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
828     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
829     NULL, 0, sysctl_vmm_destroy, "A",
830     NULL);
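/*
 * For example, "sysctl hw.vmm.destroy=<vmname>" destroys the named VM,
 * subject to the same jail check as the /dev/vmmctl interface.
 */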
831 
832 static struct cdevsw vmmdevsw = {
833 	.d_name		= "vmmdev",
834 	.d_version	= D_VERSION,
835 	.d_open		= vmmdev_open,
836 	.d_ioctl	= vmmdev_ioctl,
837 	.d_mmap_single	= vmmdev_mmap_single,
838 	.d_read		= vmmdev_rw,
839 	.d_write	= vmmdev_rw,
840 };
841 
842 static struct vmmdev_softc *
843 vmmdev_alloc(struct vm *vm, struct ucred *cred)
844 {
845 	struct vmmdev_softc *sc;
846 
847 	sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO);
848 	SLIST_INIT(&sc->devmem);
849 	sc->vm = vm;
850 	sc->ucred = crhold(cred);
851 	return (sc);
852 }
853 
854 static int
855 vmmdev_create(const char *name, struct ucred *cred)
856 {
857 	struct make_dev_args mda;
858 	struct cdev *cdev;
859 	struct vmmdev_softc *sc;
860 	struct vm *vm;
861 	int error;
862 
863 	sx_xlock(&vmmdev_mtx);
864 	sc = vmmdev_lookup(name, cred);
865 	if (sc != NULL) {
866 		sx_xunlock(&vmmdev_mtx);
867 		return (EEXIST);
868 	}
869 
870 	error = vm_create(name, &vm);
871 	if (error != 0) {
872 		sx_xunlock(&vmmdev_mtx);
873 		return (error);
874 	}
875 	sc = vmmdev_alloc(vm, cred);
876 	SLIST_INSERT_HEAD(&head, sc, link);
877 
878 	make_dev_args_init(&mda);
879 	mda.mda_devsw = &vmmdevsw;
880 	mda.mda_cr = sc->ucred;
881 	mda.mda_uid = UID_ROOT;
882 	mda.mda_gid = GID_WHEEL;
883 	mda.mda_mode = 0600;
884 	mda.mda_si_drv1 = sc;
885 	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
886 	error = make_dev_s(&mda, &cdev, "vmm/%s", name);
887 	if (error != 0) {
888 		sx_xunlock(&vmmdev_mtx);
889 		vmmdev_destroy(sc);
890 		return (error);
891 	}
892 	sc->cdev = cdev;
893 	sx_xunlock(&vmmdev_mtx);
894 	return (0);
895 }
896 
897 static int
898 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
899 {
900 	char *buf;
901 	int error, buflen;
902 
903 	error = vmm_priv_check(req->td->td_ucred);
904 	if (error != 0)
905 		return (error);
906 
907 	buflen = VM_MAX_NAMELEN + 1;
908 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
909 	strlcpy(buf, "beavis", buflen);
910 	error = sysctl_handle_string(oidp, buf, buflen, req);
911 	if (error == 0 && req->newptr != NULL)
912 		error = vmmdev_create(buf, req->td->td_ucred);
913 	free(buf, M_VMMDEV);
914 	return (error);
915 }
916 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
917     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
918     NULL, 0, sysctl_vmm_create, "A",
919     NULL);
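/*
 * For example, "sysctl hw.vmm.create=<vmname>" creates a VM whose
 * control device then appears as /dev/vmm/<vmname>.
 */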
920 
921 static int
922 vmmctl_open(struct cdev *cdev, int flags, int fmt, struct thread *td)
923 {
924 	int error;
925 
926 	error = vmm_priv_check(td->td_ucred);
927 	if (error != 0)
928 		return (error);
929 
930 	if ((flags & FWRITE) == 0)
931 		return (EPERM);
932 
933 	return (0);
934 }
935 
936 static int
937 vmmctl_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
938     struct thread *td)
939 {
940 	int error;
941 
942 	switch (cmd) {
943 	case VMMCTL_VM_CREATE: {
944 		struct vmmctl_vm_create *vmc;
945 
946 		vmc = (struct vmmctl_vm_create *)data;
947 		vmc->name[VM_MAX_NAMELEN] = '\0';
948 		for (size_t i = 0; i < nitems(vmc->reserved); i++) {
949 			if (vmc->reserved[i] != 0) {
950 				error = EINVAL;
951 				return (error);
952 			}
953 		}
954 
955 		error = vmmdev_create(vmc->name, td->td_ucred);
956 		break;
957 	}
958 	case VMMCTL_VM_DESTROY: {
959 		struct vmmctl_vm_destroy *vmd;
960 
961 		vmd = (struct vmmctl_vm_destroy *)data;
962 		vmd->name[VM_MAX_NAMELEN] = '\0';
963 		for (size_t i = 0; i < nitems(vmd->reserved); i++) {
964 			if (vmd->reserved[i] != 0) {
965 				error = EINVAL;
966 				return (error);
967 			}
968 		}
969 
970 		error = vmmdev_lookup_and_destroy(vmd->name, td->td_ucred);
971 		break;
972 	}
973 	default:
974 		error = ENOTTY;
975 		break;
976 	}
977 
978 	return (error);
979 }
980 
981 static struct cdevsw vmmctlsw = {
982 	.d_name		= "vmmctl",
983 	.d_version	= D_VERSION,
984 	.d_open		= vmmctl_open,
985 	.d_ioctl	= vmmctl_ioctl,
986 };
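/*
 * Rough sketch of how userspace drives /dev/vmmctl (illustrative only;
 * error handling omitted and the VM name chosen just for the example):
 *
 *	struct vmmctl_vm_create vmc;
 *	int fd;
 *
 *	memset(&vmc, 0, sizeof(vmc));
 *	strlcpy(vmc.name, "testvm", sizeof(vmc.name));
 *	fd = open("/dev/vmmctl", O_RDWR);
 *	ioctl(fd, VMMCTL_VM_CREATE, &vmc);
 *
 * The device must be opened for writing (vmmctl_open() requires FWRITE)
 * and every reserved field must be zero, or the ioctl fails with EINVAL.
 */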
987 
988 int
989 vmmdev_init(void)
990 {
991 	struct cdev *cdev;
992 	int error;
993 
994 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmctlsw, NULL,
995 	    UID_ROOT, GID_WHEEL, 0600, "vmmctl");
996 	if (error)
997 		return (error);
998 
999 	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
1000 	    "Allow use of vmm in a jail.");
1001 
1002 	return (0);
1003 }
1004 
1005 int
1006 vmmdev_cleanup(void)
1007 {
1008 	int error;
1009 
1010 	if (SLIST_EMPTY(&head))
1011 		error = 0;
1012 	else
1013 		error = EBUSY;
1014 
1015 	return (error);
1016 }
1017 
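/*
 * mmap(2) handler for a devmem cdev (/dev/vmm.io/<vmname>.<segname>):
 * returns the VM object backing the device memory segment, provided the
 * requested range fits within it.
 */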
1018 static int
1019 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
1020     struct vm_object **objp, int nprot)
1021 {
1022 	struct devmem_softc *dsc;
1023 	vm_ooffset_t first, last;
1024 	size_t seglen;
1025 	int error;
1026 	bool sysmem;
1027 
1028 	dsc = cdev->si_drv1;
1029 	if (dsc == NULL) {
1030 		/* 'cdev' has been created but is not ready for use */
1031 		return (ENXIO);
1032 	}
1033 
1034 	first = *offset;
1035 	last = *offset + len;
1036 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
1037 		return (EINVAL);
1038 
1039 	vm_slock_memsegs(dsc->sc->vm);
1040 
1041 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
1042 	KASSERT(error == 0 && !sysmem && *objp != NULL,
1043 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
1044 
1045 	if (seglen >= last)
1046 		vm_object_reference(*objp);
1047 	else
1048 		error = EINVAL;
1049 
1050 	vm_unlock_memsegs(dsc->sc->vm);
1051 	return (error);
1052 }
1053 
1054 static struct cdevsw devmemsw = {
1055 	.d_name		= "devmem",
1056 	.d_version	= D_VERSION,
1057 	.d_mmap_single	= devmem_mmap_single,
1058 };
1059 
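/*
 * Create the /dev/vmm.io/<vmname>.<segname> cdev for a device memory
 * segment.  On success, ownership of 'devname' passes to the devmem
 * softc; on failure, the name and the softc are freed here.
 */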
1060 static int
1061 devmem_create_cdev(struct vmmdev_softc *sc, int segid, char *devname)
1062 {
1063 	struct make_dev_args mda;
1064 	struct devmem_softc *dsc;
1065 	int error;
1066 
1067 	sx_xlock(&vmmdev_mtx);
1068 
1069 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1070 	dsc->segid = segid;
1071 	dsc->name = devname;
1072 	dsc->sc = sc;
1073 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
1074 
1075 	make_dev_args_init(&mda);
1076 	mda.mda_devsw = &devmemsw;
1077 	mda.mda_cr = sc->ucred;
1078 	mda.mda_uid = UID_ROOT;
1079 	mda.mda_gid = GID_WHEEL;
1080 	mda.mda_mode = 0600;
1081 	mda.mda_si_drv1 = dsc;
1082 	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1083 	error = make_dev_s(&mda, &dsc->cdev, "vmm.io/%s.%s", vm_name(sc->vm),
1084 	    devname);
1085 	if (error != 0) {
1086 		SLIST_REMOVE(&sc->devmem, dsc, devmem_softc, link);
1087 		free(dsc->name, M_VMMDEV);
1088 		free(dsc, M_VMMDEV);
1089 	}
1090 
1091 	sx_xunlock(&vmmdev_mtx);
1092 
1093 	return (error);
1094 }
1095 
1096 static void
1097 devmem_destroy(void *arg)
1098 {
1099 	struct devmem_softc *dsc = arg;
1100 
1101 	destroy_dev(dsc->cdev);
1102 	dsc->cdev = NULL;
1103 	dsc->sc = NULL;
1104 }
1105