xref: /freebsd/sys/dev/vmm/vmm_dev.c (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
6  * All rights reserved.
7  */
8 
9 #include <sys/param.h>
10 #include <sys/conf.h>
11 #include <sys/fcntl.h>
12 #include <sys/ioccom.h>
13 #include <sys/jail.h>
14 #include <sys/kernel.h>
15 #include <sys/malloc.h>
16 #include <sys/mman.h>
17 #include <sys/proc.h>
18 #include <sys/queue.h>
19 #include <sys/sx.h>
20 #include <sys/sysctl.h>
21 #include <sys/ucred.h>
22 #include <sys/uio.h>
23 
24 #include <machine/vmm.h>
25 
26 #include <vm/vm.h>
27 #include <vm/vm_object.h>
28 
29 #include <dev/vmm/vmm_dev.h>
30 #include <dev/vmm/vmm_stat.h>
31 
#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
/*
 * Memory-segment ioctl argument layout used by the COMPAT_FREEBSD12
 * commands below.  It carries a 64-byte name, and the static assertion
 * pins the whole structure at 80 bytes so the ioctl encodings stay stable.
 */
struct vm_memseg_12 {
	int		segid;
	size_t		len;
	char		name[64];
};
_Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI");

/*
 * Same ioctl numbers as VM_ALLOC_MEMSEG/VM_GET_MEMSEG, but encoded with the
 * size of 'struct vm_memseg_12'.
 */
#define	VM_ALLOC_MEMSEG_12	\
	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12)
#define	VM_GET_MEMSEG_12	\
	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12)
#endif
45 
/*
 * State for one device-memory segment.  Instances are allocated in
 * devmem_create_cdev(), linked on the owning VM's 'devmem' list and
 * released in vmmdev_destroy().
 */
struct devmem_softc {
	int	segid;			/* memory segment id */
	char	*name;			/* segment name (M_VMMDEV allocation) */
	struct cdev *cdev;		/* devmem cdev; NULL after devmem_destroy() */
	struct vmmdev_softc *sc;	/* owning VM; NULL after devmem_destroy() */
	SLIST_ENTRY(devmem_softc) link;	/* entry on sc->devmem */
};
53 
/*
 * Per-VM device state.  Allocated by vmmdev_alloc(), linked on the global
 * 'head' list and torn down by vmmdev_destroy().
 */
struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;		/* VM cdev; NULL before creation completes
					   and once destruction is scheduled */
	struct ucred	*ucred;		/* creator's credential, held via crhold() */
	SLIST_ENTRY(vmmdev_softc) link;	/* entry on the global 'head' list */
	SLIST_HEAD(, devmem_softc) devmem; /* device-memory segments of this VM */
	int		flags;		/* NOTE(review): not referenced by the code
					   visible in this file */
};
62 
/* All VM softcs; insertions and removals are done with vmmdev_mtx held. */
static SLIST_HEAD(, vmmdev_softc) head;

/* Prison allow bit returned by prison_add_allow() in vmmdev_init(). */
static unsigned pr_allow_flag;
/* sx(9) lock (despite the name) protecting the VM list and cdev pointers. */
static struct sx vmmdev_mtx;
SX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex");

/* malloc(9) type used for every allocation made by this file. */
static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

/* Forward declarations; both functions are defined at the end of the file. */
static void devmem_destroy(void *arg);
static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem);
75 
76 static int
77 vmm_priv_check(struct ucred *ucred)
78 {
79 	if (jailed(ucred) &&
80 	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
81 		return (EPERM);
82 
83 	return (0);
84 }
85 
86 static int
87 vcpu_lock_one(struct vcpu *vcpu)
88 {
89 	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
90 }
91 
92 static void
93 vcpu_unlock_one(struct vcpu *vcpu)
94 {
95 	enum vcpu_state state;
96 
97 	state = vcpu_get_state(vcpu, NULL);
98 	if (state != VCPU_FROZEN) {
99 		panic("vcpu %s(%d) has invalid state %d",
100 		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
101 	}
102 
103 	vcpu_set_state(vcpu, VCPU_IDLE, false);
104 }
105 
106 static int
107 vcpu_lock_all(struct vmmdev_softc *sc)
108 {
109 	struct vcpu *vcpu;
110 	int error;
111 	uint16_t i, j, maxcpus;
112 
113 	error = 0;
114 	vm_slock_vcpus(sc->vm);
115 	maxcpus = vm_get_maxcpus(sc->vm);
116 	for (i = 0; i < maxcpus; i++) {
117 		vcpu = vm_vcpu(sc->vm, i);
118 		if (vcpu == NULL)
119 			continue;
120 		error = vcpu_lock_one(vcpu);
121 		if (error)
122 			break;
123 	}
124 
125 	if (error) {
126 		for (j = 0; j < i; j++) {
127 			vcpu = vm_vcpu(sc->vm, j);
128 			if (vcpu == NULL)
129 				continue;
130 			vcpu_unlock_one(vcpu);
131 		}
132 		vm_unlock_vcpus(sc->vm);
133 	}
134 
135 	return (error);
136 }
137 
138 static void
139 vcpu_unlock_all(struct vmmdev_softc *sc)
140 {
141 	struct vcpu *vcpu;
142 	uint16_t i, maxcpus;
143 
144 	maxcpus = vm_get_maxcpus(sc->vm);
145 	for (i = 0; i < maxcpus; i++) {
146 		vcpu = vm_vcpu(sc->vm, i);
147 		if (vcpu == NULL)
148 			continue;
149 		vcpu_unlock_one(vcpu);
150 	}
151 	vm_unlock_vcpus(sc->vm);
152 }
153 
154 static struct vmmdev_softc *
155 vmmdev_lookup(const char *name, struct ucred *cred)
156 {
157 	struct vmmdev_softc *sc;
158 
159 	sx_assert(&vmmdev_mtx, SA_XLOCKED);
160 
161 	SLIST_FOREACH(sc, &head, link) {
162 		if (strcmp(name, vm_name(sc->vm)) == 0)
163 			break;
164 	}
165 
166 	if (sc == NULL)
167 		return (NULL);
168 
169 	if (cr_cansee(cred, sc->ucred))
170 		return (NULL);
171 
172 	return (sc);
173 }
174 
175 static struct vmmdev_softc *
176 vmmdev_lookup2(struct cdev *cdev)
177 {
178 	return (cdev->si_drv1);
179 }
180 
181 static int
182 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
183 {
184 	int error, off, c, prot;
185 	vm_paddr_t gpa, maxaddr;
186 	void *hpa, *cookie;
187 	struct vmmdev_softc *sc;
188 
189 	sc = vmmdev_lookup2(cdev);
190 	if (sc == NULL)
191 		return (ENXIO);
192 
193 	/*
194 	 * Get a read lock on the guest memory map.
195 	 */
196 	vm_slock_memsegs(sc->vm);
197 
198 	error = 0;
199 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
200 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
201 	while (uio->uio_resid > 0 && error == 0) {
202 		gpa = uio->uio_offset;
203 		off = gpa & PAGE_MASK;
204 		c = min(uio->uio_resid, PAGE_SIZE - off);
205 
206 		/*
207 		 * The VM has a hole in its physical memory map. If we want to
208 		 * use 'dd' to inspect memory beyond the hole we need to
209 		 * provide bogus data for memory that lies in the hole.
210 		 *
211 		 * Since this device does not support lseek(2), dd(1) will
212 		 * read(2) blocks of data to simulate the lseek(2).
213 		 */
214 		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
215 		if (hpa == NULL) {
216 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
217 				error = uiomove(__DECONST(void *, zero_region),
218 				    c, uio);
219 			else
220 				error = EFAULT;
221 		} else {
222 			error = uiomove(hpa, c, uio);
223 			vm_gpa_release(cookie);
224 		}
225 	}
226 	vm_unlock_memsegs(sc->vm);
227 	return (error);
228 }
229 
/*
 * Compile-time check that the 'name' field of struct vm_memseg can hold
 * VM_MAX_SUFFIXLEN characters plus one more byte (presumably the
 * terminating NUL).
 */
CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);
231 
232 static int
233 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
234 {
235 	struct devmem_softc *dsc;
236 	int error;
237 	bool sysmem;
238 
239 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
240 	if (error || mseg->len == 0)
241 		return (error);
242 
243 	if (!sysmem) {
244 		SLIST_FOREACH(dsc, &sc->devmem, link) {
245 			if (dsc->segid == mseg->segid)
246 				break;
247 		}
248 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
249 		    __func__, mseg->segid));
250 		error = copystr(dsc->name, mseg->name, len, NULL);
251 	} else {
252 		bzero(mseg->name, len);
253 	}
254 
255 	return (error);
256 }
257 
258 static int
259 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
260 {
261 	char *name;
262 	int error;
263 	bool sysmem;
264 
265 	error = 0;
266 	name = NULL;
267 	sysmem = true;
268 
269 	/*
270 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
271 	 * by stripped off when devfs processes the full string.
272 	 */
273 	if (VM_MEMSEG_NAME(mseg)) {
274 		sysmem = false;
275 		name = malloc(len, M_VMMDEV, M_WAITOK);
276 		error = copystr(mseg->name, name, len, NULL);
277 		if (error)
278 			goto done;
279 	}
280 
281 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
282 	if (error)
283 		goto done;
284 
285 	if (VM_MEMSEG_NAME(mseg)) {
286 		error = devmem_create_cdev(sc, mseg->segid, name);
287 		if (error)
288 			vm_free_memseg(sc->vm, mseg->segid);
289 		else
290 			name = NULL;	/* freed when 'cdev' is destroyed */
291 	}
292 done:
293 	free(name, M_VMMDEV);
294 	return (error);
295 }
296 
297 static int
298 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
299     uint64_t *regval)
300 {
301 	int error, i;
302 
303 	error = 0;
304 	for (i = 0; i < count; i++) {
305 		error = vm_get_register(vcpu, regnum[i], &regval[i]);
306 		if (error)
307 			break;
308 	}
309 	return (error);
310 }
311 
312 static int
313 vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
314     uint64_t *regval)
315 {
316 	int error, i;
317 
318 	error = 0;
319 	for (i = 0; i < count; i++) {
320 		error = vm_set_register(vcpu, regnum[i], regval[i]);
321 		if (error)
322 			break;
323 	}
324 	return (error);
325 }
326 
327 static int
328 vmmdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
329 {
330 	int error;
331 
332 	/*
333 	 * A jail without vmm access shouldn't be able to access vmm device
334 	 * files at all, but check here just to be thorough.
335 	 */
336 	error = vmm_priv_check(td->td_ucred);
337 	if (error != 0)
338 		return (error);
339 
340 	return (0);
341 }
342 
343 static const struct vmmdev_ioctl vmmdev_ioctls[] = {
344 	VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
345 	VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
346 	VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
347 	VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
348 	VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
349 	VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
350 	VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU),
351 	VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU),
352 	VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
353 
354 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
355 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12,
356 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
357 #endif
358 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
359 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
360 	VMMDEV_IOCTL(VM_MMAP_MEMSEG,
361 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
362 	VMMDEV_IOCTL(VM_MUNMAP_MEMSEG,
363 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
364 	VMMDEV_IOCTL(VM_REINIT,
365 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
366 
367 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
368 	VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
369 #endif
370 	VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
371 	VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),
372 
373 	VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
374 	VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
375 
376 	VMMDEV_IOCTL(VM_SUSPEND, 0),
377 	VMMDEV_IOCTL(VM_GET_CPUS, 0),
378 	VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0),
379 	VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0),
380 };
381 
/*
 * ioctl(2) handler for a VM device node.
 *
 * The command is first looked up in the machine-independent table
 * (vmmdev_ioctls) and then in the machine-dependent one; unknown commands
 * fail with ENOTTY.  The flags of the matching table entry decide which
 * locks are taken before the handler runs:
 *
 *  - XLOCK_MEMSEGS / SLOCK_MEMSEGS: exclusive or shared memseg lock;
 *  - LOCK_ONE_VCPU / ALLOC_VCPU / MAYBE_ALLOC_VCPU: the first int of the
 *    argument is a vcpu id, which is resolved with vm_alloc_vcpu() (and
 *    frozen for LOCK_ONE_VCPU); an id of -1 is accepted only with
 *    MAYBE_ALLOC_VCPU, in which case 'vcpu' stays NULL;
 *  - LOCK_ALL_VCPUS: every vcpu is frozen.
 *
 * Commands without a case below are passed to vmmdev_machdep_ioctl().
 * All locks are dropped again before returning.
 *
 * Returns 0 or an errno value; ENXIO if the cdev has no softc and EINVAL
 * if the vcpu id is unacceptable.
 */
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{
	struct vmmdev_softc *sc;
	struct vcpu *vcpu;
	const struct vmmdev_ioctl *ioctl;
	int error, vcpuid;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/* Find the table entry describing the locking for this command. */
	ioctl = NULL;
	for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) {
		if (vmmdev_ioctls[i].cmd == cmd) {
			ioctl = &vmmdev_ioctls[i];
			break;
		}
	}
	if (ioctl == NULL) {
		for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) {
			if (vmmdev_machdep_ioctls[i].cmd == cmd) {
				ioctl = &vmmdev_machdep_ioctls[i];
				break;
			}
		}
	}
	if (ioctl == NULL)
		return (ENOTTY);

	/* The memseg lock, if any, is taken before any vcpu is frozen. */
	if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0)
		vm_xlock_memsegs(sc->vm);
	else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0)
		vm_slock_memsegs(sc->vm);

	vcpu = NULL;
	vcpuid = -1;
	if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU |
	    VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) {
		/* For these commands the argument starts with the vcpu id. */
		vcpuid = *(int *)data;
		if (vcpuid == -1) {
			if ((ioctl->flags &
			    VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) {
				error = EINVAL;
				goto lockfail;
			}
		} else {
			vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
			if (vcpu == NULL) {
				error = EINVAL;
				goto lockfail;
			}
			if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) {
				error = vcpu_lock_one(vcpu);
				if (error)
					goto lockfail;
			}
		}
	}
	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) {
		error = vcpu_lock_all(sc);
		if (error)
			goto lockfail;
	}

	switch (cmd) {
	case VM_SUSPEND: {
		struct vm_suspend *vmsuspend;

		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	}
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	/*
	 * NOTE(review): this case is reachable only if VM_STAT_DESC is listed
	 * in one of the ioctl tables searched above.
	 */
	case VM_STAT_DESC: {
		struct vm_stat_desc *statdesc;

		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index, statdesc->desc,
		    sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		struct vm_stats *vmstats;

		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(vcpu, vmstats->index,
		    nitems(vmstats->statbuf), &vmstats->num_entries,
		    vmstats->statbuf);
		break;
	}
	case VM_MMAP_GETNEXT: {
		struct vm_memmap *mm;

		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	}
	case VM_MMAP_MEMSEG: {
		struct vm_memmap *mm;

		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
	}
	case VM_MUNMAP_MEMSEG: {
		struct vm_munmap *mu;

		mu = (struct vm_munmap *)data;
		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
		break;
	}
#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
	/*
	 * The compat commands reuse the regular handlers; only the size of
	 * the name buffer passed along differs.
	 */
	case VM_ALLOC_MEMSEG_12:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_12 *)0)->name));
		break;
	case VM_GET_MEMSEG_12:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_12 *)0)->name));
		break;
#endif
	case VM_ALLOC_MEMSEG:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
	case VM_GET_REGISTER: {
		struct vm_register *vmreg;

		vmreg = (struct vm_register *)data;
		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
		break;
	}
	case VM_SET_REGISTER: {
		struct vm_register *vmreg;

		vmreg = (struct vm_register *)data;
		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
		break;
	}
	case VM_GET_REGISTER_SET: {
		struct vm_register_set *vmregset;
		uint64_t *regvals;
		int *regnums;

		vmregset = (struct vm_register_set *)data;
		/* Bounding the count also bounds the allocations below. */
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = vm_get_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		if (error == 0)
			error = copyout(regvals, vmregset->regvals,
			    sizeof(regvals[0]) * vmregset->count);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	}
	case VM_SET_REGISTER_SET: {
		struct vm_register_set *vmregset;
		uint64_t *regvals;
		int *regnums;

		vmregset = (struct vm_register_set *)data;
		/* Bounding the count also bounds the allocations below. */
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = copyin(vmregset->regvals, regvals,
			    sizeof(regvals[0]) * vmregset->count);
		if (error == 0)
			error = vm_set_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	}
	case VM_GET_CAPABILITY: {
		struct vm_capability *vmcap;

		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval);
		break;
	}
	case VM_SET_CAPABILITY: {
		struct vm_capability *vmcap;

		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval);
		break;
	}
	case VM_ACTIVATE_CPU:
		error = vm_activate_cpu(vcpu);
		break;
	case VM_GET_CPUS: {
		struct vm_cpuset *vm_cpuset;
		cpuset_t *cpuset;
		int size;

		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		/*
		 * Allocate at least a full cpuset_t so the structure
		 * assignments below stay in bounds; the buffer is zeroed so
		 * any bytes beyond the set copy out as 0.
		 */
		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
		    M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else if (vm_cpuset->which == VM_DEBUG_CPUS)
			*cpuset = vm_debug_cpus(sc->vm);
		else
			error = EINVAL;
		/* The caller's buffer must cover the highest set bit. */
		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
			error = ERANGE;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	}
	case VM_SUSPEND_CPU:
		error = vm_suspend_cpu(sc->vm, vcpu);
		break;
	case VM_RESUME_CPU:
		error = vm_resume_cpu(sc->vm, vcpu);
		break;
	case VM_SET_TOPOLOGY: {
		struct vm_cpu_topology *topology;

		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	}
	case VM_GET_TOPOLOGY: {
		struct vm_cpu_topology *topology;

		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
	}
	default:
		/* Everything else is handled by machine-dependent code. */
		error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag,
		    td);
		break;
	}

	/* Drop whatever locks the table entry asked for. */
	if ((ioctl->flags &
	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
		vm_unlock_memsegs(sc->vm);
	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0)
		vcpu_unlock_all(sc);
	else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0)
		vcpu_unlock_one(vcpu);

	/*
	 * Make sure that no handler returns a kernel-internal
	 * error value to userspace.
	 */
	KASSERT(error == ERESTART || error >= 0,
	    ("vmmdev_ioctl: invalid error return %d", error));
	return (error);

lockfail:
	/*
	 * Reached when vcpu lookup or freezing failed; only the memseg lock
	 * is released on this path.
	 */
	if ((ioctl->flags &
	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
		vm_unlock_memsegs(sc->vm);
	return (error);
}
681 
682 static int
683 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
684     struct vm_object **objp, int nprot)
685 {
686 	struct vmmdev_softc *sc;
687 	vm_paddr_t gpa;
688 	size_t len;
689 	vm_ooffset_t segoff, first, last;
690 	int error, found, segid;
691 	bool sysmem;
692 
693 	first = *offset;
694 	last = first + mapsize;
695 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
696 		return (EINVAL);
697 
698 	sc = vmmdev_lookup2(cdev);
699 	if (sc == NULL) {
700 		/* virtual machine is in the process of being created */
701 		return (EINVAL);
702 	}
703 
704 	/*
705 	 * Get a read lock on the guest memory map.
706 	 */
707 	vm_slock_memsegs(sc->vm);
708 
709 	gpa = 0;
710 	found = 0;
711 	while (!found) {
712 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
713 		    NULL, NULL);
714 		if (error)
715 			break;
716 
717 		if (first >= gpa && last <= gpa + len)
718 			found = 1;
719 		else
720 			gpa += len;
721 	}
722 
723 	if (found) {
724 		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
725 		KASSERT(error == 0 && *objp != NULL,
726 		    ("%s: invalid memory segment %d", __func__, segid));
727 		if (sysmem) {
728 			vm_object_reference(*objp);
729 			*offset = segoff + (first - gpa);
730 		} else {
731 			error = EINVAL;
732 		}
733 	}
734 	vm_unlock_memsegs(sc->vm);
735 	return (error);
736 }
737 
/*
 * Tear down a VM softc whose own cdev is already gone (sc->cdev == NULL).
 *
 * The steps are order-sensitive: destroy the devmem cdevs, stop new vcpus
 * from being created and freeze the existing ones, free the devmem softcs,
 * destroy the VM, drop the credential reference, and finally unlink the
 * softc from the global list and free it.
 *
 * Must be called without vmmdev_mtx held, since the lock is acquired here.
 */
static void
vmmdev_destroy(struct vmmdev_softc *sc)
{
	struct devmem_softc *dsc;
	int error __diagused;

	KASSERT(sc->cdev == NULL, ("%s: cdev not free", __func__));

	/*
	 * Destroy all cdevs:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		devmem_destroy(dsc);
	}

	/*
	 * Freeze every vcpu.  Only the vcpus lock taken by vcpu_lock_all() is
	 * dropped again; the vcpus themselves are left frozen.
	 */
	vm_disable_vcpu_creation(sc->vm);
	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
	vm_unlock_vcpus(sc->vm);

	/* The devmem cdevs are gone; release their softcs and names. */
	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	/*
	 * NOTE(review): sc->vm has already been dereferenced unconditionally
	 * above, so this NULL check cannot be what protects those calls.
	 */
	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if (sc->ucred != NULL)
		crfree(sc->ucred);

	sx_xlock(&vmmdev_mtx);
	SLIST_REMOVE(&head, sc, vmmdev_softc, link);
	sx_xunlock(&vmmdev_mtx);
	free(sc, M_VMMDEV);
}
781 
782 static int
783 vmmdev_lookup_and_destroy(const char *name, struct ucred *cred)
784 {
785 	struct cdev *cdev;
786 	struct vmmdev_softc *sc;
787 
788 	sx_xlock(&vmmdev_mtx);
789 	sc = vmmdev_lookup(name, cred);
790 	if (sc == NULL || sc->cdev == NULL) {
791 		sx_xunlock(&vmmdev_mtx);
792 		return (EINVAL);
793 	}
794 
795 	/*
796 	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
797 	 * is scheduled for destruction.
798 	 */
799 	cdev = sc->cdev;
800 	sc->cdev = NULL;
801 	sx_xunlock(&vmmdev_mtx);
802 
803 	destroy_dev(cdev);
804 	vmmdev_destroy(sc);
805 
806 	return (0);
807 }
808 
809 static int
810 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
811 {
812 	char *buf;
813 	int error, buflen;
814 
815 	error = vmm_priv_check(req->td->td_ucred);
816 	if (error)
817 		return (error);
818 
819 	buflen = VM_MAX_NAMELEN + 1;
820 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
821 	strlcpy(buf, "beavis", buflen);
822 	error = sysctl_handle_string(oidp, buf, buflen, req);
823 	if (error == 0 && req->newptr != NULL)
824 		error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred);
825 	free(buf, M_VMMDEV);
826 	return (error);
827 }
828 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
829     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
830     NULL, 0, sysctl_vmm_destroy, "A",
831     NULL);
832 
/*
 * Character device switch for the per-VM device nodes created by
 * vmmdev_create().  Reads and writes share a single handler.
 */
static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_open		= vmmdev_open,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};
842 
843 static struct vmmdev_softc *
844 vmmdev_alloc(struct vm *vm, struct ucred *cred)
845 {
846 	struct vmmdev_softc *sc;
847 
848 	sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO);
849 	SLIST_INIT(&sc->devmem);
850 	sc->vm = vm;
851 	sc->ucred = crhold(cred);
852 	return (sc);
853 }
854 
855 static int
856 vmmdev_create(const char *name, struct ucred *cred)
857 {
858 	struct make_dev_args mda;
859 	struct cdev *cdev;
860 	struct vmmdev_softc *sc;
861 	struct vm *vm;
862 	int error;
863 
864 	sx_xlock(&vmmdev_mtx);
865 	sc = vmmdev_lookup(name, cred);
866 	if (sc != NULL) {
867 		sx_xunlock(&vmmdev_mtx);
868 		return (EEXIST);
869 	}
870 
871 	error = vm_create(name, &vm);
872 	if (error != 0) {
873 		sx_xunlock(&vmmdev_mtx);
874 		return (error);
875 	}
876 	sc = vmmdev_alloc(vm, cred);
877 	SLIST_INSERT_HEAD(&head, sc, link);
878 
879 	make_dev_args_init(&mda);
880 	mda.mda_devsw = &vmmdevsw;
881 	mda.mda_cr = sc->ucred;
882 	mda.mda_uid = UID_ROOT;
883 	mda.mda_gid = GID_WHEEL;
884 	mda.mda_mode = 0600;
885 	mda.mda_si_drv1 = sc;
886 	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
887 	error = make_dev_s(&mda, &cdev, "vmm/%s", name);
888 	if (error != 0) {
889 		sx_xunlock(&vmmdev_mtx);
890 		vmmdev_destroy(sc);
891 		return (error);
892 	}
893 	sc->cdev = cdev;
894 	sx_xunlock(&vmmdev_mtx);
895 	return (0);
896 }
897 
898 static int
899 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
900 {
901 	char *buf;
902 	int error, buflen;
903 
904 	error = vmm_priv_check(req->td->td_ucred);
905 	if (error != 0)
906 		return (error);
907 
908 	buflen = VM_MAX_NAMELEN + 1;
909 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
910 	strlcpy(buf, "beavis", buflen);
911 	error = sysctl_handle_string(oidp, buf, buflen, req);
912 	if (error == 0 && req->newptr != NULL)
913 		error = vmmdev_create(buf, req->td->td_ucred);
914 	free(buf, M_VMMDEV);
915 	return (error);
916 }
917 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
918     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
919     NULL, 0, sysctl_vmm_create, "A",
920     NULL);
921 
922 static int
923 vmmctl_open(struct cdev *cdev, int flags, int fmt, struct thread *td)
924 {
925 	int error;
926 
927 	error = vmm_priv_check(td->td_ucred);
928 	if (error != 0)
929 		return (error);
930 
931 	if ((flags & FWRITE) == 0)
932 		return (EPERM);
933 
934 	return (0);
935 }
936 
937 static int
938 vmmctl_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
939     struct thread *td)
940 {
941 	int error;
942 
943 	switch (cmd) {
944 	case VMMCTL_VM_CREATE: {
945 		struct vmmctl_vm_create *vmc;
946 
947 		vmc = (struct vmmctl_vm_create *)data;
948 		vmc->name[VM_MAX_NAMELEN] = '\0';
949 		for (size_t i = 0; i < nitems(vmc->reserved); i++) {
950 			if (vmc->reserved[i] != 0) {
951 				error = EINVAL;
952 				return (error);
953 			}
954 		}
955 
956 		error = vmmdev_create(vmc->name, td->td_ucred);
957 		break;
958 	}
959 	case VMMCTL_VM_DESTROY: {
960 		struct vmmctl_vm_destroy *vmd;
961 
962 		vmd = (struct vmmctl_vm_destroy *)data;
963 		vmd->name[VM_MAX_NAMELEN] = '\0';
964 		for (size_t i = 0; i < nitems(vmd->reserved); i++) {
965 			if (vmd->reserved[i] != 0) {
966 				error = EINVAL;
967 				return (error);
968 			}
969 		}
970 
971 		error = vmmdev_lookup_and_destroy(vmd->name, td->td_ucred);
972 		break;
973 	}
974 	default:
975 		error = ENOTTY;
976 		break;
977 	}
978 
979 	return (error);
980 }
981 
/* The vmmctl cdev; created in vmmdev_init(), destroyed in vmmdev_cleanup(). */
static struct cdev *vmmctl_cdev;
/* Character device switch for the vmmctl control device. */
static struct cdevsw vmmctlsw = {
	.d_name		= "vmmctl",
	.d_version	= D_VERSION,
	.d_open		= vmmctl_open,
	.d_ioctl	= vmmctl_ioctl,
};
989 
990 int
991 vmmdev_init(void)
992 {
993 	int error;
994 
995 	sx_xlock(&vmmdev_mtx);
996 	error = make_dev_p(MAKEDEV_CHECKNAME, &vmmctl_cdev, &vmmctlsw, NULL,
997 	    UID_ROOT, GID_WHEEL, 0600, "vmmctl");
998 	if (error == 0)
999 		pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
1000 		    "Allow use of vmm in a jail.");
1001 	sx_xunlock(&vmmdev_mtx);
1002 
1003 	return (error);
1004 }
1005 
1006 int
1007 vmmdev_cleanup(void)
1008 {
1009 	sx_xlock(&vmmdev_mtx);
1010 	if (!SLIST_EMPTY(&head)) {
1011 		sx_xunlock(&vmmdev_mtx);
1012 		return (EBUSY);
1013 	}
1014 	if (vmmctl_cdev != NULL) {
1015 		destroy_dev(vmmctl_cdev);
1016 		vmmctl_cdev = NULL;
1017 	}
1018 	sx_xunlock(&vmmdev_mtx);
1019 
1020 	return (0);
1021 }
1022 
1023 static int
1024 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
1025     struct vm_object **objp, int nprot)
1026 {
1027 	struct devmem_softc *dsc;
1028 	vm_ooffset_t first, last;
1029 	size_t seglen;
1030 	int error;
1031 	bool sysmem;
1032 
1033 	dsc = cdev->si_drv1;
1034 	if (dsc == NULL) {
1035 		/* 'cdev' has been created but is not ready for use */
1036 		return (ENXIO);
1037 	}
1038 
1039 	first = *offset;
1040 	last = *offset + len;
1041 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
1042 		return (EINVAL);
1043 
1044 	vm_slock_memsegs(dsc->sc->vm);
1045 
1046 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
1047 	KASSERT(error == 0 && !sysmem && *objp != NULL,
1048 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
1049 
1050 	if (seglen >= last)
1051 		vm_object_reference(*objp);
1052 	else
1053 		error = EINVAL;
1054 
1055 	vm_unlock_memsegs(dsc->sc->vm);
1056 	return (error);
1057 }
1058 
/*
 * Character device switch for the device-memory cdevs created by
 * devmem_create_cdev().  Only mmap is provided.
 */
static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};
1064 
1065 static int
1066 devmem_create_cdev(struct vmmdev_softc *sc, int segid, char *devname)
1067 {
1068 	struct make_dev_args mda;
1069 	struct devmem_softc *dsc;
1070 	int error;
1071 
1072 	sx_xlock(&vmmdev_mtx);
1073 
1074 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1075 	dsc->segid = segid;
1076 	dsc->name = devname;
1077 	dsc->sc = sc;
1078 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
1079 
1080 	make_dev_args_init(&mda);
1081 	mda.mda_devsw = &devmemsw;
1082 	mda.mda_cr = sc->ucred;
1083 	mda.mda_uid = UID_ROOT;
1084 	mda.mda_gid = GID_WHEEL;
1085 	mda.mda_mode = 0600;
1086 	mda.mda_si_drv1 = dsc;
1087 	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1088 	error = make_dev_s(&mda, &dsc->cdev, "vmm.io/%s.%s", vm_name(sc->vm),
1089 	    devname);
1090 	if (error != 0) {
1091 		SLIST_REMOVE(&sc->devmem, dsc, devmem_softc, link);
1092 		free(dsc->name, M_VMMDEV);
1093 		free(dsc, M_VMMDEV);
1094 	}
1095 
1096 	sx_xunlock(&vmmdev_mtx);
1097 
1098 	return (error);
1099 }
1100 
1101 static void
1102 devmem_destroy(void *arg)
1103 {
1104 	struct devmem_softc *dsc = arg;
1105 
1106 	destroy_dev(dsc->cdev);
1107 	dsc->cdev = NULL;
1108 	dsc->sc = NULL;
1109 }
1110