xref: /freebsd/sys/dev/vmm/vmm_dev.c (revision d581970976101e1bcbfc23823b97696b172f118c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
6  * All rights reserved.
7  */
8 
9 #include <sys/param.h>
10 #include <sys/conf.h>
11 #include <sys/ioccom.h>
12 #include <sys/jail.h>
13 #include <sys/kernel.h>
14 #include <sys/malloc.h>
15 #include <sys/mman.h>
16 #include <sys/mutex.h>
17 #include <sys/proc.h>
18 #include <sys/queue.h>
19 #include <sys/sysctl.h>
20 #include <sys/ucred.h>
21 #include <sys/uio.h>
22 
23 #include <machine/vmm.h>
24 
25 #include <vm/vm.h>
26 #include <vm/vm_object.h>
27 
28 #include <dev/vmm/vmm_dev.h>
29 #include <dev/vmm/vmm_stat.h>
30 
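/*
 * FreeBSD 12 used a 64-byte name field in struct vm_memseg.  Preserve that
 * layout so the memory segment ioctls issued by older binaries can still be
 * decoded.
 */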
31 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
32 struct vm_memseg_fbsd12 {
33 	int		segid;
34 	size_t		len;
35 	char		name[64];
36 };
37 _Static_assert(sizeof(struct vm_memseg_fbsd12) == 80, "COMPAT_FREEBSD12 ABI");
38 
39 #define	VM_ALLOC_MEMSEG_FBSD12	\
40 	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_fbsd12)
41 #define	VM_GET_MEMSEG_FBSD12	\
42 	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_fbsd12)
43 #endif
44 
45 static int devmem_create_cdev(const char *vmname, int id, char *devmem);
46 
47 struct devmem_softc {
48 	int	segid;
49 	char	*name;
50 	struct cdev *cdev;
51 	struct vmmdev_softc *sc;
52 	SLIST_ENTRY(devmem_softc) link;
53 };
54 
55 struct vmmdev_softc {
56 	struct vm	*vm;		/* vm instance cookie */
57 	struct cdev	*cdev;
58 	struct ucred	*ucred;
59 	SLIST_ENTRY(vmmdev_softc) link;
60 	SLIST_HEAD(, devmem_softc) devmem;
61 	int		flags;
62 };
63 #define	VSC_LINKED		0x01
64 
65 static SLIST_HEAD(, vmmdev_softc) head;
66 
67 static unsigned pr_allow_flag;
68 static struct mtx vmmdev_mtx;
69 MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF);
70 
71 static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
72 
73 SYSCTL_DECL(_hw_vmm);
74 
75 static void devmem_destroy(void *arg);
76 
77 static int
78 vmm_priv_check(struct ucred *ucred)
79 {
80 	if (jailed(ucred) &&
81 	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
82 		return (EPERM);
83 
84 	return (0);
85 }
86 
87 static int
88 vcpu_lock_one(struct vcpu *vcpu)
89 {
90 	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
91 }
92 
93 static void
94 vcpu_unlock_one(struct vcpu *vcpu)
95 {
96 	enum vcpu_state state;
97 
98 	state = vcpu_get_state(vcpu, NULL);
99 	if (state != VCPU_FROZEN) {
100 		panic("vcpu %s(%d) has invalid state %d",
101 		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
102 	}
103 
104 	vcpu_set_state(vcpu, VCPU_IDLE, false);
105 }
106 
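/*
 * Freeze every vcpu that has been created so far.  On failure, thaw any vcpus
 * that were already frozen and drop the shared vcpu lock again.
 */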
107 static int
108 vcpu_lock_all(struct vmmdev_softc *sc)
109 {
110 	struct vcpu *vcpu;
111 	int error;
112 	uint16_t i, j, maxcpus;
113 
114 	error = 0;
115 	vm_slock_vcpus(sc->vm);
116 	maxcpus = vm_get_maxcpus(sc->vm);
117 	for (i = 0; i < maxcpus; i++) {
118 		vcpu = vm_vcpu(sc->vm, i);
119 		if (vcpu == NULL)
120 			continue;
121 		error = vcpu_lock_one(vcpu);
122 		if (error)
123 			break;
124 	}
125 
126 	if (error) {
127 		for (j = 0; j < i; j++) {
128 			vcpu = vm_vcpu(sc->vm, j);
129 			if (vcpu == NULL)
130 				continue;
131 			vcpu_unlock_one(vcpu);
132 		}
133 		vm_unlock_vcpus(sc->vm);
134 	}
135 
136 	return (error);
137 }
138 
139 static void
140 vcpu_unlock_all(struct vmmdev_softc *sc)
141 {
142 	struct vcpu *vcpu;
143 	uint16_t i, maxcpus;
144 
145 	maxcpus = vm_get_maxcpus(sc->vm);
146 	for (i = 0; i < maxcpus; i++) {
147 		vcpu = vm_vcpu(sc->vm, i);
148 		if (vcpu == NULL)
149 			continue;
150 		vcpu_unlock_one(vcpu);
151 	}
152 	vm_unlock_vcpus(sc->vm);
153 }
154 
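/*
 * Look up a VM by name.  The caller must hold vmmdev_mtx.  A VM whose
 * creator's credentials are not visible to the current thread (cr_cansee())
 * is treated as nonexistent.
 */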
155 static struct vmmdev_softc *
156 vmmdev_lookup(const char *name)
157 {
158 	struct vmmdev_softc *sc;
159 
160 	mtx_assert(&vmmdev_mtx, MA_OWNED);
161 
162 	SLIST_FOREACH(sc, &head, link) {
163 		if (strcmp(name, vm_name(sc->vm)) == 0)
164 			break;
165 	}
166 
167 	if (sc == NULL)
168 		return (NULL);
169 
170 	if (cr_cansee(curthread->td_ucred, sc->ucred))
171 		return (NULL);
172 
173 	return (sc);
174 }
175 
176 static struct vmmdev_softc *
177 vmmdev_lookup2(struct cdev *cdev)
178 {
179 	return (cdev->si_drv1);
180 }
181 
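/*
 * read(2)/write(2) handler for /dev/vmm/<name>.  The file offset is treated
 * as a guest physical address and the transfer is done at most one page at a
 * time so that each page can be held and released individually.
 */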
182 static int
183 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
184 {
185 	int error, off, c, prot;
186 	vm_paddr_t gpa, maxaddr;
187 	void *hpa, *cookie;
188 	struct vmmdev_softc *sc;
189 
190 	error = vmm_priv_check(curthread->td_ucred);
191 	if (error)
192 		return (error);
193 
194 	sc = vmmdev_lookup2(cdev);
195 	if (sc == NULL)
196 		return (ENXIO);
197 
198 	/*
199 	 * Get a read lock on the guest memory map.
200 	 */
201 	vm_slock_memsegs(sc->vm);
202 
203 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
204 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
205 	while (uio->uio_resid > 0 && error == 0) {
206 		gpa = uio->uio_offset;
207 		off = gpa & PAGE_MASK;
208 		c = min(uio->uio_resid, PAGE_SIZE - off);
209 
210 		/*
211 		 * The VM has a hole in its physical memory map. If we want to
212 		 * use 'dd' to inspect memory beyond the hole, we need to
213 		 * provide bogus data for memory that lies in the hole.
214 		 *
215 		 * Since this device does not support lseek(2), dd(1) will
216 		 * read(2) blocks of data to simulate the lseek(2).
217 		 */
218 		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
219 		if (hpa == NULL) {
220 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
221 				error = uiomove(__DECONST(void *, zero_region),
222 				    c, uio);
223 			else
224 				error = EFAULT;
225 		} else {
226 			error = uiomove(hpa, c, uio);
227 			vm_gpa_release(cookie);
228 		}
229 	}
230 	vm_unlock_memsegs(sc->vm);
231 	return (error);
232 }
233 
234 CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);
235 
236 static int
237 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
238 {
239 	struct devmem_softc *dsc;
240 	int error;
241 	bool sysmem;
242 
243 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
244 	if (error || mseg->len == 0)
245 		return (error);
246 
247 	if (!sysmem) {
248 		SLIST_FOREACH(dsc, &sc->devmem, link) {
249 			if (dsc->segid == mseg->segid)
250 				break;
251 		}
252 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
253 		    __func__, mseg->segid));
254 		error = copystr(dsc->name, mseg->name, len, NULL);
255 	} else {
256 		bzero(mseg->name, len);
257 	}
258 
259 	return (error);
260 }
261 
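/*
 * Allocate a memory segment.  A named segment is device memory and gets its
 * own cdev under /dev/vmm.io; an unnamed segment is guest system memory.
 */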
262 static int
263 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
264 {
265 	char *name;
266 	int error;
267 	bool sysmem;
268 
269 	error = 0;
270 	name = NULL;
271 	sysmem = true;
272 
273 	/*
274 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
275 	 * be stripped off when devfs processes the full string.
276 	 */
277 	if (VM_MEMSEG_NAME(mseg)) {
278 		sysmem = false;
279 		name = malloc(len, M_VMMDEV, M_WAITOK);
280 		error = copystr(mseg->name, name, len, NULL);
281 		if (error)
282 			goto done;
283 	}
284 
285 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
286 	if (error)
287 		goto done;
288 
289 	if (VM_MEMSEG_NAME(mseg)) {
290 		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
291 		if (error)
292 			vm_free_memseg(sc->vm, mseg->segid);
293 		else
294 			name = NULL;	/* freed when 'cdev' is destroyed */
295 	}
296 done:
297 	free(name, M_VMMDEV);
298 	return (error);
299 }
300 
301 static int
302 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
303     uint64_t *regval)
304 {
305 	int error, i;
306 
307 	error = 0;
308 	for (i = 0; i < count; i++) {
309 		error = vm_get_register(vcpu, regnum[i], &regval[i]);
310 		if (error)
311 			break;
312 	}
313 	return (error);
314 }
315 
316 static int
317 vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
318     uint64_t *regval)
319 {
320 	int error, i;
321 
322 	error = 0;
323 	for (i = 0; i < count; i++) {
324 		error = vm_set_register(vcpu, regnum[i], regval[i]);
325 		if (error)
326 			break;
327 	}
328 	return (error);
329 }
330 
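/*
 * Table of generic ioctls and the locking that each one requires.  Commands
 * that appear neither here nor in the machine-dependent table are rejected
 * with ENOTTY.
 */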
331 static const struct vmmdev_ioctl vmmdev_ioctls[] = {
332 	VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
333 	VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
334 	VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
335 	VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
336 	VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
337 	VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
338 	VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU),
339 	VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU),
340 	VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
341 
342 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
343 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_FBSD12,
344 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
345 #endif
346 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
347 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
348 	VMMDEV_IOCTL(VM_MMAP_MEMSEG,
349 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
350 	VMMDEV_IOCTL(VM_MUNMAP_MEMSEG,
351 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
352 	VMMDEV_IOCTL(VM_REINIT,
353 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
354 
355 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
356 	VMMDEV_IOCTL(VM_GET_MEMSEG_FBSD12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
357 #endif
358 	VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
359 	VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),
360 
361 	VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
362 	VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
363 
364 	VMMDEV_IOCTL(VM_SUSPEND, 0),
365 	VMMDEV_IOCTL(VM_GET_CPUS, 0),
366 	VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0),
367 	VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0),
368 };
369 
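/*
 * Main ioctl handler: check privileges, find the softc for the cdev, look up
 * the ioctl descriptor, take the memseg and vcpu locks requested by its
 * flags, and dispatch the command.
 */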
370 static int
371 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
372     struct thread *td)
373 {
374 	struct vmmdev_softc *sc;
375 	struct vcpu *vcpu;
376 	const struct vmmdev_ioctl *ioctl;
377 	int error, vcpuid;
378 
379 	error = vmm_priv_check(td->td_ucred);
380 	if (error)
381 		return (error);
382 
383 	sc = vmmdev_lookup2(cdev);
384 	if (sc == NULL)
385 		return (ENXIO);
386 
387 	ioctl = NULL;
388 	for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) {
389 		if (vmmdev_ioctls[i].cmd == cmd) {
390 			ioctl = &vmmdev_ioctls[i];
391 			break;
392 		}
393 	}
394 	if (ioctl == NULL) {
395 		for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) {
396 			if (vmmdev_machdep_ioctls[i].cmd == cmd) {
397 				ioctl = &vmmdev_machdep_ioctls[i];
398 				break;
399 			}
400 		}
401 	}
402 	if (ioctl == NULL)
403 		return (ENOTTY);
404 
405 	if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0)
406 		vm_xlock_memsegs(sc->vm);
407 	else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0)
408 		vm_slock_memsegs(sc->vm);
409 
410 	vcpu = NULL;
411 	vcpuid = -1;
412 	if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU |
413 	    VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) {
414 		vcpuid = *(int *)data;
415 		if (vcpuid == -1) {
416 			if ((ioctl->flags &
417 			    VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) {
418 				error = EINVAL;
419 				goto lockfail;
420 			}
421 		} else {
422 			vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
423 			if (vcpu == NULL) {
424 				error = EINVAL;
425 				goto lockfail;
426 			}
427 			if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) {
428 				error = vcpu_lock_one(vcpu);
429 				if (error)
430 					goto lockfail;
431 			}
432 		}
433 	}
434 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) {
435 		error = vcpu_lock_all(sc);
436 		if (error)
437 			goto lockfail;
438 	}
439 
440 	switch (cmd) {
441 	case VM_SUSPEND: {
442 		struct vm_suspend *vmsuspend;
443 
444 		vmsuspend = (struct vm_suspend *)data;
445 		error = vm_suspend(sc->vm, vmsuspend->how);
446 		break;
447 	}
448 	case VM_REINIT:
449 		error = vm_reinit(sc->vm);
450 		break;
451 	case VM_STAT_DESC: {
452 		struct vm_stat_desc *statdesc;
453 
454 		statdesc = (struct vm_stat_desc *)data;
455 		error = vmm_stat_desc_copy(statdesc->index, statdesc->desc,
456 		    sizeof(statdesc->desc));
457 		break;
458 	}
459 	case VM_STATS: {
460 		struct vm_stats *vmstats;
461 
462 		vmstats = (struct vm_stats *)data;
463 		getmicrotime(&vmstats->tv);
464 		error = vmm_stat_copy(vcpu, vmstats->index,
465 		    nitems(vmstats->statbuf), &vmstats->num_entries,
466 		    vmstats->statbuf);
467 		break;
468 	}
469 	case VM_MMAP_GETNEXT: {
470 		struct vm_memmap *mm;
471 
472 		mm = (struct vm_memmap *)data;
473 		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
474 		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
475 		break;
476 	}
477 	case VM_MMAP_MEMSEG: {
478 		struct vm_memmap *mm;
479 
480 		mm = (struct vm_memmap *)data;
481 		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
482 		    mm->len, mm->prot, mm->flags);
483 		break;
484 	}
485 	case VM_MUNMAP_MEMSEG: {
486 		struct vm_munmap *mu;
487 
488 		mu = (struct vm_munmap *)data;
489 		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
490 		break;
491 	}
492 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
493 	case VM_ALLOC_MEMSEG_FBSD12:
494 		error = alloc_memseg(sc, (struct vm_memseg *)data,
495 		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
496 		break;
497 	case VM_GET_MEMSEG_FBSD12:
498 		error = get_memseg(sc, (struct vm_memseg *)data,
499 		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
500 		break;
501 #endif
502 	case VM_ALLOC_MEMSEG:
503 		error = alloc_memseg(sc, (struct vm_memseg *)data,
504 		    sizeof(((struct vm_memseg *)0)->name));
505 		break;
506 	case VM_GET_MEMSEG:
507 		error = get_memseg(sc, (struct vm_memseg *)data,
508 		    sizeof(((struct vm_memseg *)0)->name));
509 		break;
510 	case VM_GET_REGISTER: {
511 		struct vm_register *vmreg;
512 
513 		vmreg = (struct vm_register *)data;
514 		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
515 		break;
516 	}
517 	case VM_SET_REGISTER: {
518 		struct vm_register *vmreg;
519 
520 		vmreg = (struct vm_register *)data;
521 		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
522 		break;
523 	}
524 	case VM_GET_REGISTER_SET: {
525 		struct vm_register_set *vmregset;
526 		uint64_t *regvals;
527 		int *regnums;
528 
529 		vmregset = (struct vm_register_set *)data;
530 		if (vmregset->count > VM_REG_LAST) {
531 			error = EINVAL;
532 			break;
533 		}
534 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
535 		    M_WAITOK);
536 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
537 		    M_WAITOK);
538 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
539 		    vmregset->count);
540 		if (error == 0)
541 			error = vm_get_register_set(vcpu,
542 			    vmregset->count, regnums, regvals);
543 		if (error == 0)
544 			error = copyout(regvals, vmregset->regvals,
545 			    sizeof(regvals[0]) * vmregset->count);
546 		free(regvals, M_VMMDEV);
547 		free(regnums, M_VMMDEV);
548 		break;
549 	}
550 	case VM_SET_REGISTER_SET: {
551 		struct vm_register_set *vmregset;
552 		uint64_t *regvals;
553 		int *regnums;
554 
555 		vmregset = (struct vm_register_set *)data;
556 		if (vmregset->count > VM_REG_LAST) {
557 			error = EINVAL;
558 			break;
559 		}
560 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
561 		    M_WAITOK);
562 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
563 		    M_WAITOK);
564 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
565 		    vmregset->count);
566 		if (error == 0)
567 			error = copyin(vmregset->regvals, regvals,
568 			    sizeof(regvals[0]) * vmregset->count);
569 		if (error == 0)
570 			error = vm_set_register_set(vcpu,
571 			    vmregset->count, regnums, regvals);
572 		free(regvals, M_VMMDEV);
573 		free(regnums, M_VMMDEV);
574 		break;
575 	}
576 	case VM_GET_CAPABILITY: {
577 		struct vm_capability *vmcap;
578 
579 		vmcap = (struct vm_capability *)data;
580 		error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval);
581 		break;
582 	}
583 	case VM_SET_CAPABILITY: {
584 		struct vm_capability *vmcap;
585 
586 		vmcap = (struct vm_capability *)data;
587 		error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval);
588 		break;
589 	}
590 	case VM_ACTIVATE_CPU:
591 		error = vm_activate_cpu(vcpu);
592 		break;
593 	case VM_GET_CPUS: {
594 		struct vm_cpuset *vm_cpuset;
595 		cpuset_t *cpuset;
596 		int size;
597 
598 		error = 0;
599 		vm_cpuset = (struct vm_cpuset *)data;
600 		size = vm_cpuset->cpusetsize;
601 		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
602 			error = ERANGE;
603 			break;
604 		}
605 		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
606 		    M_WAITOK | M_ZERO);
607 		if (vm_cpuset->which == VM_ACTIVE_CPUS)
608 			*cpuset = vm_active_cpus(sc->vm);
609 		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
610 			*cpuset = vm_suspended_cpus(sc->vm);
611 		else if (vm_cpuset->which == VM_DEBUG_CPUS)
612 			*cpuset = vm_debug_cpus(sc->vm);
613 		else
614 			error = EINVAL;
615 		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
616 			error = ERANGE;
617 		if (error == 0)
618 			error = copyout(cpuset, vm_cpuset->cpus, size);
619 		free(cpuset, M_TEMP);
620 		break;
621 	}
622 	case VM_SUSPEND_CPU:
623 		error = vm_suspend_cpu(sc->vm, vcpu);
624 		break;
625 	case VM_RESUME_CPU:
626 		error = vm_resume_cpu(sc->vm, vcpu);
627 		break;
628 	case VM_SET_TOPOLOGY: {
629 		struct vm_cpu_topology *topology;
630 
631 		topology = (struct vm_cpu_topology *)data;
632 		error = vm_set_topology(sc->vm, topology->sockets,
633 		    topology->cores, topology->threads, topology->maxcpus);
634 		break;
635 	}
636 	case VM_GET_TOPOLOGY: {
637 		struct vm_cpu_topology *topology;
638 
639 		topology = (struct vm_cpu_topology *)data;
640 		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
641 		    &topology->threads, &topology->maxcpus);
642 		error = 0;
643 		break;
644 	}
645 	default:
646 		error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag,
647 		    td);
648 		break;
649 	}
650 
651 	if ((ioctl->flags &
652 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
653 		vm_unlock_memsegs(sc->vm);
654 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0)
655 		vcpu_unlock_all(sc);
656 	else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0)
657 		vcpu_unlock_one(vcpu);
658 
659 	/*
660 	 * Make sure that no handler returns a kernel-internal
661 	 * error value to userspace.
662 	 */
663 	KASSERT(error == ERESTART || error >= 0,
664 	    ("vmmdev_ioctl: invalid error return %d", error));
665 	return (error);
666 
667 lockfail:
668 	if ((ioctl->flags &
669 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
670 		vm_unlock_memsegs(sc->vm);
671 	return (error);
672 }
673 
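/*
 * mmap(2) handler for /dev/vmm/<name>.  Walk the guest memory map to find the
 * mapping that covers the requested range and return its backing VM object;
 * only system memory segments may be mapped this way.
 */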
674 static int
675 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
676     struct vm_object **objp, int nprot)
677 {
678 	struct vmmdev_softc *sc;
679 	vm_paddr_t gpa;
680 	size_t len;
681 	vm_ooffset_t segoff, first, last;
682 	int error, found, segid;
683 	bool sysmem;
684 
685 	error = vmm_priv_check(curthread->td_ucred);
686 	if (error)
687 		return (error);
688 
689 	first = *offset;
690 	last = first + mapsize;
691 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
692 		return (EINVAL);
693 
694 	sc = vmmdev_lookup2(cdev);
695 	if (sc == NULL) {
696 		/* virtual machine is in the process of being created */
697 		return (EINVAL);
698 	}
699 
700 	/*
701 	 * Get a read lock on the guest memory map.
702 	 */
703 	vm_slock_memsegs(sc->vm);
704 
705 	gpa = 0;
706 	found = 0;
707 	while (!found) {
708 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
709 		    NULL, NULL);
710 		if (error)
711 			break;
712 
713 		if (first >= gpa && last <= gpa + len)
714 			found = 1;
715 		else
716 			gpa += len;
717 	}
718 
719 	if (found) {
720 		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
721 		KASSERT(error == 0 && *objp != NULL,
722 		    ("%s: invalid memory segment %d", __func__, segid));
723 		if (sysmem) {
724 			vm_object_reference(*objp);
725 			*offset = segoff + (first - gpa);
726 		} else {
727 			error = EINVAL;
728 		}
729 	}
730 	vm_unlock_memsegs(sc->vm);
731 	return (error);
732 }
733 
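/*
 * Final teardown of a VM: block further vcpu creation, make sure no vcpu is
 * still running, release the devmem bookkeeping, destroy the cdev and the vm
 * itself, and unlink the softc from the global list.
 */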
734 static void
735 vmmdev_destroy(void *arg)
736 {
737 	struct vmmdev_softc *sc = arg;
738 	struct devmem_softc *dsc;
739 	int error __diagused;
740 
741 	vm_disable_vcpu_creation(sc->vm);
742 	error = vcpu_lock_all(sc);
743 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
744 	vm_unlock_vcpus(sc->vm);
745 
746 	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
747 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
748 		SLIST_REMOVE_HEAD(&sc->devmem, link);
749 		free(dsc->name, M_VMMDEV);
750 		free(dsc, M_VMMDEV);
751 	}
752 
753 	if (sc->cdev != NULL)
754 		destroy_dev(sc->cdev);
755 
756 	if (sc->vm != NULL)
757 		vm_destroy(sc->vm);
758 
759 	if (sc->ucred != NULL)
760 		crfree(sc->ucred);
761 
762 	if ((sc->flags & VSC_LINKED) != 0) {
763 		mtx_lock(&vmmdev_mtx);
764 		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
765 		mtx_unlock(&vmmdev_mtx);
766 	}
767 
768 	free(sc, M_VMMDEV);
769 }
770 
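/*
 * Handler for the hw.vmm.destroy sysctl (e.g. "sysctl hw.vmm.destroy=myvm"):
 * look up the named VM, clear sc->cdev to mark it as dying, then destroy the
 * devmem cdevs, the VM cdev and finally the softc itself.
 */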
771 static int
772 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
773 {
774 	struct devmem_softc *dsc;
775 	struct vmmdev_softc *sc;
776 	struct cdev *cdev;
777 	char *buf;
778 	int error, buflen;
779 
780 	error = vmm_priv_check(req->td->td_ucred);
781 	if (error)
782 		return (error);
783 
784 	buflen = VM_MAX_NAMELEN + 1;
785 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
786 	strlcpy(buf, "beavis", buflen);
787 	error = sysctl_handle_string(oidp, buf, buflen, req);
788 	if (error != 0 || req->newptr == NULL)
789 		goto out;
790 
791 	mtx_lock(&vmmdev_mtx);
792 	sc = vmmdev_lookup(buf);
793 	if (sc == NULL || sc->cdev == NULL) {
794 		mtx_unlock(&vmmdev_mtx);
795 		error = EINVAL;
796 		goto out;
797 	}
798 
799 	/*
800 	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
801 	 * is scheduled for destruction.
802 	 */
803 	cdev = sc->cdev;
804 	sc->cdev = NULL;
805 	mtx_unlock(&vmmdev_mtx);
806 
807 	/*
808 	 * Destroy all cdevs:
809 	 *
810 	 * - any new operations on the 'cdev' will return an error (ENXIO).
811 	 *
812 	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
813 	 */
814 	SLIST_FOREACH(dsc, &sc->devmem, link) {
815 		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
816 		destroy_dev(dsc->cdev);
817 		devmem_destroy(dsc);
818 	}
819 	destroy_dev(cdev);
820 	vmmdev_destroy(sc);
821 	error = 0;
822 
823 out:
824 	free(buf, M_VMMDEV);
825 	return (error);
826 }
827 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
828     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
829     NULL, 0, sysctl_vmm_destroy, "A",
830     NULL);
831 
832 static struct cdevsw vmmdevsw = {
833 	.d_name		= "vmmdev",
834 	.d_version	= D_VERSION,
835 	.d_ioctl	= vmmdev_ioctl,
836 	.d_mmap_single	= vmmdev_mmap_single,
837 	.d_read		= vmmdev_rw,
838 	.d_write	= vmmdev_rw,
839 };
840 
841 static struct vmmdev_softc *
842 vmmdev_alloc(struct vm *vm, struct ucred *cred)
843 {
844 	struct vmmdev_softc *sc;
845 
846 	sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO);
847 	SLIST_INIT(&sc->devmem);
848 	sc->vm = vm;
849 	sc->ucred = crhold(cred);
850 	return (sc);
851 }
852 
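/*
 * Create a new VM and its /dev/vmm/<name> node.  The name is checked for
 * duplicates both before the vm is created and again after the softc is
 * allocated, since vmmdev_mtx is dropped in between.
 */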
853 static int
854 vmmdev_create(const char *name, struct ucred *cred)
855 {
856 	struct cdev *cdev;
857 	struct vmmdev_softc *sc, *sc2;
858 	struct vm *vm;
859 	int error;
860 
861 	mtx_lock(&vmmdev_mtx);
862 	sc = vmmdev_lookup(name);
863 	mtx_unlock(&vmmdev_mtx);
864 	if (sc != NULL)
865 		return (EEXIST);
866 
867 	error = vm_create(name, &vm);
868 	if (error != 0)
869 		return (error);
870 
871 	sc = vmmdev_alloc(vm, cred);
872 
873 	/*
874 	 * Look up the name again just in case somebody sneaked in when we
875 	 * dropped the lock.
876 	 */
877 	mtx_lock(&vmmdev_mtx);
878 	sc2 = vmmdev_lookup(name);
879 	if (sc2 != NULL) {
880 		mtx_unlock(&vmmdev_mtx);
881 		vmmdev_destroy(sc);
882 		return (EEXIST);
883 	}
884 	sc->flags |= VSC_LINKED;
885 	SLIST_INSERT_HEAD(&head, sc, link);
886 	mtx_unlock(&vmmdev_mtx);
887 
888 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred,
889 	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", name);
890 	if (error != 0) {
891 		vmmdev_destroy(sc);
892 		return (error);
893 	}
894 
895 	mtx_lock(&vmmdev_mtx);
896 	sc->cdev = cdev;
897 	sc->cdev->si_drv1 = sc;
898 	mtx_unlock(&vmmdev_mtx);
899 
900 	return (0);
901 }
902 
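/*
 * Handler for the hw.vmm.create sysctl: writing a VM name to it (e.g.
 * "sysctl hw.vmm.create=myvm") creates that VM, which is then driven through
 * ioctls on the resulting /dev/vmm/<name> cdev.
 */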
903 static int
904 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
905 {
906 	char *buf;
907 	int error, buflen;
908 
909 	error = vmm_priv_check(req->td->td_ucred);
910 	if (error != 0)
911 		return (error);
912 
913 	buflen = VM_MAX_NAMELEN + 1;
914 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
915 	strlcpy(buf, "beavis", buflen);
916 	error = sysctl_handle_string(oidp, buf, buflen, req);
917 	if (error == 0 && req->newptr != NULL)
918 		error = vmmdev_create(buf, req->td->td_ucred);
919 	free(buf, M_VMMDEV);
920 	return (error);
921 }
922 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
923     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
924     NULL, 0, sysctl_vmm_create, "A",
925     NULL);
926 
927 void
928 vmmdev_init(void)
929 {
930 	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
931 	    "Allow use of vmm in a jail.");
932 }
933 
934 int
935 vmmdev_cleanup(void)
936 {
937 	int error;
938 
939 	if (SLIST_EMPTY(&head))
940 		error = 0;
941 	else
942 		error = EBUSY;
943 
944 	return (error);
945 }
946 
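/*
 * mmap(2) handler for a devmem cdev.  Each devmem node exposes exactly one
 * device memory segment; the requested range must fall entirely within it.
 */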
947 static int
948 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
949     struct vm_object **objp, int nprot)
950 {
951 	struct devmem_softc *dsc;
952 	vm_ooffset_t first, last;
953 	size_t seglen;
954 	int error;
955 	bool sysmem;
956 
957 	dsc = cdev->si_drv1;
958 	if (dsc == NULL) {
959 		/* 'cdev' has been created but is not ready for use */
960 		return (ENXIO);
961 	}
962 
963 	first = *offset;
964 	last = *offset + len;
965 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
966 		return (EINVAL);
967 
968 	vm_slock_memsegs(dsc->sc->vm);
969 
970 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
971 	KASSERT(error == 0 && !sysmem && *objp != NULL,
972 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
973 
974 	if (seglen >= last)
975 		vm_object_reference(*objp);
976 	else
977 		error = EINVAL;
978 
979 	vm_unlock_memsegs(dsc->sc->vm);
980 	return (error);
981 }
982 
983 static struct cdevsw devmemsw = {
984 	.d_name		= "devmem",
985 	.d_version	= D_VERSION,
986 	.d_mmap_single	= devmem_mmap_single,
987 };
988 
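/*
 * Create the /dev/vmm.io/<vm>.<segment name> node for a device memory
 * segment.  The cdev is created before vmmdev_mtx is acquired, so if the VM
 * turns out to be dying (sc->cdev == NULL) the new node is scheduled for
 * destruction again.
 */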
989 static int
990 devmem_create_cdev(const char *vmname, int segid, char *devname)
991 {
992 	struct devmem_softc *dsc;
993 	struct vmmdev_softc *sc;
994 	struct cdev *cdev;
995 	int error;
996 
997 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
998 	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
999 	if (error)
1000 		return (error);
1001 
1002 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1003 
1004 	mtx_lock(&vmmdev_mtx);
1005 	sc = vmmdev_lookup(vmname);
1006 	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
1007 	if (sc->cdev == NULL) {
1008 		/* virtual machine is being created or destroyed */
1009 		mtx_unlock(&vmmdev_mtx);
1010 		free(dsc, M_VMMDEV);
1011 		destroy_dev_sched_cb(cdev, NULL, 0);
1012 		return (ENODEV);
1013 	}
1014 
1015 	dsc->segid = segid;
1016 	dsc->name = devname;
1017 	dsc->cdev = cdev;
1018 	dsc->sc = sc;
1019 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
1020 	mtx_unlock(&vmmdev_mtx);
1021 
1022 	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
1023 	cdev->si_drv1 = dsc;
1024 	return (0);
1025 }
1026 
1027 static void
1028 devmem_destroy(void *arg)
1029 {
1030 	struct devmem_softc *dsc = arg;
1031 
1032 	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
1033 	dsc->cdev = NULL;
1034 	dsc->sc = NULL;
1035 }
1036