xref: /freebsd/sys/dev/vmm/vmm_dev.c (revision 72ea8f41e19d3e028dd4ab3f9102240e215dbc6d)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
 * All rights reserved.
 */

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/ioccom.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/ucred.h>
#include <sys/uio.h>

#include <machine/vmm.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <dev/vmm/vmm_dev.h>
#include <dev/vmm/vmm_stat.h>

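/*
 * FreeBSD 12 used a fixed 64-byte name in 'struct vm_memseg'; keep
 * compatible ioctl definitions so old amd64 binaries continue to work.
 */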
#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
struct vm_memseg_fbsd12 {
	int		segid;
	size_t		len;
	char		name[64];
};
_Static_assert(sizeof(struct vm_memseg_fbsd12) == 80, "COMPAT_FREEBSD12 ABI");

#define	VM_ALLOC_MEMSEG_FBSD12	\
	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_fbsd12)
#define	VM_GET_MEMSEG_FBSD12	\
	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_fbsd12)
#endif

struct devmem_softc {
	int	segid;
	char	*name;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	struct ucred	*ucred;
	SLIST_ENTRY(vmmdev_softc) link;
	SLIST_HEAD(, devmem_softc) devmem;
	int		flags;
};
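/* 'flags' bit: the softc has been linked onto the global 'head' list below. */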
#define	VSC_LINKED		0x01

static SLIST_HEAD(, vmmdev_softc) head;

static unsigned pr_allow_flag;
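/*
 * Protects the global list of VM softcs as well as the linkage between a
 * softc and its cdevs.
 */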
static struct mtx vmmdev_mtx;
MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF);

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static void devmem_destroy(void *arg);
static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem);

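/*
 * Deny access from within a jail unless the jail has been granted the
 * "vmm" allow permission (see vmmdev_init() below).
 */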
static int
vmm_priv_check(struct ucred *ucred)
{
	if (jailed(ucred) &&
	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
		return (EPERM);

	return (0);
}

static int
vcpu_lock_one(struct vcpu *vcpu)
{
	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
}

static void
vcpu_unlock_one(struct vcpu *vcpu)
{
	enum vcpu_state state;

	state = vcpu_get_state(vcpu, NULL);
	if (state != VCPU_FROZEN) {
		panic("vcpu %s(%d) has invalid state %d",
		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
	}

	vcpu_set_state(vcpu, VCPU_IDLE, false);
}

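/*
 * Freeze every vCPU that has been created so far.  On failure, any vCPUs
 * that were already frozen are unlocked again before returning.
 */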
static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
	struct vcpu *vcpu;
	int error;
	uint16_t i, j, maxcpus;

	error = 0;
	vm_slock_vcpus(sc->vm);
	maxcpus = vm_get_maxcpus(sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(sc->vm, i);
		if (vcpu == NULL)
			continue;
		error = vcpu_lock_one(vcpu);
		if (error)
			break;
	}

	if (error) {
		for (j = 0; j < i; j++) {
			vcpu = vm_vcpu(sc->vm, j);
			if (vcpu == NULL)
				continue;
			vcpu_unlock_one(vcpu);
		}
		vm_unlock_vcpus(sc->vm);
	}

	return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
	struct vcpu *vcpu;
	uint16_t i, maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(sc->vm, i);
		if (vcpu == NULL)
			continue;
		vcpu_unlock_one(vcpu);
	}
	vm_unlock_vcpus(sc->vm);
}

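/*
 * Find the softc for the named VM.  The lookup fails unless the caller's
 * credentials can "see" the VM's owner (cr_cansee()), e.g. a jailed caller
 * cannot see VMs belonging to other jails.
 */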
static struct vmmdev_softc *
vmmdev_lookup(const char *name, struct ucred *cred)
{
	struct vmmdev_softc *sc;

	mtx_assert(&vmmdev_mtx, MA_OWNED);

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	if (sc == NULL)
		return (NULL);

	if (cr_cansee(cred, sc->ucred))
		return (NULL);

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{
	return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa, maxaddr;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	maxaddr = vmm_sysmem_maxaddr(sc->vm);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole, we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate lseek(2).
		 */
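		/*
		 * For example (illustrative invocation, not required usage):
		 *	dd if=/dev/vmm/<vmname> bs=4k | hexdump -C
		 * reads sequentially through both mapped memory and holes.
		 */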
		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
				error = uiomove(__DECONST(void *, zero_region),
				    c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);

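/*
 * Report the length of a memory segment and, for a devmem segment, the name
 * under which its cdev is exposed.
 */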
static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		SLIST_FOREACH(dsc, &sc->devmem, link) {
			if (dsc->segid == mseg->segid)
				break;
		}
		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
		    __func__, mseg->segid));
		error = copystr(dsc->name, mseg->name, len, NULL);
	} else {
		bzero(mseg->name, len);
	}

	return (error);
}

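/*
 * Allocate a memory segment.  A named segment is device memory backed by its
 * own cdev (see devmem_create_cdev()); an unnamed segment is system memory.
 */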
static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	/*
	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
	 * be stripped off when devfs processes the full string.
	 */
	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
		name = malloc(len, M_VMMDEV, M_WAITOK);
		error = copystr(mseg->name, name, len, NULL);
		if (error)
			goto done;
	}

	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
	if (error)
		goto done;

	if (VM_MEMSEG_NAME(mseg)) {
		error = devmem_create_cdev(sc, mseg->segid, name);
		if (error)
			vm_free_memseg(sc->vm, mseg->segid);
		else
			name = NULL;	/* freed when 'cdev' is destroyed */
	}
done:
	free(name, M_VMMDEV);
	return (error);
}

static int
vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_get_register(vcpu, regnum[i], &regval[i]);
		if (error)
			break;
	}
	return (error);
}

static int
vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_set_register(vcpu, regnum[i], regval[i]);
		if (error)
			break;
	}
	return (error);
}

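/*
 * Table of generic ioctls and the memseg/vCPU locking each one requires.
 * vmmdev_ioctl() takes the locks indicated by 'flags' before dispatching.
 */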
static const struct vmmdev_ioctl vmmdev_ioctls[] = {
	VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),

#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_FBSD12,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
#endif
	VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
	VMMDEV_IOCTL(VM_MMAP_MEMSEG,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
	VMMDEV_IOCTL(VM_MUNMAP_MEMSEG,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
	VMMDEV_IOCTL(VM_REINIT,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),

#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
	VMMDEV_IOCTL(VM_GET_MEMSEG_FBSD12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
#endif
	VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
	VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),

	VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
	VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),

	VMMDEV_IOCTL(VM_SUSPEND, 0),
	VMMDEV_IOCTL(VM_GET_CPUS, 0),
	VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0),
	VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0),
};

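/*
 * Main ioctl handler for /dev/vmm/<vmname>: check privileges, look up the
 * command in the generic table and then the machine-dependent one, acquire
 * the locks it requires, and dispatch.
 */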
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{
	struct vmmdev_softc *sc;
	struct vcpu *vcpu;
	const struct vmmdev_ioctl *ioctl;
	int error, vcpuid;

	error = vmm_priv_check(td->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	ioctl = NULL;
	for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) {
		if (vmmdev_ioctls[i].cmd == cmd) {
			ioctl = &vmmdev_ioctls[i];
			break;
		}
	}
	if (ioctl == NULL) {
		for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) {
			if (vmmdev_machdep_ioctls[i].cmd == cmd) {
				ioctl = &vmmdev_machdep_ioctls[i];
				break;
			}
		}
	}
	if (ioctl == NULL)
		return (ENOTTY);

	if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0)
		vm_xlock_memsegs(sc->vm);
	else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0)
		vm_slock_memsegs(sc->vm);

	vcpu = NULL;
	vcpuid = -1;
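	/*
	 * Ioctls that operate on a vCPU pass the vcpuid as the first field
	 * of their argument structure.  A vcpuid of -1 is accepted only for
	 * commands flagged MAYBE_ALLOC_VCPU, which typically treat a NULL
	 * vcpu as "all vCPUs".
	 */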
	if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU |
	    VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) {
		vcpuid = *(int *)data;
		if (vcpuid == -1) {
			if ((ioctl->flags &
			    VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) {
				error = EINVAL;
				goto lockfail;
			}
		} else {
			vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
			if (vcpu == NULL) {
				error = EINVAL;
				goto lockfail;
			}
			if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) {
				error = vcpu_lock_one(vcpu);
				if (error)
					goto lockfail;
			}
		}
	}
	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) {
		error = vcpu_lock_all(sc);
		if (error)
			goto lockfail;
	}

	switch (cmd) {
	case VM_SUSPEND: {
		struct vm_suspend *vmsuspend;

		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	}
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	case VM_STAT_DESC: {
		struct vm_stat_desc *statdesc;

		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index, statdesc->desc,
		    sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		struct vm_stats *vmstats;

		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(vcpu, vmstats->index,
		    nitems(vmstats->statbuf), &vmstats->num_entries,
		    vmstats->statbuf);
		break;
	}
	case VM_MMAP_GETNEXT: {
		struct vm_memmap *mm;

		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	}
	case VM_MMAP_MEMSEG: {
		struct vm_memmap *mm;

		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
	}
	case VM_MUNMAP_MEMSEG: {
		struct vm_munmap *mu;

		mu = (struct vm_munmap *)data;
		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
		break;
	}
#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
	case VM_ALLOC_MEMSEG_FBSD12:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
		break;
	case VM_GET_MEMSEG_FBSD12:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
		break;
#endif
	case VM_ALLOC_MEMSEG:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
	case VM_GET_REGISTER: {
		struct vm_register *vmreg;

		vmreg = (struct vm_register *)data;
		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
		break;
	}
	case VM_SET_REGISTER: {
		struct vm_register *vmreg;

		vmreg = (struct vm_register *)data;
		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
		break;
	}
	case VM_GET_REGISTER_SET: {
		struct vm_register_set *vmregset;
		uint64_t *regvals;
		int *regnums;

		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = vm_get_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		if (error == 0)
			error = copyout(regvals, vmregset->regvals,
			    sizeof(regvals[0]) * vmregset->count);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	}
	case VM_SET_REGISTER_SET: {
		struct vm_register_set *vmregset;
		uint64_t *regvals;
		int *regnums;

		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = copyin(vmregset->regvals, regvals,
			    sizeof(regvals[0]) * vmregset->count);
		if (error == 0)
			error = vm_set_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	}
	case VM_GET_CAPABILITY: {
		struct vm_capability *vmcap;

		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval);
		break;
	}
	case VM_SET_CAPABILITY: {
		struct vm_capability *vmcap;

		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval);
		break;
	}
	case VM_ACTIVATE_CPU:
		error = vm_activate_cpu(vcpu);
		break;
	case VM_GET_CPUS: {
		struct vm_cpuset *vm_cpuset;
		cpuset_t *cpuset;
		int size;

		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
		    M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else if (vm_cpuset->which == VM_DEBUG_CPUS)
			*cpuset = vm_debug_cpus(sc->vm);
		else
			error = EINVAL;
		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
			error = ERANGE;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	}
	case VM_SUSPEND_CPU:
		error = vm_suspend_cpu(sc->vm, vcpu);
		break;
	case VM_RESUME_CPU:
		error = vm_resume_cpu(sc->vm, vcpu);
		break;
	case VM_SET_TOPOLOGY: {
		struct vm_cpu_topology *topology;

		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	}
	case VM_GET_TOPOLOGY: {
		struct vm_cpu_topology *topology;

		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
	}
	default:
		error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag,
		    td);
		break;
	}

	if ((ioctl->flags &
	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
		vm_unlock_memsegs(sc->vm);
	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0)
		vcpu_unlock_all(sc);
	else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0)
		vcpu_unlock_one(vcpu);

	/*
	 * Make sure that no handler returns a kernel-internal
	 * error value to userspace.
	 */
	KASSERT(error == ERESTART || error >= 0,
	    ("vmmdev_ioctl: invalid error return %d", error));
	return (error);

lockfail:
	if ((ioctl->flags &
	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
		vm_unlock_memsegs(sc->vm);
	return (error);
}

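/*
 * mmap(2) handler for /dev/vmm/<vmname>: the file offset is a guest physical
 * address, which must fall entirely within a single system-memory mapping;
 * the backing VM object of that segment is returned.
 */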
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
	struct vmmdev_softc *sc;
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff, first, last;
	int error, found, segid;
	bool sysmem;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	first = *offset;
	last = first + mapsize;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		/* virtual machine is in the process of being created */
		return (EINVAL);
	}

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	gpa = 0;
	found = 0;
	while (!found) {
		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
		    NULL, NULL);
		if (error)
			break;

		if (first >= gpa && last <= gpa + len)
			found = 1;
		else
			gpa += len;
	}

	if (found) {
		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
		KASSERT(error == 0 && *objp != NULL,
		    ("%s: invalid memory segment %d", __func__, segid));
		if (sysmem) {
			vm_object_reference(*objp);
			*offset = segoff + (first - gpa);
		} else {
			error = EINVAL;
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}

static void
vmmdev_destroy(struct vmmdev_softc *sc)
{
	struct devmem_softc *dsc;
	int error __diagused;

	/*
	 * Destroy all cdevs:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		destroy_dev(dsc->cdev);
		devmem_destroy(dsc);
	}

	vm_disable_vcpu_creation(sc->vm);
	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
	vm_unlock_vcpus(sc->vm);

	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if (sc->ucred != NULL)
		crfree(sc->ucred);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

static int
vmmdev_lookup_and_destroy(const char *name, struct ucred *cred)
{
	struct cdev *cdev;
	struct vmmdev_softc *sc;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(name, cred);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (EINVAL);
	}

	/*
	 * Setting 'sc->cdev' to NULL indicates that the VM is scheduled
	 * for destruction.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	destroy_dev(cdev);
	vmmdev_destroy(sc);

	return (0);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error == 0 && req->newptr != NULL)
		error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred);
	free(buf, M_VMMDEV);
	return (error);
}
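/*
 * Writing a VM name to this node destroys that VM; an illustrative invocation
 * would be "sysctl hw.vmm.destroy=<vmname>".
 */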
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_destroy, "A",
    NULL);

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

static struct vmmdev_softc *
vmmdev_alloc(struct vm *vm, struct ucred *cred)
{
	struct vmmdev_softc *sc;

	sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO);
	SLIST_INIT(&sc->devmem);
	sc->vm = vm;
	sc->ucred = crhold(cred);
	return (sc);
}

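/*
 * Create a new VM and its /dev/vmm/<name> cdev.  The name is checked for
 * uniqueness before creating the VM and again, under vmmdev_mtx, before the
 * new softc is linked onto the global list.
 */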
static int
vmmdev_create(const char *name, struct ucred *cred)
{
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	struct vm *vm;
	int error;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(name, cred);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL)
		return (EEXIST);

	error = vm_create(name, &vm);
	if (error != 0)
		return (error);

	sc = vmmdev_alloc(vm, cred);

	/*
	 * Look up the name again just in case somebody sneaked in while the
	 * lock was dropped.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(name, cred);
	if (sc2 != NULL) {
		mtx_unlock(&vmmdev_mtx);
		vmmdev_destroy(sc);
		return (EEXIST);
	}
	sc->flags |= VSC_LINKED;
	SLIST_INSERT_HEAD(&head, sc, link);
	mtx_unlock(&vmmdev_mtx);

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred,
	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", name);
	if (error != 0) {
		vmmdev_destroy(sc);
		return (error);
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

	return (0);
}

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error != 0)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error == 0 && req->newptr != NULL)
		error = vmmdev_create(buf, req->td->td_ucred);
	free(buf, M_VMMDEV);
	return (error);
}
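/*
 * Writing a VM name to this node creates that VM; an illustrative invocation
 * would be "sysctl hw.vmm.create=<vmname>".
 */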
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_create, "A",
    NULL);

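/*
 * Register the per-jail "vmm" allow permission consulted by vmm_priv_check().
 */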
void
vmmdev_init(void)
{
	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
	    "Allow use of vmm in a jail.");
}

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}

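/*
 * mmap(2) handler for a devmem cdev (/dev/vmm.io/<vmname>.<segname>): the
 * requested range must lie within the device memory segment, whose VM object
 * backs the mapping directly.
 */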
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	vm_slock_memsegs(dsc->sc->vm);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	if (seglen >= last)
		vm_object_reference(*objp);
	else
		error = EINVAL;

	vm_unlock_memsegs(dsc->sc->vm);
	return (error);
}

static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};

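/*
 * Create the /dev/vmm.io/<vmname>.<segname> cdev for a device memory segment.
 * The cdev is created before vmmdev_mtx is taken; if the VM turns out to be
 * in the middle of creation or destruction, the new cdev is scheduled for
 * destruction and ENODEV is returned.
 */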
static int
devmem_create_cdev(struct vmmdev_softc *sc, int segid, char *devname)
{
	struct devmem_softc *dsc;
	struct cdev *cdev;
	const char *vmname;
	int error;

	vmname = vm_name(sc->vm);

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, sc->ucred,
	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
	if (error)
		return (error);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

	mtx_lock(&vmmdev_mtx);
	if (sc->cdev == NULL) {
		/* virtual machine is being created or destroyed */
		mtx_unlock(&vmmdev_mtx);
		free(dsc, M_VMMDEV);
		destroy_dev_sched_cb(cdev, NULL, 0);
		return (ENODEV);
	}

	dsc->segid = segid;
	dsc->name = devname;
	dsc->cdev = cdev;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
	mtx_unlock(&vmmdev_mtx);

	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
	cdev->si_drv1 = dsc;
	return (0);
}

static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
	dsc->cdev = NULL;
	dsc->sc = NULL;
}
1037