xref: /freebsd/sys/dev/vmm/vmm_dev.c (revision b9ef152bec6cff4cd82b68921f631bd6efb24ae6)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
6  * All rights reserved.
7  */
8 
9 #include <sys/param.h>
10 #include <sys/conf.h>
11 #include <sys/ioccom.h>
12 #include <sys/jail.h>
13 #include <sys/kernel.h>
14 #include <sys/malloc.h>
15 #include <sys/mman.h>
16 #include <sys/mutex.h>
17 #include <sys/proc.h>
18 #include <sys/queue.h>
19 #include <sys/sysctl.h>
20 #include <sys/ucred.h>
21 #include <sys/uio.h>
22 
23 #include <machine/vmm.h>
24 
25 #include <vm/vm.h>
26 #include <vm/vm_object.h>
27 
28 #include <dev/vmm/vmm_dev.h>
29 #include <dev/vmm/vmm_stat.h>
30 
31 static int devmem_create_cdev(const char *vmname, int id, char *devmem);
32 
33 struct devmem_softc {
34 	int	segid;
35 	char	*name;
36 	struct cdev *cdev;
37 	struct vmmdev_softc *sc;
38 	SLIST_ENTRY(devmem_softc) link;
39 };
40 
41 struct vmmdev_softc {
42 	struct vm	*vm;		/* vm instance cookie */
43 	struct cdev	*cdev;
44 	struct ucred	*ucred;
45 	SLIST_ENTRY(vmmdev_softc) link;
46 	SLIST_HEAD(, devmem_softc) devmem;
47 	int		flags;
48 };
49 #define	VSC_LINKED		0x01
50 
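/* List of all VM device instances, protected by vmmdev_mtx. */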
51 static SLIST_HEAD(, vmmdev_softc) head;
52 
53 static unsigned pr_allow_flag;
54 static struct mtx vmmdev_mtx;
55 MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF);
56 
57 static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
58 
59 SYSCTL_DECL(_hw_vmm);
60 
61 static void devmem_destroy(void *arg);
62 
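/*
 * vmm may be used from within a jail only if the jail has been granted
 * the "allow.vmm" permission (registered in vmmdev_init() below).
 */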
63 static int
64 vmm_priv_check(struct ucred *ucred)
65 {
66 	if (jailed(ucred) &&
67 	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
68 		return (EPERM);
69 
70 	return (0);
71 }
72 
73 static int
74 vcpu_lock_one(struct vcpu *vcpu)
75 {
76 	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
77 }
78 
79 static void
80 vcpu_unlock_one(struct vcpu *vcpu)
81 {
82 	enum vcpu_state state;
83 
84 	state = vcpu_get_state(vcpu, NULL);
85 	if (state != VCPU_FROZEN) {
86 		panic("vcpu %s(%d) has invalid state %d",
87 		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
88 	}
89 
90 	vcpu_set_state(vcpu, VCPU_IDLE, false);
91 }
92 
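/*
 * Freeze every vcpu that has been created so far, holding the VM's vcpu
 * lock shared.  On failure, any vcpus already frozen are thawed and the
 * lock is dropped.
 */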
93 static int
94 vcpu_lock_all(struct vmmdev_softc *sc)
95 {
96 	struct vcpu *vcpu;
97 	int error;
98 	uint16_t i, j, maxcpus;
99 
100 	error = 0;
101 	vm_slock_vcpus(sc->vm);
102 	maxcpus = vm_get_maxcpus(sc->vm);
103 	for (i = 0; i < maxcpus; i++) {
104 		vcpu = vm_vcpu(sc->vm, i);
105 		if (vcpu == NULL)
106 			continue;
107 		error = vcpu_lock_one(vcpu);
108 		if (error)
109 			break;
110 	}
111 
112 	if (error) {
113 		for (j = 0; j < i; j++) {
114 			vcpu = vm_vcpu(sc->vm, j);
115 			if (vcpu == NULL)
116 				continue;
117 			vcpu_unlock_one(vcpu);
118 		}
119 		vm_unlock_vcpus(sc->vm);
120 	}
121 
122 	return (error);
123 }
124 
125 static void
126 vcpu_unlock_all(struct vmmdev_softc *sc)
127 {
128 	struct vcpu *vcpu;
129 	uint16_t i, maxcpus;
130 
131 	maxcpus = vm_get_maxcpus(sc->vm);
132 	for (i = 0; i < maxcpus; i++) {
133 		vcpu = vm_vcpu(sc->vm, i);
134 		if (vcpu == NULL)
135 			continue;
136 		vcpu_unlock_one(vcpu);
137 	}
138 	vm_unlock_vcpus(sc->vm);
139 }
140 
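/*
 * Look up a VM device softc by VM name.  The caller must hold vmmdev_mtx.
 * Softcs whose creator credential is not visible to the caller (per
 * cr_cansee()) are treated as nonexistent.
 */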
141 static struct vmmdev_softc *
142 vmmdev_lookup(const char *name)
143 {
144 	struct vmmdev_softc *sc;
145 
146 	mtx_assert(&vmmdev_mtx, MA_OWNED);
147 
148 	SLIST_FOREACH(sc, &head, link) {
149 		if (strcmp(name, vm_name(sc->vm)) == 0)
150 			break;
151 	}
152 
153 	if (sc == NULL)
154 		return (NULL);
155 
156 	if (cr_cansee(curthread->td_ucred, sc->ucred))
157 		return (NULL);
158 
159 	return (sc);
160 }
161 
162 static struct vmmdev_softc *
163 vmmdev_lookup2(struct cdev *cdev)
164 {
165 	return (cdev->si_drv1);
166 }
167 
168 static int
169 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
170 {
171 	int error, off, c, prot;
172 	vm_paddr_t gpa, maxaddr;
173 	void *hpa, *cookie;
174 	struct vmmdev_softc *sc;
175 
176 	error = vmm_priv_check(curthread->td_ucred);
177 	if (error)
178 		return (error);
179 
180 	sc = vmmdev_lookup2(cdev);
181 	if (sc == NULL)
182 		return (ENXIO);
183 
184 	/*
185 	 * Get a read lock on the guest memory map.
186 	 */
187 	vm_slock_memsegs(sc->vm);
188 
189 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
190 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
191 	while (uio->uio_resid > 0 && error == 0) {
192 		gpa = uio->uio_offset;
193 		off = gpa & PAGE_MASK;
194 		c = min(uio->uio_resid, PAGE_SIZE - off);
195 
196 		/*
197 		 * The VM has a hole in its physical memory map. If we want to
198 		 * use 'dd' to inspect memory beyond the hole we need to
199 		 * provide bogus data for memory that lies in the hole.
200 		 *
201 		 * Since this device does not support lseek(2), dd(1) will
202 		 * read(2) blocks of data to simulate the lseek(2).
203 		 */
204 		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
205 		if (hpa == NULL) {
206 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
207 				error = uiomove(__DECONST(void *, zero_region),
208 				    c, uio);
209 			else
210 				error = EFAULT;
211 		} else {
212 			error = uiomove(hpa, c, uio);
213 			vm_gpa_release(cookie);
214 		}
215 	}
216 	vm_unlock_memsegs(sc->vm);
217 	return (error);
218 }
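/*
 * For example, the read/write handler above allows guest memory to be
 * inspected from the host with dd(1); a hypothetical invocation for a VM
 * named "guest0" might be:
 *
 *	dd if=/dev/vmm/guest0 bs=4k skip=256 count=1 | hexdump -C
 *
 * which dumps the page of guest physical memory at offset 1MB.
 */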
219 
220 CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);
221 
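/*
 * Copy out the length of memory segment 'mseg->segid' and, for a device
 * memory segment, the name under which its cdev was created.
 */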
222 static int
223 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
224 {
225 	struct devmem_softc *dsc;
226 	int error;
227 	bool sysmem;
228 
229 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
230 	if (error || mseg->len == 0)
231 		return (error);
232 
233 	if (!sysmem) {
234 		SLIST_FOREACH(dsc, &sc->devmem, link) {
235 			if (dsc->segid == mseg->segid)
236 				break;
237 		}
238 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
239 		    __func__, mseg->segid));
240 		error = copystr(dsc->name, mseg->name, len, NULL);
241 	} else {
242 		bzero(mseg->name, len);
243 	}
244 
245 	return (error);
246 }
247 
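/*
 * Allocate a memory segment.  A named (non-system) segment also gets a
 * devmem cdev, /dev/vmm.io/<vmname>.<segname>, through which it can be
 * mapped.
 */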
248 static int
249 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
250 {
251 	char *name;
252 	int error;
253 	bool sysmem;
254 
255 	error = 0;
256 	name = NULL;
257 	sysmem = true;
258 
259 	/*
260 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
261 	 * be stripped off when devfs processes the full string.
262 	 */
263 	if (VM_MEMSEG_NAME(mseg)) {
264 		sysmem = false;
265 		name = malloc(len, M_VMMDEV, M_WAITOK);
266 		error = copystr(mseg->name, name, len, NULL);
267 		if (error)
268 			goto done;
269 	}
270 
271 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
272 	if (error)
273 		goto done;
274 
275 	if (VM_MEMSEG_NAME(mseg)) {
276 		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
277 		if (error)
278 			vm_free_memseg(sc->vm, mseg->segid);
279 		else
280 			name = NULL;	/* freed when 'cdev' is destroyed */
281 	}
282 done:
283 	free(name, M_VMMDEV);
284 	return (error);
285 }
286 
287 static int
288 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
289     uint64_t *regval)
290 {
291 	int error, i;
292 
293 	error = 0;
294 	for (i = 0; i < count; i++) {
295 		error = vm_get_register(vcpu, regnum[i], &regval[i]);
296 		if (error)
297 			break;
298 	}
299 	return (error);
300 }
301 
302 static int
303 vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
304     uint64_t *regval)
305 {
306 	int error, i;
307 
308 	error = 0;
309 	for (i = 0; i < count; i++) {
310 		error = vm_set_register(vcpu, regnum[i], regval[i]);
311 		if (error)
312 			break;
313 	}
314 	return (error);
315 }
316 
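/*
 * Machine-independent ioctls and the locking they require.  The flags tell
 * vmmdev_ioctl() what to do before dispatching: freeze the targeted vcpu or
 * all vcpus, take the memory segment lock shared or exclusive, and/or allow
 * the targeted vcpu to be allocated on first use.
 */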
317 static const struct vmmdev_ioctl vmmdev_ioctls[] = {
318 	VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
319 	VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
320 	VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
321 	VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
322 	VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
323 	VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
324 	VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU),
325 	VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU),
326 	VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
327 
328 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
329 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_FBSD12,
330 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
331 #endif
332 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
333 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
334 	VMMDEV_IOCTL(VM_MMAP_MEMSEG,
335 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
336 	VMMDEV_IOCTL(VM_MUNMAP_MEMSEG,
337 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
338 	VMMDEV_IOCTL(VM_REINIT,
339 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
340 
341 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
342 	VMMDEV_IOCTL(VM_GET_MEMSEG_FBSD12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
343 #endif
344 	VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
345 	VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),
346 
347 	VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
348 	VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
349 
350 	VMMDEV_IOCTL(VM_SUSPEND, 0),
351 	VMMDEV_IOCTL(VM_GET_CPUS, 0),
352 	VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0),
353 	VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0),
354 };
355 
356 static int
357 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
358     struct thread *td)
359 {
360 	struct vmmdev_softc *sc;
361 	struct vcpu *vcpu;
362 	const struct vmmdev_ioctl *ioctl;
363 	int error, vcpuid;
364 
365 	error = vmm_priv_check(td->td_ucred);
366 	if (error)
367 		return (error);
368 
369 	sc = vmmdev_lookup2(cdev);
370 	if (sc == NULL)
371 		return (ENXIO);
372 
373 	ioctl = NULL;
374 	for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) {
375 		if (vmmdev_ioctls[i].cmd == cmd) {
376 			ioctl = &vmmdev_ioctls[i];
377 			break;
378 		}
379 	}
380 	if (ioctl == NULL) {
381 		for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) {
382 			if (vmmdev_machdep_ioctls[i].cmd == cmd) {
383 				ioctl = &vmmdev_machdep_ioctls[i];
384 				break;
385 			}
386 		}
387 	}
388 	if (ioctl == NULL)
389 		return (ENOTTY);
390 
391 	if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0)
392 		vm_xlock_memsegs(sc->vm);
393 	else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0)
394 		vm_slock_memsegs(sc->vm);
395 
396 	vcpu = NULL;
397 	vcpuid = -1;
398 	if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU |
399 	    VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) {
400 		vcpuid = *(int *)data;
401 		if (vcpuid == -1) {
402 			if ((ioctl->flags &
403 			    VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) {
404 				error = EINVAL;
405 				goto lockfail;
406 			}
407 		} else {
408 			vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
409 			if (vcpu == NULL) {
410 				error = EINVAL;
411 				goto lockfail;
412 			}
413 			if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) {
414 				error = vcpu_lock_one(vcpu);
415 				if (error)
416 					goto lockfail;
417 			}
418 		}
419 	}
420 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) {
421 		error = vcpu_lock_all(sc);
422 		if (error)
423 			goto lockfail;
424 	}
425 
426 	switch (cmd) {
427 	case VM_SUSPEND: {
428 		struct vm_suspend *vmsuspend;
429 
430 		vmsuspend = (struct vm_suspend *)data;
431 		error = vm_suspend(sc->vm, vmsuspend->how);
432 		break;
433 	}
434 	case VM_REINIT:
435 		error = vm_reinit(sc->vm);
436 		break;
437 	case VM_STAT_DESC: {
438 		struct vm_stat_desc *statdesc;
439 
440 		statdesc = (struct vm_stat_desc *)data;
441 		error = vmm_stat_desc_copy(statdesc->index, statdesc->desc,
442 		    sizeof(statdesc->desc));
443 		break;
444 	}
445 	case VM_STATS: {
446 		struct vm_stats *vmstats;
447 
448 		vmstats = (struct vm_stats *)data;
449 		getmicrotime(&vmstats->tv);
450 		error = vmm_stat_copy(vcpu, vmstats->index,
451 		    nitems(vmstats->statbuf), &vmstats->num_entries,
452 		    vmstats->statbuf);
453 		break;
454 	}
455 	case VM_MMAP_GETNEXT: {
456 		struct vm_memmap *mm;
457 
458 		mm = (struct vm_memmap *)data;
459 		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
460 		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
461 		break;
462 	}
463 	case VM_MMAP_MEMSEG: {
464 		struct vm_memmap *mm;
465 
466 		mm = (struct vm_memmap *)data;
467 		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
468 		    mm->len, mm->prot, mm->flags);
469 		break;
470 	}
471 	case VM_MUNMAP_MEMSEG: {
472 		struct vm_munmap *mu;
473 
474 		mu = (struct vm_munmap *)data;
475 		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
476 		break;
477 	}
478 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
479 	case VM_ALLOC_MEMSEG_FBSD12:
480 		error = alloc_memseg(sc, (struct vm_memseg *)data,
481 		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
482 		break;
483 	case VM_GET_MEMSEG_FBSD12:
484 		error = get_memseg(sc, (struct vm_memseg *)data,
485 		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
486 		break;
487 #endif
488 	case VM_ALLOC_MEMSEG:
489 		error = alloc_memseg(sc, (struct vm_memseg *)data,
490 		    sizeof(((struct vm_memseg *)0)->name));
491 		break;
492 	case VM_GET_MEMSEG:
493 		error = get_memseg(sc, (struct vm_memseg *)data,
494 		    sizeof(((struct vm_memseg *)0)->name));
495 		break;
496 	case VM_GET_REGISTER: {
497 		struct vm_register *vmreg;
498 
499 		vmreg = (struct vm_register *)data;
500 		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
501 		break;
502 	}
503 	case VM_SET_REGISTER: {
504 		struct vm_register *vmreg;
505 
506 		vmreg = (struct vm_register *)data;
507 		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
508 		break;
509 	}
510 	case VM_GET_REGISTER_SET: {
511 		struct vm_register_set *vmregset;
512 		uint64_t *regvals;
513 		int *regnums;
514 
515 		vmregset = (struct vm_register_set *)data;
516 		if (vmregset->count > VM_REG_LAST) {
517 			error = EINVAL;
518 			break;
519 		}
520 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
521 		    M_WAITOK);
522 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
523 		    M_WAITOK);
524 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
525 		    vmregset->count);
526 		if (error == 0)
527 			error = vm_get_register_set(vcpu,
528 			    vmregset->count, regnums, regvals);
529 		if (error == 0)
530 			error = copyout(regvals, vmregset->regvals,
531 			    sizeof(regvals[0]) * vmregset->count);
532 		free(regvals, M_VMMDEV);
533 		free(regnums, M_VMMDEV);
534 		break;
535 	}
536 	case VM_SET_REGISTER_SET: {
537 		struct vm_register_set *vmregset;
538 		uint64_t *regvals;
539 		int *regnums;
540 
541 		vmregset = (struct vm_register_set *)data;
542 		if (vmregset->count > VM_REG_LAST) {
543 			error = EINVAL;
544 			break;
545 		}
546 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
547 		    M_WAITOK);
548 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
549 		    M_WAITOK);
550 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
551 		    vmregset->count);
552 		if (error == 0)
553 			error = copyin(vmregset->regvals, regvals,
554 			    sizeof(regvals[0]) * vmregset->count);
555 		if (error == 0)
556 			error = vm_set_register_set(vcpu,
557 			    vmregset->count, regnums, regvals);
558 		free(regvals, M_VMMDEV);
559 		free(regnums, M_VMMDEV);
560 		break;
561 	}
562 	case VM_GET_CAPABILITY: {
563 		struct vm_capability *vmcap;
564 
565 		vmcap = (struct vm_capability *)data;
566 		error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval);
567 		break;
568 	}
569 	case VM_SET_CAPABILITY: {
570 		struct vm_capability *vmcap;
571 
572 		vmcap = (struct vm_capability *)data;
573 		error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval);
574 		break;
575 	}
576 	case VM_ACTIVATE_CPU:
577 		error = vm_activate_cpu(vcpu);
578 		break;
579 	case VM_GET_CPUS: {
580 		struct vm_cpuset *vm_cpuset;
581 		cpuset_t *cpuset;
582 		int size;
583 
584 		error = 0;
585 		vm_cpuset = (struct vm_cpuset *)data;
586 		size = vm_cpuset->cpusetsize;
587 		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
588 			error = ERANGE;
589 			break;
590 		}
591 		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
592 		    M_WAITOK | M_ZERO);
593 		if (vm_cpuset->which == VM_ACTIVE_CPUS)
594 			*cpuset = vm_active_cpus(sc->vm);
595 		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
596 			*cpuset = vm_suspended_cpus(sc->vm);
597 		else if (vm_cpuset->which == VM_DEBUG_CPUS)
598 			*cpuset = vm_debug_cpus(sc->vm);
599 		else
600 			error = EINVAL;
601 		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
602 			error = ERANGE;
603 		if (error == 0)
604 			error = copyout(cpuset, vm_cpuset->cpus, size);
605 		free(cpuset, M_TEMP);
606 		break;
607 	}
608 	case VM_SUSPEND_CPU:
609 		error = vm_suspend_cpu(sc->vm, vcpu);
610 		break;
611 	case VM_RESUME_CPU:
612 		error = vm_resume_cpu(sc->vm, vcpu);
613 		break;
614 	case VM_SET_TOPOLOGY: {
615 		struct vm_cpu_topology *topology;
616 
617 		topology = (struct vm_cpu_topology *)data;
618 		error = vm_set_topology(sc->vm, topology->sockets,
619 		    topology->cores, topology->threads, topology->maxcpus);
620 		break;
621 	}
622 	case VM_GET_TOPOLOGY: {
623 		struct vm_cpu_topology *topology;
624 
625 		topology = (struct vm_cpu_topology *)data;
626 		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
627 		    &topology->threads, &topology->maxcpus);
628 		error = 0;
629 		break;
630 	}
631 	default:
632 		error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag,
633 		    td);
634 		break;
635 	}
636 
637 	if ((ioctl->flags &
638 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
639 		vm_unlock_memsegs(sc->vm);
640 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0)
641 		vcpu_unlock_all(sc);
642 	else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0)
643 		vcpu_unlock_one(vcpu);
644 
645 	/*
646 	 * Make sure that no handler returns a kernel-internal
647 	 * error value to userspace.
648 	 */
649 	KASSERT(error == ERESTART || error >= 0,
650 	    ("vmmdev_ioctl: invalid error return %d", error));
651 	return (error);
652 
653 lockfail:
654 	if ((ioctl->flags &
655 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
656 		vm_unlock_memsegs(sc->vm);
657 	return (error);
658 }
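/*
 * A minimal userspace sketch of driving the ioctl interface above, assuming
 * the VM "guest0" already exists and that the vmm headers (e.g.
 * <machine/vmm.h> and <machine/vmm_dev.h>) supply VM_SUSPEND, struct
 * vm_suspend and VM_SUSPEND_POWEROFF; headers and error checking of open(2)
 * are omitted:
 *
 *	int fd = open("/dev/vmm/guest0", O_RDWR);
 *	struct vm_suspend vmsuspend = { .how = VM_SUSPEND_POWEROFF };
 *
 *	if (ioctl(fd, VM_SUSPEND, &vmsuspend) == -1)
 *		err(1, "VM_SUSPEND");
 */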
659 
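/*
 * Handle mmap(2) of the VM device.  The file offset is interpreted as a
 * guest physical address and resolved, via the guest memory map, to the
 * backing object of the system memory segment containing it.  Device memory
 * segments are mapped through their own cdevs instead (devmem_mmap_single()).
 */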
660 static int
661 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
662     struct vm_object **objp, int nprot)
663 {
664 	struct vmmdev_softc *sc;
665 	vm_paddr_t gpa;
666 	size_t len;
667 	vm_ooffset_t segoff, first, last;
668 	int error, found, segid;
669 	bool sysmem;
670 
671 	error = vmm_priv_check(curthread->td_ucred);
672 	if (error)
673 		return (error);
674 
675 	first = *offset;
676 	last = first + mapsize;
677 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
678 		return (EINVAL);
679 
680 	sc = vmmdev_lookup2(cdev);
681 	if (sc == NULL) {
682 		/* virtual machine is in the process of being created */
683 		return (EINVAL);
684 	}
685 
686 	/*
687 	 * Get a read lock on the guest memory map.
688 	 */
689 	vm_slock_memsegs(sc->vm);
690 
691 	gpa = 0;
692 	found = 0;
693 	while (!found) {
694 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
695 		    NULL, NULL);
696 		if (error)
697 			break;
698 
699 		if (first >= gpa && last <= gpa + len)
700 			found = 1;
701 		else
702 			gpa += len;
703 	}
704 
705 	if (found) {
706 		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
707 		KASSERT(error == 0 && *objp != NULL,
708 		    ("%s: invalid memory segment %d", __func__, segid));
709 		if (sysmem) {
710 			vm_object_reference(*objp);
711 			*offset = segoff + (first - gpa);
712 		} else {
713 			error = EINVAL;
714 		}
715 	}
716 	vm_unlock_memsegs(sc->vm);
717 	return (error);
718 }
719 
720 static void
721 vmmdev_destroy(void *arg)
722 {
723 	struct vmmdev_softc *sc = arg;
724 	struct devmem_softc *dsc;
725 	int error __diagused;
726 
727 	vm_disable_vcpu_creation(sc->vm);
728 	error = vcpu_lock_all(sc);
729 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
730 	vm_unlock_vcpus(sc->vm);
731 
732 	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
733 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
734 		SLIST_REMOVE_HEAD(&sc->devmem, link);
735 		free(dsc->name, M_VMMDEV);
736 		free(dsc, M_VMMDEV);
737 	}
738 
739 	if (sc->cdev != NULL)
740 		destroy_dev(sc->cdev);
741 
742 	if (sc->vm != NULL)
743 		vm_destroy(sc->vm);
744 
745 	if (sc->ucred != NULL)
746 		crfree(sc->ucred);
747 
748 	if ((sc->flags & VSC_LINKED) != 0) {
749 		mtx_lock(&vmmdev_mtx);
750 		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
751 		mtx_unlock(&vmmdev_mtx);
752 	}
753 
754 	free(sc, M_VMMDEV);
755 }
756 
757 static int
758 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
759 {
760 	struct devmem_softc *dsc;
761 	struct vmmdev_softc *sc;
762 	struct cdev *cdev;
763 	char *buf;
764 	int error, buflen;
765 
766 	error = vmm_priv_check(req->td->td_ucred);
767 	if (error)
768 		return (error);
769 
770 	buflen = VM_MAX_NAMELEN + 1;
771 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
772 	strlcpy(buf, "beavis", buflen);
773 	error = sysctl_handle_string(oidp, buf, buflen, req);
774 	if (error != 0 || req->newptr == NULL)
775 		goto out;
776 
777 	mtx_lock(&vmmdev_mtx);
778 	sc = vmmdev_lookup(buf);
779 	if (sc == NULL || sc->cdev == NULL) {
780 		mtx_unlock(&vmmdev_mtx);
781 		error = EINVAL;
782 		goto out;
783 	}
784 
785 	/*
786 	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
787 	 * is scheduled for destruction.
788 	 */
789 	cdev = sc->cdev;
790 	sc->cdev = NULL;
791 	mtx_unlock(&vmmdev_mtx);
792 
793 	/*
794 	 * Destroy all cdevs:
795 	 *
796 	 * - any new operations on the 'cdev' will return an error (ENXIO).
797 	 *
798 	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
799 	 */
800 	SLIST_FOREACH(dsc, &sc->devmem, link) {
801 		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
802 		destroy_dev(dsc->cdev);
803 		devmem_destroy(dsc);
804 	}
805 	destroy_dev(cdev);
806 	vmmdev_destroy(sc);
807 	error = 0;
808 
809 out:
810 	free(buf, M_VMMDEV);
811 	return (error);
812 }
813 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
814     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
815     NULL, 0, sysctl_vmm_destroy, "A",
816     NULL);
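/*
 * For example, an existing VM can be torn down with:
 *
 *	sysctl hw.vmm.destroy=guest0
 *
 * where "guest0" is a hypothetical VM name.
 */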
817 
818 static struct cdevsw vmmdevsw = {
819 	.d_name		= "vmmdev",
820 	.d_version	= D_VERSION,
821 	.d_ioctl	= vmmdev_ioctl,
822 	.d_mmap_single	= vmmdev_mmap_single,
823 	.d_read		= vmmdev_rw,
824 	.d_write	= vmmdev_rw,
825 };
826 
827 static int
828 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
829 {
830 	struct vm *vm;
831 	struct cdev *cdev;
832 	struct vmmdev_softc *sc, *sc2;
833 	char *buf;
834 	int error, buflen;
835 
836 	error = vmm_priv_check(req->td->td_ucred);
837 	if (error)
838 		return (error);
839 
840 	buflen = VM_MAX_NAMELEN + 1;
841 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
842 	strlcpy(buf, "beavis", buflen);
843 	error = sysctl_handle_string(oidp, buf, buflen, req);
844 	if (error != 0 || req->newptr == NULL)
845 		goto out;
846 
847 	mtx_lock(&vmmdev_mtx);
848 	sc = vmmdev_lookup(buf);
849 	mtx_unlock(&vmmdev_mtx);
850 	if (sc != NULL) {
851 		error = EEXIST;
852 		goto out;
853 	}
854 
855 	error = vm_create(buf, &vm);
856 	if (error != 0)
857 		goto out;
858 
859 	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
860 	sc->ucred = crhold(curthread->td_ucred);
861 	sc->vm = vm;
862 	SLIST_INIT(&sc->devmem);
863 
864 	/*
865 	 * Look up the name again just in case somebody sneaked in when we
866 	 * dropped the lock.
867 	 */
868 	mtx_lock(&vmmdev_mtx);
869 	sc2 = vmmdev_lookup(buf);
870 	if (sc2 == NULL) {
871 		SLIST_INSERT_HEAD(&head, sc, link);
872 		sc->flags |= VSC_LINKED;
873 	}
874 	mtx_unlock(&vmmdev_mtx);
875 
876 	if (sc2 != NULL) {
877 		vmmdev_destroy(sc);
878 		error = EEXIST;
879 		goto out;
880 	}
881 
882 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred,
883 	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
884 	if (error != 0) {
885 		vmmdev_destroy(sc);
886 		goto out;
887 	}
888 
889 	mtx_lock(&vmmdev_mtx);
890 	sc->cdev = cdev;
891 	sc->cdev->si_drv1 = sc;
892 	mtx_unlock(&vmmdev_mtx);
893 
894 out:
895 	free(buf, M_VMMDEV);
896 	return (error);
897 }
898 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
899     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
900     NULL, 0, sysctl_vmm_create, "A",
901     NULL);
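/*
 * For example, a new VM and its /dev/vmm/<name> node can be created with:
 *
 *	sysctl hw.vmm.create=guest0
 *
 * where "guest0" is a hypothetical VM name.
 */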
902 
903 void
904 vmmdev_init(void)
905 {
906 	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
907 	    "Allow use of vmm in a jail.");
908 }
909 
910 int
911 vmmdev_cleanup(void)
912 {
913 	int error;
914 
915 	if (SLIST_EMPTY(&head))
916 		error = 0;
917 	else
918 		error = EBUSY;
919 
920 	return (error);
921 }
922 
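/*
 * Handle mmap(2) of a devmem node (/dev/vmm.io/<vmname>.<segname>).  Offsets
 * map directly into the named device memory segment, so a mapping must fit
 * entirely within the segment.
 */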
923 static int
924 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
925     struct vm_object **objp, int nprot)
926 {
927 	struct devmem_softc *dsc;
928 	vm_ooffset_t first, last;
929 	size_t seglen;
930 	int error;
931 	bool sysmem;
932 
933 	dsc = cdev->si_drv1;
934 	if (dsc == NULL) {
935 		/* 'cdev' has been created but is not ready for use */
936 		return (ENXIO);
937 	}
938 
939 	first = *offset;
940 	last = *offset + len;
941 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
942 		return (EINVAL);
943 
944 	vm_slock_memsegs(dsc->sc->vm);
945 
946 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
947 	KASSERT(error == 0 && !sysmem && *objp != NULL,
948 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
949 
950 	if (seglen >= last)
951 		vm_object_reference(*objp);
952 	else
953 		error = EINVAL;
954 
955 	vm_unlock_memsegs(dsc->sc->vm);
956 	return (error);
957 }
958 
959 static struct cdevsw devmemsw = {
960 	.d_name		= "devmem",
961 	.d_version	= D_VERSION,
962 	.d_mmap_single	= devmem_mmap_single,
963 };
964 
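/*
 * Create the /dev/vmm.io/<vmname>.<devname> node for a device memory segment.
 * On success the devmem softc takes ownership of 'devname', which is freed
 * when the VM device is destroyed.
 */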
965 static int
966 devmem_create_cdev(const char *vmname, int segid, char *devname)
967 {
968 	struct devmem_softc *dsc;
969 	struct vmmdev_softc *sc;
970 	struct cdev *cdev;
971 	int error;
972 
973 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
974 	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
975 	if (error)
976 		return (error);
977 
978 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
979 
980 	mtx_lock(&vmmdev_mtx);
981 	sc = vmmdev_lookup(vmname);
982 	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
983 	if (sc->cdev == NULL) {
984 		/* virtual machine is being created or destroyed */
985 		mtx_unlock(&vmmdev_mtx);
986 		free(dsc, M_VMMDEV);
987 		destroy_dev_sched_cb(cdev, NULL, 0);
988 		return (ENODEV);
989 	}
990 
991 	dsc->segid = segid;
992 	dsc->name = devname;
993 	dsc->cdev = cdev;
994 	dsc->sc = sc;
995 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
996 	mtx_unlock(&vmmdev_mtx);
997 
998 	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
999 	cdev->si_drv1 = dsc;
1000 	return (0);
1001 }
1002 
1003 static void
1004 devmem_destroy(void *arg)
1005 {
1006 	struct devmem_softc *dsc = arg;
1007 
1008 	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
1009 	dsc->cdev = NULL;
1010 	dsc->sc = NULL;
1011 }
1012