xref: /freebsd/sys/dev/vmm/vmm_dev.c (revision c76c2a19ae3763d17aa6a60a5831ed24cbc16e83)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
6  * All rights reserved.
7  */
8 
9 #include <sys/param.h>
10 #include <sys/conf.h>
11 #include <sys/fcntl.h>
12 #include <sys/ioccom.h>
13 #include <sys/jail.h>
14 #include <sys/kernel.h>
15 #include <sys/malloc.h>
16 #include <sys/mman.h>
17 #include <sys/proc.h>
18 #include <sys/queue.h>
19 #include <sys/sx.h>
20 #include <sys/sysctl.h>
21 #include <sys/ucred.h>
22 #include <sys/uio.h>
23 
24 #include <machine/vmm.h>
25 
26 #include <vm/vm.h>
27 #include <vm/vm_object.h>
28 
29 #include <dev/vmm/vmm_dev.h>
30 #include <dev/vmm/vmm_mem.h>
31 #include <dev/vmm/vmm_stat.h>
32 
33 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
34 struct vm_memseg_12 {
35 	int		segid;
36 	size_t		len;
37 	char		name[64];
38 };
39 _Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI");
40 
41 #define	VM_ALLOC_MEMSEG_12	\
42 	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12)
43 #define	VM_GET_MEMSEG_12	\
44 	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12)
45 #endif
46 
47 struct devmem_softc {
48 	int	segid;
49 	char	*name;
50 	struct cdev *cdev;
51 	struct vmmdev_softc *sc;
52 	SLIST_ENTRY(devmem_softc) link;
53 };
54 
55 struct vmmdev_softc {
56 	struct vm	*vm;		/* vm instance cookie */
57 	struct cdev	*cdev;
58 	struct ucred	*ucred;
59 	SLIST_ENTRY(vmmdev_softc) link;
60 	SLIST_HEAD(, devmem_softc) devmem;
61 	int		flags;
62 };
63 
64 static SLIST_HEAD(, vmmdev_softc) head;
65 
66 static unsigned pr_allow_flag;
67 static struct sx vmmdev_mtx;
68 SX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex");
69 
70 static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
71 
72 SYSCTL_DECL(_hw_vmm);
73 
74 static void devmem_destroy(void *arg);
75 static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem);
76 
77 static int
78 vmm_priv_check(struct ucred *ucred)
79 {
80 	if (jailed(ucred) &&
81 	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
82 		return (EPERM);
83 
84 	return (0);
85 }
86 
87 static int
88 vcpu_lock_one(struct vcpu *vcpu)
89 {
90 	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
91 }
92 
93 static void
94 vcpu_unlock_one(struct vcpu *vcpu)
95 {
96 	enum vcpu_state state;
97 
98 	state = vcpu_get_state(vcpu, NULL);
99 	if (state != VCPU_FROZEN) {
100 		panic("vcpu %s(%d) has invalid state %d",
101 		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
102 	}
103 
104 	vcpu_set_state(vcpu, VCPU_IDLE, false);
105 }
106 
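/*
 * Freeze every vCPU that has been created so far.  The per-VM vCPU
 * list is locked shared for the duration; if any vCPU cannot be
 * frozen, those already frozen are thawed and the lock is dropped.
 */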
107 static int
108 vcpu_lock_all(struct vmmdev_softc *sc)
109 {
110 	struct vcpu *vcpu;
111 	int error;
112 	uint16_t i, j, maxcpus;
113 
114 	error = 0;
115 	vm_slock_vcpus(sc->vm);
116 	maxcpus = vm_get_maxcpus(sc->vm);
117 	for (i = 0; i < maxcpus; i++) {
118 		vcpu = vm_vcpu(sc->vm, i);
119 		if (vcpu == NULL)
120 			continue;
121 		error = vcpu_lock_one(vcpu);
122 		if (error)
123 			break;
124 	}
125 
126 	if (error) {
127 		for (j = 0; j < i; j++) {
128 			vcpu = vm_vcpu(sc->vm, j);
129 			if (vcpu == NULL)
130 				continue;
131 			vcpu_unlock_one(vcpu);
132 		}
133 		vm_unlock_vcpus(sc->vm);
134 	}
135 
136 	return (error);
137 }
138 
139 static void
140 vcpu_unlock_all(struct vmmdev_softc *sc)
141 {
142 	struct vcpu *vcpu;
143 	uint16_t i, maxcpus;
144 
145 	maxcpus = vm_get_maxcpus(sc->vm);
146 	for (i = 0; i < maxcpus; i++) {
147 		vcpu = vm_vcpu(sc->vm, i);
148 		if (vcpu == NULL)
149 			continue;
150 		vcpu_unlock_one(vcpu);
151 	}
152 	vm_unlock_vcpus(sc->vm);
153 }
154 
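/*
 * Look up a VM softc by name.  The caller must hold vmmdev_mtx
 * exclusively; a VM whose creator is not visible to 'cred' (per
 * cr_cansee()) is treated as if it did not exist.
 */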
155 static struct vmmdev_softc *
156 vmmdev_lookup(const char *name, struct ucred *cred)
157 {
158 	struct vmmdev_softc *sc;
159 
160 	sx_assert(&vmmdev_mtx, SA_XLOCKED);
161 
162 	SLIST_FOREACH(sc, &head, link) {
163 		if (strcmp(name, vm_name(sc->vm)) == 0)
164 			break;
165 	}
166 
167 	if (sc == NULL)
168 		return (NULL);
169 
170 	if (cr_cansee(cred, sc->ucred))
171 		return (NULL);
172 
173 	return (sc);
174 }
175 
176 static struct vmmdev_softc *
177 vmmdev_lookup2(struct cdev *cdev)
178 {
179 	return (cdev->si_drv1);
180 }
181 
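/*
 * read(2)/write(2) on /dev/vmm/<name> copies guest physical memory one
 * page at a time, using the file offset as the guest physical address.
 * Reads of unbacked addresses below the top of system memory return
 * zeroes so that dd(1) can step over holes in the guest address space.
 */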
182 static int
183 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
184 {
185 	int error, off, c, prot;
186 	vm_paddr_t gpa, maxaddr;
187 	void *hpa, *cookie;
188 	struct vmmdev_softc *sc;
189 
190 	sc = vmmdev_lookup2(cdev);
191 	if (sc == NULL)
192 		return (ENXIO);
193 
194 	/*
195 	 * Get a read lock on the guest memory map.
196 	 */
197 	vm_slock_memsegs(sc->vm);
198 
199 	error = 0;
200 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
201 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
202 	while (uio->uio_resid > 0 && error == 0) {
203 		gpa = uio->uio_offset;
204 		off = gpa & PAGE_MASK;
205 		c = min(uio->uio_resid, PAGE_SIZE - off);
206 
207 		/*
208 		 * The VM has a hole in its physical memory map. If we want to
209 		 * use 'dd' to inspect memory beyond the hole we need to
210 		 * provide bogus data for memory that lies in the hole.
211 		 *
212 		 * Since this device does not support lseek(2), dd(1) will
213 		 * read(2) blocks of data to simulate the lseek(2).
214 		 */
215 		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
216 		if (hpa == NULL) {
217 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
218 				error = uiomove(__DECONST(void *, zero_region),
219 				    c, uio);
220 			else
221 				error = EFAULT;
222 		} else {
223 			error = uiomove(hpa, c, uio);
224 			vm_gpa_release(cookie);
225 		}
226 	}
227 	vm_unlock_memsegs(sc->vm);
228 	return (error);
229 }
230 
231 CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);
232 
233 static int
234 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
235 {
236 	struct devmem_softc *dsc;
237 	int error;
238 	bool sysmem;
239 
240 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
241 	if (error || mseg->len == 0)
242 		return (error);
243 
244 	if (!sysmem) {
245 		SLIST_FOREACH(dsc, &sc->devmem, link) {
246 			if (dsc->segid == mseg->segid)
247 				break;
248 		}
249 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
250 		    __func__, mseg->segid));
251 		error = copystr(dsc->name, mseg->name, len, NULL);
252 	} else {
253 		bzero(mseg->name, len);
254 	}
255 
256 	return (error);
257 }
258 
259 static int
260 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
261 {
262 	char *name;
263 	int error;
264 	bool sysmem;
265 
266 	error = 0;
267 	name = NULL;
268 	sysmem = true;
269 
270 	/*
271 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
272 	 * be stripped off when devfs processes the full string.
273 	 */
274 	if (VM_MEMSEG_NAME(mseg)) {
275 		sysmem = false;
276 		name = malloc(len, M_VMMDEV, M_WAITOK);
277 		error = copystr(mseg->name, name, len, NULL);
278 		if (error)
279 			goto done;
280 	}
281 
282 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
283 	if (error)
284 		goto done;
285 
286 	if (VM_MEMSEG_NAME(mseg)) {
287 		error = devmem_create_cdev(sc, mseg->segid, name);
288 		if (error)
289 			vm_free_memseg(sc->vm, mseg->segid);
290 		else
291 			name = NULL;	/* freed when 'cdev' is destroyed */
292 	}
293 done:
294 	free(name, M_VMMDEV);
295 	return (error);
296 }
297 
298 static int
299 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
300     uint64_t *regval)
301 {
302 	int error, i;
303 
304 	error = 0;
305 	for (i = 0; i < count; i++) {
306 		error = vm_get_register(vcpu, regnum[i], &regval[i]);
307 		if (error)
308 			break;
309 	}
310 	return (error);
311 }
312 
313 static int
314 vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
315     uint64_t *regval)
316 {
317 	int error, i;
318 
319 	error = 0;
320 	for (i = 0; i < count; i++) {
321 		error = vm_set_register(vcpu, regnum[i], regval[i]);
322 		if (error)
323 			break;
324 	}
325 	return (error);
326 }
327 
328 static int
329 vmmdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
330 {
331 	int error;
332 
333 	/*
334 	 * A jail without vmm access shouldn't be able to access vmm device
335 	 * files at all, but check here just to be thorough.
336 	 */
337 	error = vmm_priv_check(td->td_ucred);
338 	if (error != 0)
339 		return (error);
340 
341 	return (0);
342 }
343 
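/*
 * Machine-independent ioctls dispatched by vmmdev_ioctl().  The flags
 * on each entry tell the dispatcher which locks to acquire before the
 * handler runs: a shared or exclusive lock on the memory segment list
 * and/or freezing one or all vCPUs.
 */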
344 static const struct vmmdev_ioctl vmmdev_ioctls[] = {
345 	VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
346 	VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
347 	VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
348 	VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
349 	VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
350 	VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
351 	VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU),
352 	VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU),
353 	VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
354 
355 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
356 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12,
357 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
358 #endif
359 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
360 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
361 	VMMDEV_IOCTL(VM_MMAP_MEMSEG,
362 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
363 	VMMDEV_IOCTL(VM_MUNMAP_MEMSEG,
364 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
365 	VMMDEV_IOCTL(VM_REINIT,
366 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
367 
368 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
369 	VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
370 #endif
371 	VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
372 	VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),
373 
374 	VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
375 	VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
376 
377 	VMMDEV_IOCTL(VM_SUSPEND, 0),
378 	VMMDEV_IOCTL(VM_GET_CPUS, 0),
379 	VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0),
380 	VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0),
381 };
382 
383 static int
384 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
385     struct thread *td)
386 {
387 	struct vmmdev_softc *sc;
388 	struct vcpu *vcpu;
389 	const struct vmmdev_ioctl *ioctl;
390 	int error, vcpuid;
391 
392 	sc = vmmdev_lookup2(cdev);
393 	if (sc == NULL)
394 		return (ENXIO);
395 
396 	ioctl = NULL;
397 	for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) {
398 		if (vmmdev_ioctls[i].cmd == cmd) {
399 			ioctl = &vmmdev_ioctls[i];
400 			break;
401 		}
402 	}
403 	if (ioctl == NULL) {
404 		for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) {
405 			if (vmmdev_machdep_ioctls[i].cmd == cmd) {
406 				ioctl = &vmmdev_machdep_ioctls[i];
407 				break;
408 			}
409 		}
410 	}
411 	if (ioctl == NULL)
412 		return (ENOTTY);
413 
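	/*
	 * Acquire the locks requested by the ioctl descriptor before
	 * dispatching: memory segment locks first, then vCPU locks.
	 */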
414 	if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0)
415 		vm_xlock_memsegs(sc->vm);
416 	else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0)
417 		vm_slock_memsegs(sc->vm);
418 
419 	vcpu = NULL;
420 	vcpuid = -1;
421 	if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU |
422 	    VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) {
423 		vcpuid = *(int *)data;
424 		if (vcpuid == -1) {
425 			if ((ioctl->flags &
426 			    VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) {
427 				error = EINVAL;
428 				goto lockfail;
429 			}
430 		} else {
431 			vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
432 			if (vcpu == NULL) {
433 				error = EINVAL;
434 				goto lockfail;
435 			}
436 			if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) {
437 				error = vcpu_lock_one(vcpu);
438 				if (error)
439 					goto lockfail;
440 			}
441 		}
442 	}
443 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) {
444 		error = vcpu_lock_all(sc);
445 		if (error)
446 			goto lockfail;
447 	}
448 
449 	switch (cmd) {
450 	case VM_SUSPEND: {
451 		struct vm_suspend *vmsuspend;
452 
453 		vmsuspend = (struct vm_suspend *)data;
454 		error = vm_suspend(sc->vm, vmsuspend->how);
455 		break;
456 	}
457 	case VM_REINIT:
458 		error = vm_reinit(sc->vm);
459 		break;
460 	case VM_STAT_DESC: {
461 		struct vm_stat_desc *statdesc;
462 
463 		statdesc = (struct vm_stat_desc *)data;
464 		error = vmm_stat_desc_copy(statdesc->index, statdesc->desc,
465 		    sizeof(statdesc->desc));
466 		break;
467 	}
468 	case VM_STATS: {
469 		struct vm_stats *vmstats;
470 
471 		vmstats = (struct vm_stats *)data;
472 		getmicrotime(&vmstats->tv);
473 		error = vmm_stat_copy(vcpu, vmstats->index,
474 		    nitems(vmstats->statbuf), &vmstats->num_entries,
475 		    vmstats->statbuf);
476 		break;
477 	}
478 	case VM_MMAP_GETNEXT: {
479 		struct vm_memmap *mm;
480 
481 		mm = (struct vm_memmap *)data;
482 		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
483 		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
484 		break;
485 	}
486 	case VM_MMAP_MEMSEG: {
487 		struct vm_memmap *mm;
488 
489 		mm = (struct vm_memmap *)data;
490 		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
491 		    mm->len, mm->prot, mm->flags);
492 		break;
493 	}
494 	case VM_MUNMAP_MEMSEG: {
495 		struct vm_munmap *mu;
496 
497 		mu = (struct vm_munmap *)data;
498 		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
499 		break;
500 	}
501 #if defined(__amd64__) && defined(COMPAT_FREEBSD12)
502 	case VM_ALLOC_MEMSEG_12:
503 		error = alloc_memseg(sc, (struct vm_memseg *)data,
504 		    sizeof(((struct vm_memseg_12 *)0)->name));
505 		break;
506 	case VM_GET_MEMSEG_12:
507 		error = get_memseg(sc, (struct vm_memseg *)data,
508 		    sizeof(((struct vm_memseg_12 *)0)->name));
509 		break;
510 #endif
511 	case VM_ALLOC_MEMSEG:
512 		error = alloc_memseg(sc, (struct vm_memseg *)data,
513 		    sizeof(((struct vm_memseg *)0)->name));
514 		break;
515 	case VM_GET_MEMSEG:
516 		error = get_memseg(sc, (struct vm_memseg *)data,
517 		    sizeof(((struct vm_memseg *)0)->name));
518 		break;
519 	case VM_GET_REGISTER: {
520 		struct vm_register *vmreg;
521 
522 		vmreg = (struct vm_register *)data;
523 		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
524 		break;
525 	}
526 	case VM_SET_REGISTER: {
527 		struct vm_register *vmreg;
528 
529 		vmreg = (struct vm_register *)data;
530 		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
531 		break;
532 	}
533 	case VM_GET_REGISTER_SET: {
534 		struct vm_register_set *vmregset;
535 		uint64_t *regvals;
536 		int *regnums;
537 
538 		vmregset = (struct vm_register_set *)data;
539 		if (vmregset->count > VM_REG_LAST) {
540 			error = EINVAL;
541 			break;
542 		}
543 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
544 		    M_WAITOK);
545 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
546 		    M_WAITOK);
547 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
548 		    vmregset->count);
549 		if (error == 0)
550 			error = vm_get_register_set(vcpu,
551 			    vmregset->count, regnums, regvals);
552 		if (error == 0)
553 			error = copyout(regvals, vmregset->regvals,
554 			    sizeof(regvals[0]) * vmregset->count);
555 		free(regvals, M_VMMDEV);
556 		free(regnums, M_VMMDEV);
557 		break;
558 	}
559 	case VM_SET_REGISTER_SET: {
560 		struct vm_register_set *vmregset;
561 		uint64_t *regvals;
562 		int *regnums;
563 
564 		vmregset = (struct vm_register_set *)data;
565 		if (vmregset->count > VM_REG_LAST) {
566 			error = EINVAL;
567 			break;
568 		}
569 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
570 		    M_WAITOK);
571 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
572 		    M_WAITOK);
573 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
574 		    vmregset->count);
575 		if (error == 0)
576 			error = copyin(vmregset->regvals, regvals,
577 			    sizeof(regvals[0]) * vmregset->count);
578 		if (error == 0)
579 			error = vm_set_register_set(vcpu,
580 			    vmregset->count, regnums, regvals);
581 		free(regvals, M_VMMDEV);
582 		free(regnums, M_VMMDEV);
583 		break;
584 	}
585 	case VM_GET_CAPABILITY: {
586 		struct vm_capability *vmcap;
587 
588 		vmcap = (struct vm_capability *)data;
589 		error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval);
590 		break;
591 	}
592 	case VM_SET_CAPABILITY: {
593 		struct vm_capability *vmcap;
594 
595 		vmcap = (struct vm_capability *)data;
596 		error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval);
597 		break;
598 	}
599 	case VM_ACTIVATE_CPU:
600 		error = vm_activate_cpu(vcpu);
601 		break;
602 	case VM_GET_CPUS: {
603 		struct vm_cpuset *vm_cpuset;
604 		cpuset_t *cpuset;
605 		int size;
606 
607 		error = 0;
608 		vm_cpuset = (struct vm_cpuset *)data;
609 		size = vm_cpuset->cpusetsize;
610 		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
611 			error = ERANGE;
612 			break;
613 		}
614 		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
615 		    M_WAITOK | M_ZERO);
616 		if (vm_cpuset->which == VM_ACTIVE_CPUS)
617 			*cpuset = vm_active_cpus(sc->vm);
618 		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
619 			*cpuset = vm_suspended_cpus(sc->vm);
620 		else if (vm_cpuset->which == VM_DEBUG_CPUS)
621 			*cpuset = vm_debug_cpus(sc->vm);
622 		else
623 			error = EINVAL;
624 		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
625 			error = ERANGE;
626 		if (error == 0)
627 			error = copyout(cpuset, vm_cpuset->cpus, size);
628 		free(cpuset, M_TEMP);
629 		break;
630 	}
631 	case VM_SUSPEND_CPU:
632 		error = vm_suspend_cpu(sc->vm, vcpu);
633 		break;
634 	case VM_RESUME_CPU:
635 		error = vm_resume_cpu(sc->vm, vcpu);
636 		break;
637 	case VM_SET_TOPOLOGY: {
638 		struct vm_cpu_topology *topology;
639 
640 		topology = (struct vm_cpu_topology *)data;
641 		error = vm_set_topology(sc->vm, topology->sockets,
642 		    topology->cores, topology->threads, topology->maxcpus);
643 		break;
644 	}
645 	case VM_GET_TOPOLOGY: {
646 		struct vm_cpu_topology *topology;
647 
648 		topology = (struct vm_cpu_topology *)data;
649 		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
650 		    &topology->threads, &topology->maxcpus);
651 		error = 0;
652 		break;
653 	}
654 	default:
655 		error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag,
656 		    td);
657 		break;
658 	}
659 
660 	if ((ioctl->flags &
661 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
662 		vm_unlock_memsegs(sc->vm);
663 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0)
664 		vcpu_unlock_all(sc);
665 	else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0)
666 		vcpu_unlock_one(vcpu);
667 
668 	/*
669 	 * Make sure that no handler returns a kernel-internal
670 	 * error value to userspace.
671 	 */
672 	KASSERT(error == ERESTART || error >= 0,
673 	    ("vmmdev_ioctl: invalid error return %d", error));
674 	return (error);
675 
676 lockfail:
677 	if ((ioctl->flags &
678 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
679 		vm_unlock_memsegs(sc->vm);
680 	return (error);
681 }
682 
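/*
 * mmap(2) on /dev/vmm/<name> interprets the file offset as a guest
 * physical address.  The request must lie entirely within one mapping
 * of a system memory segment; the backing VM object is returned with
 * an additional reference for the new mapping.
 */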
683 static int
684 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
685     struct vm_object **objp, int nprot)
686 {
687 	struct vmmdev_softc *sc;
688 	vm_paddr_t gpa;
689 	size_t len;
690 	vm_ooffset_t segoff, first, last;
691 	int error, found, segid;
692 	bool sysmem;
693 
694 	first = *offset;
695 	last = first + mapsize;
696 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
697 		return (EINVAL);
698 
699 	sc = vmmdev_lookup2(cdev);
700 	if (sc == NULL) {
701 		/* virtual machine is in the process of being created */
702 		return (EINVAL);
703 	}
704 
705 	/*
706 	 * Get a read lock on the guest memory map.
707 	 */
708 	vm_slock_memsegs(sc->vm);
709 
710 	gpa = 0;
711 	found = 0;
712 	while (!found) {
713 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
714 		    NULL, NULL);
715 		if (error)
716 			break;
717 
718 		if (first >= gpa && last <= gpa + len)
719 			found = 1;
720 		else
721 			gpa += len;
722 	}
723 
724 	if (found) {
725 		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
726 		KASSERT(error == 0 && *objp != NULL,
727 		    ("%s: invalid memory segment %d", __func__, segid));
728 		if (sysmem) {
729 			vm_object_reference(*objp);
730 			*offset = segoff + (first - gpa);
731 		} else {
732 			error = EINVAL;
733 		}
734 	}
735 	vm_unlock_memsegs(sc->vm);
736 	return (error);
737 }
738 
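/*
 * Final teardown of a VM: destroy its devmem cdevs, block further vCPU
 * creation, freeze all vCPUs, free the devmem bookkeeping, destroy the
 * VM itself and unlink the softc from the global list.
 */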
739 static void
740 vmmdev_destroy(struct vmmdev_softc *sc)
741 {
742 	struct devmem_softc *dsc;
743 	int error __diagused;
744 
745 	KASSERT(sc->cdev == NULL, ("%s: cdev not free", __func__));
746 
747 	/*
748 	 * Destroy all cdevs:
749 	 *
750 	 * - any new operations on the 'cdev' will return an error (ENXIO).
751 	 *
752 	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
753 	 */
754 	SLIST_FOREACH(dsc, &sc->devmem, link) {
755 		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
756 		devmem_destroy(dsc);
757 	}
758 
759 	vm_disable_vcpu_creation(sc->vm);
760 	error = vcpu_lock_all(sc);
761 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
762 	vm_unlock_vcpus(sc->vm);
763 
764 	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
765 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
766 		SLIST_REMOVE_HEAD(&sc->devmem, link);
767 		free(dsc->name, M_VMMDEV);
768 		free(dsc, M_VMMDEV);
769 	}
770 
771 	if (sc->vm != NULL)
772 		vm_destroy(sc->vm);
773 
774 	if (sc->ucred != NULL)
775 		crfree(sc->ucred);
776 
777 	sx_xlock(&vmmdev_mtx);
778 	SLIST_REMOVE(&head, sc, vmmdev_softc, link);
779 	sx_xunlock(&vmmdev_mtx);
780 	free(sc, M_VMMDEV);
781 }
782 
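/*
 * Look up a VM by name and destroy it.  Clearing sc->cdev under
 * vmmdev_mtx marks the VM as scheduled for destruction so that a
 * concurrent destroy of the same VM fails with EINVAL.
 */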
783 static int
784 vmmdev_lookup_and_destroy(const char *name, struct ucred *cred)
785 {
786 	struct cdev *cdev;
787 	struct vmmdev_softc *sc;
788 
789 	sx_xlock(&vmmdev_mtx);
790 	sc = vmmdev_lookup(name, cred);
791 	if (sc == NULL || sc->cdev == NULL) {
792 		sx_xunlock(&vmmdev_mtx);
793 		return (EINVAL);
794 	}
795 
796 	/*
797 	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
798 	 * is scheduled for destruction.
799 	 */
800 	cdev = sc->cdev;
801 	sc->cdev = NULL;
802 	sx_xunlock(&vmmdev_mtx);
803 
804 	destroy_dev(cdev);
805 	vmmdev_destroy(sc);
806 
807 	return (0);
808 }
809 
810 static int
811 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
812 {
813 	char *buf;
814 	int error, buflen;
815 
816 	error = vmm_priv_check(req->td->td_ucred);
817 	if (error)
818 		return (error);
819 
820 	buflen = VM_MAX_NAMELEN + 1;
821 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
822 	strlcpy(buf, "beavis", buflen);
823 	error = sysctl_handle_string(oidp, buf, buflen, req);
824 	if (error == 0 && req->newptr != NULL)
825 		error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred);
826 	free(buf, M_VMMDEV);
827 	return (error);
828 }
829 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
830     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
831     NULL, 0, sysctl_vmm_destroy, "A",
832     NULL);
833 
834 static struct cdevsw vmmdevsw = {
835 	.d_name		= "vmmdev",
836 	.d_version	= D_VERSION,
837 	.d_open		= vmmdev_open,
838 	.d_ioctl	= vmmdev_ioctl,
839 	.d_mmap_single	= vmmdev_mmap_single,
840 	.d_read		= vmmdev_rw,
841 	.d_write	= vmmdev_rw,
842 };
843 
844 static struct vmmdev_softc *
845 vmmdev_alloc(struct vm *vm, struct ucred *cred)
846 {
847 	struct vmmdev_softc *sc;
848 
849 	sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO);
850 	SLIST_INIT(&sc->devmem);
851 	sc->vm = vm;
852 	sc->ucred = crhold(cred);
853 	return (sc);
854 }
855 
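/*
 * Create a new VM and its /dev/vmm/<name> node.  vmmdev_mtx is held
 * across the name lookup and device creation so that two racing
 * creates of the same name cannot both succeed.
 */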
856 static int
857 vmmdev_create(const char *name, struct ucred *cred)
858 {
859 	struct make_dev_args mda;
860 	struct cdev *cdev;
861 	struct vmmdev_softc *sc;
862 	struct vm *vm;
863 	int error;
864 
865 	sx_xlock(&vmmdev_mtx);
866 	sc = vmmdev_lookup(name, cred);
867 	if (sc != NULL) {
868 		sx_xunlock(&vmmdev_mtx);
869 		return (EEXIST);
870 	}
871 
872 	error = vm_create(name, &vm);
873 	if (error != 0) {
874 		sx_xunlock(&vmmdev_mtx);
875 		return (error);
876 	}
877 	sc = vmmdev_alloc(vm, cred);
878 	SLIST_INSERT_HEAD(&head, sc, link);
879 
880 	make_dev_args_init(&mda);
881 	mda.mda_devsw = &vmmdevsw;
882 	mda.mda_cr = sc->ucred;
883 	mda.mda_uid = UID_ROOT;
884 	mda.mda_gid = GID_WHEEL;
885 	mda.mda_mode = 0600;
886 	mda.mda_si_drv1 = sc;
887 	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
888 	error = make_dev_s(&mda, &cdev, "vmm/%s", name);
889 	if (error != 0) {
890 		sx_xunlock(&vmmdev_mtx);
891 		vmmdev_destroy(sc);
892 		return (error);
893 	}
894 	sc->cdev = cdev;
895 	sx_xunlock(&vmmdev_mtx);
896 	return (0);
897 }
898 
899 static int
900 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
901 {
902 	char *buf;
903 	int error, buflen;
904 
905 	error = vmm_priv_check(req->td->td_ucred);
906 	if (error != 0)
907 		return (error);
908 
909 	buflen = VM_MAX_NAMELEN + 1;
910 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
911 	strlcpy(buf, "beavis", buflen);
912 	error = sysctl_handle_string(oidp, buf, buflen, req);
913 	if (error == 0 && req->newptr != NULL)
914 		error = vmmdev_create(buf, req->td->td_ucred);
915 	free(buf, M_VMMDEV);
916 	return (error);
917 }
918 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
919     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
920     NULL, 0, sysctl_vmm_create, "A",
921     NULL);
922 
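/*
 * /dev/vmmctl must be opened for writing and provides the
 * VMMCTL_VM_CREATE and VMMCTL_VM_DESTROY ioctls, which parallel the
 * hw.vmm.create and hw.vmm.destroy sysctls above.
 */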
923 static int
924 vmmctl_open(struct cdev *cdev, int flags, int fmt, struct thread *td)
925 {
926 	int error;
927 
928 	error = vmm_priv_check(td->td_ucred);
929 	if (error != 0)
930 		return (error);
931 
932 	if ((flags & FWRITE) == 0)
933 		return (EPERM);
934 
935 	return (0);
936 }
937 
938 static int
939 vmmctl_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
940     struct thread *td)
941 {
942 	int error;
943 
944 	switch (cmd) {
945 	case VMMCTL_VM_CREATE: {
946 		struct vmmctl_vm_create *vmc;
947 
948 		vmc = (struct vmmctl_vm_create *)data;
949 		vmc->name[VM_MAX_NAMELEN] = '\0';
950 		for (size_t i = 0; i < nitems(vmc->reserved); i++) {
951 			if (vmc->reserved[i] != 0) {
952 				error = EINVAL;
953 				return (error);
954 			}
955 		}
956 
957 		error = vmmdev_create(vmc->name, td->td_ucred);
958 		break;
959 	}
960 	case VMMCTL_VM_DESTROY: {
961 		struct vmmctl_vm_destroy *vmd;
962 
963 		vmd = (struct vmmctl_vm_destroy *)data;
964 		vmd->name[VM_MAX_NAMELEN] = '\0';
965 		for (size_t i = 0; i < nitems(vmd->reserved); i++) {
966 			if (vmd->reserved[i] != 0) {
967 				error = EINVAL;
968 				return (error);
969 			}
970 		}
971 
972 		error = vmmdev_lookup_and_destroy(vmd->name, td->td_ucred);
973 		break;
974 	}
975 	default:
976 		error = ENOTTY;
977 		break;
978 	}
979 
980 	return (error);
981 }
982 
983 static struct cdev *vmmctl_cdev;
984 static struct cdevsw vmmctlsw = {
985 	.d_name		= "vmmctl",
986 	.d_version	= D_VERSION,
987 	.d_open		= vmmctl_open,
988 	.d_ioctl	= vmmctl_ioctl,
989 };
990 
991 int
992 vmmdev_init(void)
993 {
994 	int error;
995 
996 	sx_xlock(&vmmdev_mtx);
997 	error = make_dev_p(MAKEDEV_CHECKNAME, &vmmctl_cdev, &vmmctlsw, NULL,
998 	    UID_ROOT, GID_WHEEL, 0600, "vmmctl");
999 	if (error == 0)
1000 		pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
1001 		    "Allow use of vmm in a jail.");
1002 	sx_xunlock(&vmmdev_mtx);
1003 
1004 	return (error);
1005 }
1006 
1007 int
1008 vmmdev_cleanup(void)
1009 {
1010 	sx_xlock(&vmmdev_mtx);
1011 	if (!SLIST_EMPTY(&head)) {
1012 		sx_xunlock(&vmmdev_mtx);
1013 		return (EBUSY);
1014 	}
1015 	if (vmmctl_cdev != NULL) {
1016 		destroy_dev(vmmctl_cdev);
1017 		vmmctl_cdev = NULL;
1018 	}
1019 	sx_xunlock(&vmmdev_mtx);
1020 
1021 	return (0);
1022 }
1023 
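/*
 * mmap(2) on a devmem cdev (/dev/vmm.io/<vm>.<segment>) maps a range
 * of that device memory segment.  The request must fit within the
 * segment; the backing VM object is returned with a new reference.
 */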
1024 static int
1025 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
1026     struct vm_object **objp, int nprot)
1027 {
1028 	struct devmem_softc *dsc;
1029 	vm_ooffset_t first, last;
1030 	size_t seglen;
1031 	int error;
1032 	bool sysmem;
1033 
1034 	dsc = cdev->si_drv1;
1035 	if (dsc == NULL) {
1036 		/* 'cdev' has been created but is not ready for use */
1037 		return (ENXIO);
1038 	}
1039 
1040 	first = *offset;
1041 	last = *offset + len;
1042 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
1043 		return (EINVAL);
1044 
1045 	vm_slock_memsegs(dsc->sc->vm);
1046 
1047 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
1048 	KASSERT(error == 0 && !sysmem && *objp != NULL,
1049 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
1050 
1051 	if (seglen >= last)
1052 		vm_object_reference(*objp);
1053 	else
1054 		error = EINVAL;
1055 
1056 	vm_unlock_memsegs(dsc->sc->vm);
1057 	return (error);
1058 }
1059 
1060 static struct cdevsw devmemsw = {
1061 	.d_name		= "devmem",
1062 	.d_version	= D_VERSION,
1063 	.d_mmap_single	= devmem_mmap_single,
1064 };
1065 
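/*
 * Create the /dev/vmm.io/<vm>.<name> cdev for a devmem segment and
 * link its softc onto the VM's devmem list.  On failure the name
 * buffer is freed here; on success it is owned by the devmem softc
 * and freed when the VM is destroyed.
 */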
1066 static int
1067 devmem_create_cdev(struct vmmdev_softc *sc, int segid, char *devname)
1068 {
1069 	struct make_dev_args mda;
1070 	struct devmem_softc *dsc;
1071 	int error;
1072 
1073 	sx_xlock(&vmmdev_mtx);
1074 
1075 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1076 	dsc->segid = segid;
1077 	dsc->name = devname;
1078 	dsc->sc = sc;
1079 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
1080 
1081 	make_dev_args_init(&mda);
1082 	mda.mda_devsw = &devmemsw;
1083 	mda.mda_cr = sc->ucred;
1084 	mda.mda_uid = UID_ROOT;
1085 	mda.mda_gid = GID_WHEEL;
1086 	mda.mda_mode = 0600;
1087 	mda.mda_si_drv1 = dsc;
1088 	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1089 	error = make_dev_s(&mda, &dsc->cdev, "vmm.io/%s.%s", vm_name(sc->vm),
1090 	    devname);
1091 	if (error != 0) {
1092 		SLIST_REMOVE(&sc->devmem, dsc, devmem_softc, link);
1093 		free(dsc->name, M_VMMDEV);
1094 		free(dsc, M_VMMDEV);
1095 	}
1096 
1097 	sx_xunlock(&vmmdev_mtx);
1098 
1099 	return (error);
1100 }
1101 
1102 static void
1103 devmem_destroy(void *arg)
1104 {
1105 	struct devmem_softc *dsc = arg;
1106 
1107 	destroy_dev(dsc->cdev);
1108 	dsc->cdev = NULL;
1109 	dsc->sc = NULL;
1110 }
1111