xref: /freebsd/sys/dev/vmm/vmm_dev.c (revision 08345e62a28eb971f65125f45c512a12b39ee2eb)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
6  * All rights reserved.
7  */
8 
9 #include <sys/param.h>
10 #include <sys/conf.h>
11 #include <sys/fcntl.h>
12 #include <sys/ioccom.h>
13 #include <sys/jail.h>
14 #include <sys/kernel.h>
15 #include <sys/malloc.h>
16 #include <sys/mman.h>
17 #include <sys/proc.h>
18 #include <sys/queue.h>
19 #include <sys/sx.h>
20 #include <sys/sysctl.h>
21 #include <sys/ucred.h>
22 #include <sys/uio.h>
23 
24 #include <machine/vmm.h>
25 
26 #include <vm/vm.h>
27 #include <vm/vm_object.h>
28 
29 #include <dev/vmm/vmm_dev.h>
30 #include <dev/vmm/vmm_mem.h>
31 #include <dev/vmm/vmm_stat.h>
32 
33 #ifdef __amd64__
34 #ifdef COMPAT_FREEBSD12
35 struct vm_memseg_12 {
36 	int		segid;
37 	size_t		len;
38 	char		name[64];
39 };
40 _Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI");
41 
42 #define	VM_ALLOC_MEMSEG_12	\
43 	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12)
44 #define	VM_GET_MEMSEG_12	\
45 	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12)
46 #endif /* COMPAT_FREEBSD12 */
47 #ifdef COMPAT_FREEBSD14
48 struct vm_memseg_14 {
49 	int		segid;
50 	size_t		len;
51 	char		name[VM_MAX_SUFFIXLEN + 1];
52 };
53 _Static_assert(sizeof(struct vm_memseg_14) == (VM_MAX_SUFFIXLEN + 1 + 16),
54     "COMPAT_FREEBSD14 ABI");
55 
56 #define	VM_ALLOC_MEMSEG_14	\
57 	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_14)
58 #define	VM_GET_MEMSEG_14	\
59 	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_14)
60 #endif /* COMPAT_FREEBSD14 */
61 #endif /* __amd64__ */
62 
63 struct devmem_softc {
64 	int	segid;
65 	char	*name;
66 	struct cdev *cdev;
67 	struct vmmdev_softc *sc;
68 	SLIST_ENTRY(devmem_softc) link;
69 };
70 
71 struct vmmdev_softc {
72 	struct vm	*vm;		/* vm instance cookie */
73 	struct cdev	*cdev;
74 	struct ucred	*ucred;
75 	SLIST_ENTRY(vmmdev_softc) link;
76 	SLIST_HEAD(, devmem_softc) devmem;
77 	int		flags;
78 };
79 
80 static SLIST_HEAD(, vmmdev_softc) head;
81 
82 static unsigned pr_allow_flag;
83 static struct sx vmmdev_mtx;
84 SX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex");
85 
86 static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
87 
88 SYSCTL_DECL(_hw_vmm);
89 
90 static void devmem_destroy(void *arg);
91 static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem);
92 
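/*
 * Deny access from a jailed credential unless the jail grants the vmm
 * allow permission registered by vmmdev_init() (tracked in pr_allow_flag).
 */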
93 static int
94 vmm_priv_check(struct ucred *ucred)
95 {
96 	if (jailed(ucred) &&
97 	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
98 		return (EPERM);
99 
100 	return (0);
101 }
102 
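/*
 * Freeze a single vCPU, failing if the transition to VCPU_FROZEN is not
 * permitted.  vcpu_unlock_one() moves the vCPU back to VCPU_IDLE and
 * panics if it was not frozen as expected.
 */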
103 static int
104 vcpu_lock_one(struct vcpu *vcpu)
105 {
106 	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
107 }
108 
109 static void
110 vcpu_unlock_one(struct vcpu *vcpu)
111 {
112 	enum vcpu_state state;
113 
114 	state = vcpu_get_state(vcpu, NULL);
115 	if (state != VCPU_FROZEN) {
116 		panic("vcpu %s(%d) has invalid state %d",
117 		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
118 	}
119 
120 	vcpu_set_state(vcpu, VCPU_IDLE, false);
121 }
122 
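/*
 * Freeze every vCPU in the VM, unwinding the vCPUs already frozen if one
 * of them cannot be locked.  (On amd64 this helper is compiled out; a
 * machine-dependent implementation is presumably provided elsewhere.)
 */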
123 #ifndef __amd64__
124 static int
125 vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate)
126 {
127 	struct vcpu *vcpu;
128 	int error;
129 	uint16_t i, j, maxcpus;
130 
131 	error = 0;
132 	maxcpus = vm_get_maxcpus(vm);
133 	for (i = 0; i < maxcpus; i++) {
134 		vcpu = vm_vcpu(vm, i);
135 		if (vcpu == NULL)
136 			continue;
137 		error = vcpu_lock_one(vcpu);
138 		if (error)
139 			break;
140 	}
141 
142 	if (error) {
143 		for (j = 0; j < i; j++) {
144 			vcpu = vm_vcpu(vm, j);
145 			if (vcpu == NULL)
146 				continue;
147 			vcpu_unlock_one(vcpu);
148 		}
149 	}
150 
151 	return (error);
152 }
153 #endif
154 
155 static int
156 vcpu_lock_all(struct vmmdev_softc *sc)
157 {
158 	int error;
159 
160 	/*
161 	 * Serialize vcpu_lock_all() callers.  Individual vCPUs are not locked
162 	 * in a consistent order so we need to serialize to avoid deadlocks.
163 	 */
164 	vm_lock_vcpus(sc->vm);
165 	error = vcpu_set_state_all(sc->vm, VCPU_FROZEN);
166 	if (error != 0)
167 		vm_unlock_vcpus(sc->vm);
168 	return (error);
169 }
170 
171 static void
172 vcpu_unlock_all(struct vmmdev_softc *sc)
173 {
174 	struct vcpu *vcpu;
175 	uint16_t i, maxcpus;
176 
177 	maxcpus = vm_get_maxcpus(sc->vm);
178 	for (i = 0; i < maxcpus; i++) {
179 		vcpu = vm_vcpu(sc->vm, i);
180 		if (vcpu == NULL)
181 			continue;
182 		vcpu_unlock_one(vcpu);
183 	}
184 	vm_unlock_vcpus(sc->vm);
185 }
186 
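/*
 * Look up a VM softc by name; the caller must hold the global vmmdev lock
 * exclusively.  The result is filtered through cr_cansee() so a credential
 * cannot find VMs whose owner it is not allowed to see.
 */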
187 static struct vmmdev_softc *
188 vmmdev_lookup(const char *name, struct ucred *cred)
189 {
190 	struct vmmdev_softc *sc;
191 
192 	sx_assert(&vmmdev_mtx, SA_XLOCKED);
193 
194 	SLIST_FOREACH(sc, &head, link) {
195 		if (strcmp(name, vm_name(sc->vm)) == 0)
196 			break;
197 	}
198 
199 	if (sc == NULL)
200 		return (NULL);
201 
202 	if (cr_cansee(cred, sc->ucred))
203 		return (NULL);
204 
205 	return (sc);
206 }
207 
208 static struct vmmdev_softc *
209 vmmdev_lookup2(struct cdev *cdev)
210 {
211 	return (cdev->si_drv1);
212 }
213 
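/*
 * read(2)/write(2) handler for /dev/vmm/<name>: copy data between the
 * caller and guest physical memory one page at a time.  Reads from holes
 * below the top of guest system memory return zeroes; other accesses to
 * unbacked addresses fail with EFAULT.
 */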
214 static int
215 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
216 {
217 	int error, off, c, prot;
218 	vm_paddr_t gpa, maxaddr;
219 	void *hpa, *cookie;
220 	struct vmmdev_softc *sc;
221 
222 	sc = vmmdev_lookup2(cdev);
223 	if (sc == NULL)
224 		return (ENXIO);
225 
226 	/*
227 	 * Get a read lock on the guest memory map.
228 	 */
229 	vm_slock_memsegs(sc->vm);
230 
231 	error = 0;
232 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
233 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
234 	while (uio->uio_resid > 0 && error == 0) {
235 		gpa = uio->uio_offset;
236 		off = gpa & PAGE_MASK;
237 		c = min(uio->uio_resid, PAGE_SIZE - off);
238 
239 		/*
240 		 * The VM has a hole in its physical memory map. If we want to
241 		 * use 'dd' to inspect memory beyond the hole we need to
242 		 * provide bogus data for memory that lies in the hole.
243 		 *
244 		 * Since this device does not support lseek(2), dd(1) will
245 		 * read(2) blocks of data to simulate the lseek(2).
246 		 */
247 		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
248 		if (hpa == NULL) {
249 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
250 				error = uiomove(__DECONST(void *, zero_region),
251 				    c, uio);
252 			else
253 				error = EFAULT;
254 		} else {
255 			error = uiomove(hpa, c, uio);
256 			vm_gpa_release(cookie);
257 		}
258 	}
259 	vm_unlock_memsegs(sc->vm);
260 	return (error);
261 }
262 
263 CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);
264 
265 static int
266 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
267 {
268 	struct devmem_softc *dsc;
269 	int error;
270 	bool sysmem;
271 
272 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
273 	if (error || mseg->len == 0)
274 		return (error);
275 
276 	if (!sysmem) {
277 		SLIST_FOREACH(dsc, &sc->devmem, link) {
278 			if (dsc->segid == mseg->segid)
279 				break;
280 		}
281 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
282 		    __func__, mseg->segid));
283 		error = copystr(dsc->name, mseg->name, len, NULL);
284 	} else {
285 		bzero(mseg->name, len);
286 	}
287 
288 	return (error);
289 }
290 
291 static int
292 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len,
293     struct domainset *domainset)
294 {
295 	char *name;
296 	int error;
297 	bool sysmem;
298 
299 	error = 0;
300 	name = NULL;
301 	sysmem = true;
302 
303 	/*
304 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
305 	 * be stripped off when devfs processes the full string.
306 	 */
307 	if (VM_MEMSEG_NAME(mseg)) {
308 		sysmem = false;
309 		name = malloc(len, M_VMMDEV, M_WAITOK);
310 		error = copystr(mseg->name, name, len, NULL);
311 		if (error)
312 			goto done;
313 	}
314 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem, domainset);
315 	if (error)
316 		goto done;
317 
318 	if (VM_MEMSEG_NAME(mseg)) {
319 		error = devmem_create_cdev(sc, mseg->segid, name);
320 		if (error)
321 			vm_free_memseg(sc->vm, mseg->segid);
322 		else
323 			name = NULL;	/* freed when 'cdev' is destroyed */
324 	}
325 done:
326 	free(name, M_VMMDEV);
327 	return (error);
328 }
329 
330 #if defined(__amd64__) && \
331     (defined(COMPAT_FREEBSD14) || defined(COMPAT_FREEBSD12))
332 /*
333  * Translate pre-15.0 memory segment identifiers into their 15.0 counterparts.
334  */
335 static void
336 adjust_segid(struct vm_memseg *mseg)
337 {
338 	if (mseg->segid != VM_SYSMEM) {
339 		mseg->segid += (VM_BOOTROM - 1);
340 	}
341 }
342 #endif
343 
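/*
 * Bulk register accessors backing VM_GET_REGISTER_SET and
 * VM_SET_REGISTER_SET: process 'count' registers, stopping at the first
 * error.
 */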
344 static int
345 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
346     uint64_t *regval)
347 {
348 	int error, i;
349 
350 	error = 0;
351 	for (i = 0; i < count; i++) {
352 		error = vm_get_register(vcpu, regnum[i], &regval[i]);
353 		if (error)
354 			break;
355 	}
356 	return (error);
357 }
358 
359 static int
360 vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
361     uint64_t *regval)
362 {
363 	int error, i;
364 
365 	error = 0;
366 	for (i = 0; i < count; i++) {
367 		error = vm_set_register(vcpu, regnum[i], regval[i]);
368 		if (error)
369 			break;
370 	}
371 	return (error);
372 }
373 
374 static int
375 vmmdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
376 {
377 	int error;
378 
379 	/*
380 	 * A jail without vmm access shouldn't be able to access vmm device
381 	 * files at all, but check here just to be thorough.
382 	 */
383 	error = vmm_priv_check(td->td_ucred);
384 	if (error != 0)
385 		return (error);
386 
387 	return (0);
388 }
389 
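/*
 * Machine-independent ioctl commands and the locking each one requires.
 * The flags tell vmmdev_ioctl() whether to lock one vCPU, all vCPUs,
 * and/or the memory segment list (shared or exclusive) before dispatching
 * the command.
 */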
390 static const struct vmmdev_ioctl vmmdev_ioctls[] = {
391 	VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
392 	VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
393 	VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
394 	VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
395 	VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
396 	VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
397 	VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU),
398 	VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU),
399 	VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
400 	VMMDEV_IOCTL(VM_STAT_DESC, 0),
401 
402 #ifdef __amd64__
403 #ifdef COMPAT_FREEBSD12
404 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12,
405 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
406 #endif
407 #ifdef COMPAT_FREEBSD14
408 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_14,
409 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
410 #endif
411 #endif /* __amd64__ */
412 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
413 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
414 	VMMDEV_IOCTL(VM_MMAP_MEMSEG,
415 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
416 	VMMDEV_IOCTL(VM_MUNMAP_MEMSEG,
417 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
418 	VMMDEV_IOCTL(VM_REINIT,
419 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
420 
421 #ifdef __amd64__
422 #if defined(COMPAT_FREEBSD12)
423 	VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
424 #endif
425 #ifdef COMPAT_FREEBSD14
426 	VMMDEV_IOCTL(VM_GET_MEMSEG_14, VMMDEV_IOCTL_SLOCK_MEMSEGS),
427 #endif
428 #endif /* __amd64__ */
429 	VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
430 	VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),
431 
432 	VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
433 	VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
434 
435 	VMMDEV_IOCTL(VM_SUSPEND, 0),
436 	VMMDEV_IOCTL(VM_GET_CPUS, 0),
437 	VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0),
438 	VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0),
439 };
440 
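/*
 * Main ioctl dispatcher: look the command up in the MI table above,
 * falling back to the machine-dependent table, acquire the vCPU and
 * memseg locks its descriptor requests, handle the command, then drop the
 * locks again.
 */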
441 static int
442 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
443     struct thread *td)
444 {
445 	struct vmmdev_softc *sc;
446 	struct vcpu *vcpu;
447 	const struct vmmdev_ioctl *ioctl;
448 	struct vm_memseg *mseg;
449 	int error, vcpuid;
450 
451 	sc = vmmdev_lookup2(cdev);
452 	if (sc == NULL)
453 		return (ENXIO);
454 
455 	ioctl = NULL;
456 	for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) {
457 		if (vmmdev_ioctls[i].cmd == cmd) {
458 			ioctl = &vmmdev_ioctls[i];
459 			break;
460 		}
461 	}
462 	if (ioctl == NULL) {
463 		for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) {
464 			if (vmmdev_machdep_ioctls[i].cmd == cmd) {
465 				ioctl = &vmmdev_machdep_ioctls[i];
466 				break;
467 			}
468 		}
469 	}
470 	if (ioctl == NULL)
471 		return (ENOTTY);
472 
473 	if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0)
474 		vm_xlock_memsegs(sc->vm);
475 	else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0)
476 		vm_slock_memsegs(sc->vm);
477 
478 	vcpu = NULL;
479 	vcpuid = -1;
480 	if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU |
481 	    VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) {
482 		vcpuid = *(int *)data;
483 		if (vcpuid == -1) {
484 			if ((ioctl->flags &
485 			    VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) {
486 				error = EINVAL;
487 				goto lockfail;
488 			}
489 		} else {
490 			vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
491 			if (vcpu == NULL) {
492 				error = EINVAL;
493 				goto lockfail;
494 			}
495 			if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) {
496 				error = vcpu_lock_one(vcpu);
497 				if (error)
498 					goto lockfail;
499 			}
500 		}
501 	}
502 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) {
503 		error = vcpu_lock_all(sc);
504 		if (error)
505 			goto lockfail;
506 	}
507 
508 	switch (cmd) {
509 	case VM_SUSPEND: {
510 		struct vm_suspend *vmsuspend;
511 
512 		vmsuspend = (struct vm_suspend *)data;
513 		error = vm_suspend(sc->vm, vmsuspend->how);
514 		break;
515 	}
516 	case VM_REINIT:
517 		error = vm_reinit(sc->vm);
518 		break;
519 	case VM_STAT_DESC: {
520 		struct vm_stat_desc *statdesc;
521 
522 		statdesc = (struct vm_stat_desc *)data;
523 		error = vmm_stat_desc_copy(statdesc->index, statdesc->desc,
524 		    sizeof(statdesc->desc));
525 		break;
526 	}
527 	case VM_STATS: {
528 		struct vm_stats *vmstats;
529 
530 		vmstats = (struct vm_stats *)data;
531 		getmicrotime(&vmstats->tv);
532 		error = vmm_stat_copy(vcpu, vmstats->index,
533 		    nitems(vmstats->statbuf), &vmstats->num_entries,
534 		    vmstats->statbuf);
535 		break;
536 	}
537 	case VM_MMAP_GETNEXT: {
538 		struct vm_memmap *mm;
539 
540 		mm = (struct vm_memmap *)data;
541 		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
542 		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
543 		break;
544 	}
545 	case VM_MMAP_MEMSEG: {
546 		struct vm_memmap *mm;
547 
548 		mm = (struct vm_memmap *)data;
549 		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
550 		    mm->len, mm->prot, mm->flags);
551 		break;
552 	}
553 	case VM_MUNMAP_MEMSEG: {
554 		struct vm_munmap *mu;
555 
556 		mu = (struct vm_munmap *)data;
557 		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
558 		break;
559 	}
560 #ifdef __amd64__
561 #ifdef COMPAT_FREEBSD12
562 	case VM_ALLOC_MEMSEG_12:
563 		mseg = (struct vm_memseg *)data;
564 
565 		adjust_segid(mseg);
566 		error = alloc_memseg(sc, mseg,
567 		    sizeof(((struct vm_memseg_12 *)0)->name), NULL);
568 		break;
569 	case VM_GET_MEMSEG_12:
570 		mseg = (struct vm_memseg *)data;
571 
572 		adjust_segid(mseg);
573 		error = get_memseg(sc, mseg,
574 		    sizeof(((struct vm_memseg_12 *)0)->name));
575 		break;
576 #endif /* COMPAT_FREEBSD12 */
577 #ifdef COMPAT_FREEBSD14
578 	case VM_ALLOC_MEMSEG_14:
579 		mseg = (struct vm_memseg *)data;
580 
581 		adjust_segid(mseg);
582 		error = alloc_memseg(sc, mseg,
583 		    sizeof(((struct vm_memseg_14 *)0)->name), NULL);
584 		break;
585 	case VM_GET_MEMSEG_14:
586 		mseg = (struct vm_memseg *)data;
587 
588 		adjust_segid(mseg);
589 		error = get_memseg(sc, mseg,
590 		    sizeof(((struct vm_memseg_14 *)0)->name));
591 		break;
592 #endif /* COMPAT_FREEBSD14 */
593 #endif /* __amd64__ */
594 	case VM_ALLOC_MEMSEG: {
595 		domainset_t *mask;
596 		struct domainset *domainset, domain;
597 
598 		domainset = NULL;
599 		mseg = (struct vm_memseg *)data;
600 		if (mseg->ds_policy != DOMAINSET_POLICY_INVALID && mseg->ds_mask != NULL) {
601 			if (mseg->ds_mask_size < sizeof(domainset_t) ||
602 			    mseg->ds_mask_size > DOMAINSET_MAXSIZE / NBBY) {
603 				error = ERANGE;
604 				break;
605 			}
606 			memset(&domain, 0, sizeof(domain));
607 			mask = malloc(mseg->ds_mask_size, M_VMMDEV, M_WAITOK);
608 			error = copyin(mseg->ds_mask, mask, mseg->ds_mask_size);
609 			if (error) {
610 				free(mask, M_VMMDEV);
611 				break;
612 			}
613 			error = domainset_populate(&domain, mask, mseg->ds_policy,
614 			    mseg->ds_mask_size);
615 			if (error) {
616 				free(mask, M_VMMDEV);
617 				break;
618 			}
619 			domainset = domainset_create(&domain);
620 			if (domainset == NULL) {
621 				error = EINVAL;
622 				free(mask, M_VMMDEV);
623 				break;
624 			}
625 			free(mask, M_VMMDEV);
626 		}
627 		error = alloc_memseg(sc, mseg, sizeof(mseg->name), domainset);
628 
629 		break;
630 	}
631 	case VM_GET_MEMSEG:
632 		error = get_memseg(sc, (struct vm_memseg *)data,
633 		    sizeof(((struct vm_memseg *)0)->name));
634 		break;
635 	case VM_GET_REGISTER: {
636 		struct vm_register *vmreg;
637 
638 		vmreg = (struct vm_register *)data;
639 		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
640 		break;
641 	}
642 	case VM_SET_REGISTER: {
643 		struct vm_register *vmreg;
644 
645 		vmreg = (struct vm_register *)data;
646 		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
647 		break;
648 	}
649 	case VM_GET_REGISTER_SET: {
650 		struct vm_register_set *vmregset;
651 		uint64_t *regvals;
652 		int *regnums;
653 
654 		vmregset = (struct vm_register_set *)data;
655 		if (vmregset->count > VM_REG_LAST) {
656 			error = EINVAL;
657 			break;
658 		}
659 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
660 		    M_WAITOK);
661 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
662 		    M_WAITOK);
663 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
664 		    vmregset->count);
665 		if (error == 0)
666 			error = vm_get_register_set(vcpu,
667 			    vmregset->count, regnums, regvals);
668 		if (error == 0)
669 			error = copyout(regvals, vmregset->regvals,
670 			    sizeof(regvals[0]) * vmregset->count);
671 		free(regvals, M_VMMDEV);
672 		free(regnums, M_VMMDEV);
673 		break;
674 	}
675 	case VM_SET_REGISTER_SET: {
676 		struct vm_register_set *vmregset;
677 		uint64_t *regvals;
678 		int *regnums;
679 
680 		vmregset = (struct vm_register_set *)data;
681 		if (vmregset->count > VM_REG_LAST) {
682 			error = EINVAL;
683 			break;
684 		}
685 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
686 		    M_WAITOK);
687 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
688 		    M_WAITOK);
689 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
690 		    vmregset->count);
691 		if (error == 0)
692 			error = copyin(vmregset->regvals, regvals,
693 			    sizeof(regvals[0]) * vmregset->count);
694 		if (error == 0)
695 			error = vm_set_register_set(vcpu,
696 			    vmregset->count, regnums, regvals);
697 		free(regvals, M_VMMDEV);
698 		free(regnums, M_VMMDEV);
699 		break;
700 	}
701 	case VM_GET_CAPABILITY: {
702 		struct vm_capability *vmcap;
703 
704 		vmcap = (struct vm_capability *)data;
705 		error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval);
706 		break;
707 	}
708 	case VM_SET_CAPABILITY: {
709 		struct vm_capability *vmcap;
710 
711 		vmcap = (struct vm_capability *)data;
712 		error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval);
713 		break;
714 	}
715 	case VM_ACTIVATE_CPU:
716 		error = vm_activate_cpu(vcpu);
717 		break;
718 	case VM_GET_CPUS: {
719 		struct vm_cpuset *vm_cpuset;
720 		cpuset_t *cpuset;
721 		int size;
722 
723 		error = 0;
724 		vm_cpuset = (struct vm_cpuset *)data;
725 		size = vm_cpuset->cpusetsize;
726 		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
727 			error = ERANGE;
728 			break;
729 		}
730 		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
731 		    M_WAITOK | M_ZERO);
732 		if (vm_cpuset->which == VM_ACTIVE_CPUS)
733 			*cpuset = vm_active_cpus(sc->vm);
734 		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
735 			*cpuset = vm_suspended_cpus(sc->vm);
736 		else if (vm_cpuset->which == VM_DEBUG_CPUS)
737 			*cpuset = vm_debug_cpus(sc->vm);
738 		else
739 			error = EINVAL;
740 		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
741 			error = ERANGE;
742 		if (error == 0)
743 			error = copyout(cpuset, vm_cpuset->cpus, size);
744 		free(cpuset, M_TEMP);
745 		break;
746 	}
747 	case VM_SUSPEND_CPU:
748 		error = vm_suspend_cpu(sc->vm, vcpu);
749 		break;
750 	case VM_RESUME_CPU:
751 		error = vm_resume_cpu(sc->vm, vcpu);
752 		break;
753 	case VM_SET_TOPOLOGY: {
754 		struct vm_cpu_topology *topology;
755 
756 		topology = (struct vm_cpu_topology *)data;
757 		error = vm_set_topology(sc->vm, topology->sockets,
758 		    topology->cores, topology->threads, topology->maxcpus);
759 		break;
760 	}
761 	case VM_GET_TOPOLOGY: {
762 		struct vm_cpu_topology *topology;
763 
764 		topology = (struct vm_cpu_topology *)data;
765 		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
766 		    &topology->threads, &topology->maxcpus);
767 		error = 0;
768 		break;
769 	}
770 	default:
771 		error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag,
772 		    td);
773 		break;
774 	}
775 
776 	if ((ioctl->flags &
777 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
778 		vm_unlock_memsegs(sc->vm);
779 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0)
780 		vcpu_unlock_all(sc);
781 	else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0)
782 		vcpu_unlock_one(vcpu);
783 
784 	/*
785 	 * Make sure that no handler returns a kernel-internal
786 	 * error value to userspace.
787 	 */
788 	KASSERT(error == ERESTART || error >= 0,
789 	    ("vmmdev_ioctl: invalid error return %d", error));
790 	return (error);
791 
792 lockfail:
793 	if ((ioctl->flags &
794 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
795 		vm_unlock_memsegs(sc->vm);
796 	return (error);
797 }
798 
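/*
 * mmap(2) handler for /dev/vmm/<name>: the file offset is a guest physical
 * address, which is translated into the VM object backing the system
 * memory segment mapped at that address.  Device memory segments cannot be
 * mapped here; see devmem_mmap_single() below.
 */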
799 static int
800 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
801     struct vm_object **objp, int nprot)
802 {
803 	struct vmmdev_softc *sc;
804 	vm_paddr_t gpa;
805 	size_t len;
806 	vm_ooffset_t segoff, first, last;
807 	int error, found, segid;
808 	bool sysmem;
809 
810 	first = *offset;
811 	last = first + mapsize;
812 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
813 		return (EINVAL);
814 
815 	sc = vmmdev_lookup2(cdev);
816 	if (sc == NULL) {
817 		/* virtual machine is in the process of being created */
818 		return (EINVAL);
819 	}
820 
821 	/*
822 	 * Get a read lock on the guest memory map.
823 	 */
824 	vm_slock_memsegs(sc->vm);
825 
826 	gpa = 0;
827 	found = 0;
828 	while (!found) {
829 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
830 		    NULL, NULL);
831 		if (error)
832 			break;
833 
834 		if (first >= gpa && last <= gpa + len)
835 			found = 1;
836 		else
837 			gpa += len;
838 	}
839 
840 	if (found) {
841 		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
842 		KASSERT(error == 0 && *objp != NULL,
843 		    ("%s: invalid memory segment %d", __func__, segid));
844 		if (sysmem) {
845 			vm_object_reference(*objp);
846 			*offset = segoff + (first - gpa);
847 		} else {
848 			error = EINVAL;
849 		}
850 	}
851 	vm_unlock_memsegs(sc->vm);
852 	return (error);
853 }
854 
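/*
 * Tear down a VM's softc: destroy its devmem cdevs, freeze the vCPUs with
 * further vCPU creation disabled, free the devmem bookkeeping, destroy the
 * VM itself, and finally remove the softc from the global list.
 */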
855 static void
856 vmmdev_destroy(struct vmmdev_softc *sc)
857 {
858 	struct devmem_softc *dsc;
859 	int error __diagused;
860 
861 	KASSERT(sc->cdev == NULL, ("%s: cdev not free", __func__));
862 
863 	/*
864 	 * Destroy all cdevs:
865 	 *
866 	 * - any new operations on the 'cdev' will return an error (ENXIO).
867 	 *
868 	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
869 	 */
870 	SLIST_FOREACH(dsc, &sc->devmem, link) {
871 		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
872 		devmem_destroy(dsc);
873 	}
874 
875 	vm_disable_vcpu_creation(sc->vm);
876 	error = vcpu_lock_all(sc);
877 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
878 	vm_unlock_vcpus(sc->vm);
879 
880 	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
881 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
882 		SLIST_REMOVE_HEAD(&sc->devmem, link);
883 		free(dsc->name, M_VMMDEV);
884 		free(dsc, M_VMMDEV);
885 	}
886 
887 	if (sc->vm != NULL)
888 		vm_destroy(sc->vm);
889 
890 	if (sc->ucred != NULL)
891 		crfree(sc->ucred);
892 
893 	sx_xlock(&vmmdev_mtx);
894 	SLIST_REMOVE(&head, sc, vmmdev_softc, link);
895 	sx_xunlock(&vmmdev_mtx);
896 	free(sc, M_VMMDEV);
897 }
898 
899 static int
900 vmmdev_lookup_and_destroy(const char *name, struct ucred *cred)
901 {
902 	struct cdev *cdev;
903 	struct vmmdev_softc *sc;
904 
905 	sx_xlock(&vmmdev_mtx);
906 	sc = vmmdev_lookup(name, cred);
907 	if (sc == NULL || sc->cdev == NULL) {
908 		sx_xunlock(&vmmdev_mtx);
909 		return (EINVAL);
910 	}
911 
912 	/*
913 	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
914 	 * is scheduled for destruction.
915 	 */
916 	cdev = sc->cdev;
917 	sc->cdev = NULL;
918 	sx_xunlock(&vmmdev_mtx);
919 
920 	vm_suspend(sc->vm, VM_SUSPEND_DESTROY);
921 	destroy_dev(cdev);
922 	vmmdev_destroy(sc);
923 
924 	return (0);
925 }
926 
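/*
 * Handler for the hw.vmm.destroy sysctl: destroy the named VM, e.g.
 *
 *	sysctl hw.vmm.destroy=myvm
 */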
927 static int
928 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
929 {
930 	char *buf;
931 	int error, buflen;
932 
933 	error = vmm_priv_check(req->td->td_ucred);
934 	if (error)
935 		return (error);
936 
937 	buflen = VM_MAX_NAMELEN + 1;
938 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
939 	error = sysctl_handle_string(oidp, buf, buflen, req);
940 	if (error == 0 && req->newptr != NULL)
941 		error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred);
942 	free(buf, M_VMMDEV);
943 	return (error);
944 }
945 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
946     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
947     NULL, 0, sysctl_vmm_destroy, "A",
948     "Destroy a vmm(4) instance (legacy interface)");
949 
950 static struct cdevsw vmmdevsw = {
951 	.d_name		= "vmmdev",
952 	.d_version	= D_VERSION,
953 	.d_open		= vmmdev_open,
954 	.d_ioctl	= vmmdev_ioctl,
955 	.d_mmap_single	= vmmdev_mmap_single,
956 	.d_read		= vmmdev_rw,
957 	.d_write	= vmmdev_rw,
958 };
959 
960 static struct vmmdev_softc *
961 vmmdev_alloc(struct vm *vm, struct ucred *cred)
962 {
963 	struct vmmdev_softc *sc;
964 
965 	sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO);
966 	SLIST_INIT(&sc->devmem);
967 	sc->vm = vm;
968 	sc->ucred = crhold(cred);
969 	return (sc);
970 }
971 
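/*
 * Create a new VM instance and its /dev/vmm/<name> device node, all under
 * the global vmmdev lock so that a concurrent create of the same name
 * fails with EEXIST.
 */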
972 static int
973 vmmdev_create(const char *name, struct ucred *cred)
974 {
975 	struct make_dev_args mda;
976 	struct cdev *cdev;
977 	struct vmmdev_softc *sc;
978 	struct vm *vm;
979 	int error;
980 
981 	sx_xlock(&vmmdev_mtx);
982 	sc = vmmdev_lookup(name, cred);
983 	if (sc != NULL) {
984 		sx_xunlock(&vmmdev_mtx);
985 		return (EEXIST);
986 	}
987 
988 	error = vm_create(name, &vm);
989 	if (error != 0) {
990 		sx_xunlock(&vmmdev_mtx);
991 		return (error);
992 	}
993 	sc = vmmdev_alloc(vm, cred);
994 	SLIST_INSERT_HEAD(&head, sc, link);
995 
996 	make_dev_args_init(&mda);
997 	mda.mda_devsw = &vmmdevsw;
998 	mda.mda_cr = sc->ucred;
999 	mda.mda_uid = UID_ROOT;
1000 	mda.mda_gid = GID_WHEEL;
1001 	mda.mda_mode = 0600;
1002 	mda.mda_si_drv1 = sc;
1003 	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1004 	error = make_dev_s(&mda, &cdev, "vmm/%s", name);
1005 	if (error != 0) {
1006 		sx_xunlock(&vmmdev_mtx);
1007 		vmmdev_destroy(sc);
1008 		return (error);
1009 	}
1010 	sc->cdev = cdev;
1011 	sx_xunlock(&vmmdev_mtx);
1012 	return (0);
1013 }
1014 
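/*
 * Handler for the hw.vmm.create sysctl: create a VM with the given name,
 * e.g. sysctl hw.vmm.create=myvm.
 */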
1015 static int
1016 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
1017 {
1018 	char *buf;
1019 	int error, buflen;
1020 
1021 	error = vmm_priv_check(req->td->td_ucred);
1022 	if (error != 0)
1023 		return (error);
1024 
1025 	buflen = VM_MAX_NAMELEN + 1;
1026 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
1027 	error = sysctl_handle_string(oidp, buf, buflen, req);
1028 	if (error == 0 && req->newptr != NULL)
1029 		error = vmmdev_create(buf, req->td->td_ucred);
1030 	free(buf, M_VMMDEV);
1031 	return (error);
1032 }
1033 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
1034     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
1035     NULL, 0, sysctl_vmm_create, "A",
1036     "Create a vmm(4) instance (legacy interface)");
1037 
1038 static int
1039 vmmctl_open(struct cdev *cdev, int flags, int fmt, struct thread *td)
1040 {
1041 	int error;
1042 
1043 	error = vmm_priv_check(td->td_ucred);
1044 	if (error != 0)
1045 		return (error);
1046 
1047 	if ((flags & FWRITE) == 0)
1048 		return (EPERM);
1049 
1050 	return (0);
1051 }
1052 
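/*
 * ioctl handler for /dev/vmmctl, the control node created by
 * vmmdev_init().  VMMCTL_VM_CREATE and VMMCTL_VM_DESTROY operate on a VM
 * by name and require that all reserved fields be zero.
 */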
1053 static int
1054 vmmctl_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
1055     struct thread *td)
1056 {
1057 	int error;
1058 
1059 	switch (cmd) {
1060 	case VMMCTL_VM_CREATE: {
1061 		struct vmmctl_vm_create *vmc;
1062 
1063 		vmc = (struct vmmctl_vm_create *)data;
1064 		vmc->name[VM_MAX_NAMELEN] = '\0';
1065 		for (size_t i = 0; i < nitems(vmc->reserved); i++) {
1066 			if (vmc->reserved[i] != 0) {
1067 				error = EINVAL;
1068 				return (error);
1069 			}
1070 		}
1071 
1072 		error = vmmdev_create(vmc->name, td->td_ucred);
1073 		break;
1074 	}
1075 	case VMMCTL_VM_DESTROY: {
1076 		struct vmmctl_vm_destroy *vmd;
1077 
1078 		vmd = (struct vmmctl_vm_destroy *)data;
1079 		vmd->name[VM_MAX_NAMELEN] = '\0';
1080 		for (size_t i = 0; i < nitems(vmd->reserved); i++) {
1081 			if (vmd->reserved[i] != 0) {
1082 				error = EINVAL;
1083 				return (error);
1084 			}
1085 		}
1086 
1087 		error = vmmdev_lookup_and_destroy(vmd->name, td->td_ucred);
1088 		break;
1089 	}
1090 	default:
1091 		error = ENOTTY;
1092 		break;
1093 	}
1094 
1095 	return (error);
1096 }
1097 
1098 static struct cdev *vmmctl_cdev;
1099 static struct cdevsw vmmctlsw = {
1100 	.d_name		= "vmmctl",
1101 	.d_version	= D_VERSION,
1102 	.d_open		= vmmctl_open,
1103 	.d_ioctl	= vmmctl_ioctl,
1104 };
1105 
1106 int
1107 vmmdev_init(void)
1108 {
1109 	int error;
1110 
1111 	sx_xlock(&vmmdev_mtx);
1112 	error = make_dev_p(MAKEDEV_CHECKNAME, &vmmctl_cdev, &vmmctlsw, NULL,
1113 	    UID_ROOT, GID_WHEEL, 0600, "vmmctl");
1114 	if (error == 0)
1115 		pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
1116 		    "Allow use of vmm in a jail.");
1117 	sx_xunlock(&vmmdev_mtx);
1118 
1119 	return (error);
1120 }
1121 
1122 int
1123 vmmdev_cleanup(void)
1124 {
1125 	sx_xlock(&vmmdev_mtx);
1126 	if (!SLIST_EMPTY(&head)) {
1127 		sx_xunlock(&vmmdev_mtx);
1128 		return (EBUSY);
1129 	}
1130 	if (vmmctl_cdev != NULL) {
1131 		destroy_dev(vmmctl_cdev);
1132 		vmmctl_cdev = NULL;
1133 	}
1134 	sx_xunlock(&vmmdev_mtx);
1135 
1136 	return (0);
1137 }
1138 
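/*
 * mmap(2) handler for a devmem node: the requested range must lie entirely
 * within the backing device memory segment, whose VM object is returned
 * with an extra reference.
 */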
1139 static int
1140 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
1141     struct vm_object **objp, int nprot)
1142 {
1143 	struct devmem_softc *dsc;
1144 	vm_ooffset_t first, last;
1145 	size_t seglen;
1146 	int error;
1147 	bool sysmem;
1148 
1149 	dsc = cdev->si_drv1;
1150 	if (dsc == NULL) {
1151 		/* 'cdev' has been created but is not ready for use */
1152 		return (ENXIO);
1153 	}
1154 
1155 	first = *offset;
1156 	last = *offset + len;
1157 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
1158 		return (EINVAL);
1159 
1160 	vm_slock_memsegs(dsc->sc->vm);
1161 
1162 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
1163 	KASSERT(error == 0 && !sysmem && *objp != NULL,
1164 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
1165 
1166 	if (seglen >= last)
1167 		vm_object_reference(*objp);
1168 	else
1169 		error = EINVAL;
1170 
1171 	vm_unlock_memsegs(dsc->sc->vm);
1172 	return (error);
1173 }
1174 
1175 static struct cdevsw devmemsw = {
1176 	.d_name		= "devmem",
1177 	.d_version	= D_VERSION,
1178 	.d_mmap_single	= devmem_mmap_single,
1179 };
1180 
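/*
 * Create a /dev/vmm.io/<vm>.<segname> node for a device memory segment.
 * The devmem softc takes ownership of 'devname'; on failure the name is
 * freed here, otherwise it is freed when the VM is destroyed.
 */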
1181 static int
1182 devmem_create_cdev(struct vmmdev_softc *sc, int segid, char *devname)
1183 {
1184 	struct make_dev_args mda;
1185 	struct devmem_softc *dsc;
1186 	int error;
1187 
1188 	sx_xlock(&vmmdev_mtx);
1189 
1190 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1191 	dsc->segid = segid;
1192 	dsc->name = devname;
1193 	dsc->sc = sc;
1194 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
1195 
1196 	make_dev_args_init(&mda);
1197 	mda.mda_devsw = &devmemsw;
1198 	mda.mda_cr = sc->ucred;
1199 	mda.mda_uid = UID_ROOT;
1200 	mda.mda_gid = GID_WHEEL;
1201 	mda.mda_mode = 0600;
1202 	mda.mda_si_drv1 = dsc;
1203 	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1204 	error = make_dev_s(&mda, &dsc->cdev, "vmm.io/%s.%s", vm_name(sc->vm),
1205 	    devname);
1206 	if (error != 0) {
1207 		SLIST_REMOVE(&sc->devmem, dsc, devmem_softc, link);
1208 		free(dsc->name, M_VMMDEV);
1209 		free(dsc, M_VMMDEV);
1210 	}
1211 
1212 	sx_xunlock(&vmmdev_mtx);
1213 
1214 	return (error);
1215 }
1216 
1217 static void
1218 devmem_destroy(void *arg)
1219 {
1220 	struct devmem_softc *dsc = arg;
1221 
1222 	destroy_dev(dsc->cdev);
1223 	dsc->cdev = NULL;
1224 	dsc->sc = NULL;
1225 }
1226