xref: /freebsd/sys/dev/vmm/vmm_dev.c (revision bd16bac27e7e0d31bccf88feca95cd98f0ef0fd4)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
6  * All rights reserved.
7  */
8 
9 #include <sys/param.h>
10 #include <sys/conf.h>
11 #include <sys/fcntl.h>
12 #include <sys/ioccom.h>
13 #include <sys/jail.h>
14 #include <sys/kernel.h>
15 #include <sys/malloc.h>
16 #include <sys/mman.h>
17 #include <sys/module.h>
18 #include <sys/priv.h>
19 #include <sys/proc.h>
20 #include <sys/queue.h>
21 #include <sys/resourcevar.h>
22 #include <sys/smp.h>
23 #include <sys/sx.h>
24 #include <sys/sysctl.h>
25 #include <sys/ucred.h>
26 #include <sys/uio.h>
27 
28 #include <machine/vmm.h>
29 
30 #include <vm/vm.h>
31 #include <vm/vm_object.h>
32 
33 #include <dev/vmm/vmm_dev.h>
34 #include <dev/vmm/vmm_mem.h>
35 #include <dev/vmm/vmm_stat.h>
36 
37 #ifdef __amd64__
38 #ifdef COMPAT_FREEBSD12
39 struct vm_memseg_12 {
40 	int		segid;
41 	size_t		len;
42 	char		name[64];
43 };
44 _Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI");
45 
46 #define	VM_ALLOC_MEMSEG_12	\
47 	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12)
48 #define	VM_GET_MEMSEG_12	\
49 	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12)
50 #endif /* COMPAT_FREEBSD12 */
51 #ifdef COMPAT_FREEBSD14
52 struct vm_memseg_14 {
53 	int		segid;
54 	size_t		len;
55 	char		name[VM_MAX_SUFFIXLEN + 1];
56 };
57 _Static_assert(sizeof(struct vm_memseg_14) == (VM_MAX_SUFFIXLEN + 1 + 16),
58     "COMPAT_FREEBSD14 ABI");
59 
60 #define	VM_ALLOC_MEMSEG_14	\
61 	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_14)
62 #define	VM_GET_MEMSEG_14	\
63 	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_14)
64 #endif /* COMPAT_FREEBSD14 */
65 #endif /* __amd64__ */
66 
67 struct devmem_softc {
68 	int	segid;
69 	char	*name;
70 	struct cdev *cdev;
71 	struct vmmdev_softc *sc;
72 	SLIST_ENTRY(devmem_softc) link;
73 };
74 
75 struct vmmdev_softc {
76 	struct vm	*vm;		/* vm instance cookie */
77 	struct cdev	*cdev;
78 	struct ucred	*ucred;
79 	SLIST_ENTRY(vmmdev_softc) link;
80 	LIST_ENTRY(vmmdev_softc) priv_link;
81 	SLIST_HEAD(, devmem_softc) devmem;
82 	int		flags;
83 };
84 
85 struct vmmctl_priv {
86 	LIST_HEAD(, vmmdev_softc) softcs;
87 };
88 
89 static bool vmm_initialized = false;
90 
91 static SLIST_HEAD(, vmmdev_softc) head;
92 
93 static unsigned pr_allow_flag;
94 static struct sx vmmdev_mtx;
95 SX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex");
96 
97 static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
98 
99 SYSCTL_DECL(_hw_vmm);
100 
101 u_int vm_maxcpu;
102 SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
103     &vm_maxcpu, 0, "Maximum number of vCPUs");
104 
105 u_int vm_maxvmms;
106 SYSCTL_UINT(_hw_vmm, OID_AUTO, maxvmms, CTLFLAG_RWTUN,
107     &vm_maxvmms, 0, "Maximum number of VMM instances per user");
108 
109 static void devmem_destroy(void *arg);
110 static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem);
111 static void vmmdev_destroy(struct vmmdev_softc *sc);
112 
113 static int
114 vmm_priv_check(struct ucred *ucred)
115 {
116 	if (jailed(ucred) &&
117 	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
118 		return (EPERM);
119 
120 	return (0);
121 }
122 
123 static int
124 vcpu_lock_one(struct vcpu *vcpu)
125 {
126 	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
127 }
128 
129 static void
130 vcpu_unlock_one(struct vcpu *vcpu)
131 {
132 	enum vcpu_state state;
133 
134 	state = vcpu_get_state(vcpu, NULL);
135 	if (state != VCPU_FROZEN) {
136 		panic("vcpu %s(%d) has invalid state %d",
137 		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
138 	}
139 
140 	vcpu_set_state(vcpu, VCPU_IDLE, false);
141 }
142 
143 #ifndef __amd64__
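/*
 * Lock every vCPU in the VM by transitioning it to VCPU_FROZEN.  If any
 * vCPU cannot be locked, unlock the ones already frozen and return the
 * error.
 */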
144 static int
145 vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate)
146 {
147 	struct vcpu *vcpu;
148 	int error;
149 	uint16_t i, j, maxcpus;
150 
151 	error = 0;
152 	maxcpus = vm_get_maxcpus(vm);
153 	for (i = 0; i < maxcpus; i++) {
154 		vcpu = vm_vcpu(vm, i);
155 		if (vcpu == NULL)
156 			continue;
157 		error = vcpu_lock_one(vcpu);
158 		if (error)
159 			break;
160 	}
161 
162 	if (error) {
163 		for (j = 0; j < i; j++) {
164 			vcpu = vm_vcpu(vm, j);
165 			if (vcpu == NULL)
166 				continue;
167 			vcpu_unlock_one(vcpu);
168 		}
169 	}
170 
171 	return (error);
172 }
173 #endif
174 
175 static int
176 vcpu_lock_all(struct vmmdev_softc *sc)
177 {
178 	int error;
179 
180 	/*
181 	 * Serialize vcpu_lock_all() callers.  Individual vCPUs are not locked
182 	 * in a consistent order so we need to serialize to avoid deadlocks.
183 	 */
184 	vm_lock_vcpus(sc->vm);
185 	error = vcpu_set_state_all(sc->vm, VCPU_FROZEN);
186 	if (error != 0)
187 		vm_unlock_vcpus(sc->vm);
188 	return (error);
189 }
190 
191 static void
192 vcpu_unlock_all(struct vmmdev_softc *sc)
193 {
194 	struct vcpu *vcpu;
195 	uint16_t i, maxcpus;
196 
197 	maxcpus = vm_get_maxcpus(sc->vm);
198 	for (i = 0; i < maxcpus; i++) {
199 		vcpu = vm_vcpu(sc->vm, i);
200 		if (vcpu == NULL)
201 			continue;
202 		vcpu_unlock_one(vcpu);
203 	}
204 	vm_unlock_vcpus(sc->vm);
205 }
206 
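/*
 * Look up a VM by name.  The caller must hold vmmdev_mtx exclusively.  A VM
 * is visible only if 'cred' can see the owner's credentials (cr_cansee()).
 */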
207 static struct vmmdev_softc *
208 vmmdev_lookup(const char *name, struct ucred *cred)
209 {
210 	struct vmmdev_softc *sc;
211 
212 	sx_assert(&vmmdev_mtx, SA_XLOCKED);
213 
214 	SLIST_FOREACH(sc, &head, link) {
215 		if (strcmp(name, vm_name(sc->vm)) == 0)
216 			break;
217 	}
218 
219 	if (sc == NULL)
220 		return (NULL);
221 
222 	if (cr_cansee(cred, sc->ucred))
223 		return (NULL);
224 
225 	return (sc);
226 }
227 
228 static struct vmmdev_softc *
229 vmmdev_lookup2(struct cdev *cdev)
230 {
231 	return (cdev->si_drv1);
232 }
233 
234 static int
235 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
236 {
237 	int error, off, c, prot;
238 	vm_paddr_t gpa, maxaddr;
239 	void *hpa, *cookie;
240 	struct vmmdev_softc *sc;
241 
242 	sc = vmmdev_lookup2(cdev);
243 	if (sc == NULL)
244 		return (ENXIO);
245 
246 	/*
247 	 * Get a read lock on the guest memory map.
248 	 */
249 	vm_slock_memsegs(sc->vm);
250 
251 	error = 0;
252 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
253 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
254 	while (uio->uio_resid > 0 && error == 0) {
255 		gpa = uio->uio_offset;
256 		off = gpa & PAGE_MASK;
257 		c = min(uio->uio_resid, PAGE_SIZE - off);
258 
259 		/*
260 		 * The VM has a hole in its physical memory map. If we want to
261 		 * use 'dd' to inspect memory beyond the hole we need to
262 		 * provide bogus data for memory that lies in the hole.
263 		 *
264 		 * Since this device does not support lseek(2), dd(1) will
265 		 * read(2) blocks of data to simulate the lseek(2).
266 		 */
267 		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
268 		if (hpa == NULL) {
269 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
270 				error = uiomove(__DECONST(void *, zero_region),
271 				    c, uio);
272 			else
273 				error = EFAULT;
274 		} else {
275 			error = uiomove(hpa, c, uio);
276 			vm_gpa_release(cookie);
277 		}
278 	}
279 	vm_unlock_memsegs(sc->vm);
280 	return (error);
281 }
282 
283 CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);
284 
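/*
 * Copy out the length of a memory segment and, for a device memory segment,
 * the name of its devmem cdev.  System memory segments have no name.
 */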
285 static int
286 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
287 {
288 	struct devmem_softc *dsc;
289 	int error;
290 	bool sysmem;
291 
292 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
293 	if (error || mseg->len == 0)
294 		return (error);
295 
296 	if (!sysmem) {
297 		SLIST_FOREACH(dsc, &sc->devmem, link) {
298 			if (dsc->segid == mseg->segid)
299 				break;
300 		}
301 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
302 		    __func__, mseg->segid));
303 		error = copystr(dsc->name, mseg->name, len, NULL);
304 	} else {
305 		bzero(mseg->name, len);
306 	}
307 
308 	return (error);
309 }
310 
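/*
 * Allocate a memory segment.  A named segment is device memory and gets a
 * /dev/vmm.io/<vmname>.<segname> cdev; an unnamed segment is system memory.
 */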
311 static int
312 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len,
313     struct domainset *domainset)
314 {
315 	char *name;
316 	int error;
317 	bool sysmem;
318 
319 	error = 0;
320 	name = NULL;
321 	sysmem = true;
322 
323 	/*
324 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
325 	 * be stripped off when devfs processes the full string.
326 	 */
327 	if (VM_MEMSEG_NAME(mseg)) {
328 		sysmem = false;
329 		name = malloc(len, M_VMMDEV, M_WAITOK);
330 		error = copystr(mseg->name, name, len, NULL);
331 		if (error)
332 			goto done;
333 	}
334 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem, domainset);
335 	if (error)
336 		goto done;
337 
338 	if (VM_MEMSEG_NAME(mseg)) {
339 		error = devmem_create_cdev(sc, mseg->segid, name);
340 		if (error)
341 			vm_free_memseg(sc->vm, mseg->segid);
342 		else
343 			name = NULL;	/* freed when 'cdev' is destroyed */
344 	}
345 done:
346 	free(name, M_VMMDEV);
347 	return (error);
348 }
349 
350 #if defined(__amd64__) && \
351     (defined(COMPAT_FREEBSD14) || defined(COMPAT_FREEBSD12))
352 /*
353  * Translate pre-15.0 memory segment identifiers into their 15.0 counterparts.
354  */
355 static void
356 adjust_segid(struct vm_memseg *mseg)
357 {
358 	if (mseg->segid != VM_SYSMEM) {
359 		mseg->segid += (VM_BOOTROM - 1);
360 	}
361 }
362 #endif
363 
364 static int
365 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
366     uint64_t *regval)
367 {
368 	int error, i;
369 
370 	error = 0;
371 	for (i = 0; i < count; i++) {
372 		error = vm_get_register(vcpu, regnum[i], &regval[i]);
373 		if (error)
374 			break;
375 	}
376 	return (error);
377 }
378 
379 static int
380 vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
381     uint64_t *regval)
382 {
383 	int error, i;
384 
385 	error = 0;
386 	for (i = 0; i < count; i++) {
387 		error = vm_set_register(vcpu, regnum[i], regval[i]);
388 		if (error)
389 			break;
390 	}
391 	return (error);
392 }
393 
394 static int
395 vmmdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
396 {
397 	int error;
398 
399 	/*
400 	 * A jail without vmm access shouldn't be able to access vmm device
401 	 * files at all, but check here just to be thorough.
402 	 */
403 	error = vmm_priv_check(td->td_ucred);
404 	if (error != 0)
405 		return (error);
406 
407 	return (0);
408 }
409 
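/*
 * Table of generic ioctls and the locking each one requires.  The flags are
 * consumed by vmmdev_ioctl() before the command is dispatched; machine-
 * dependent ioctls live in the separate vmmdev_machdep_ioctls table.
 */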
410 static const struct vmmdev_ioctl vmmdev_ioctls[] = {
411 	VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
412 	VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
413 	VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
414 	VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
415 	VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
416 	VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
417 	VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU),
418 	VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU),
419 	VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
420 	VMMDEV_IOCTL(VM_STAT_DESC, 0),
421 
422 #ifdef __amd64__
423 #ifdef COMPAT_FREEBSD12
424 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12,
425 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
426 #endif
427 #ifdef COMPAT_FREEBSD14
428 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_14,
429 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
430 #endif
431 #endif /* __amd64__ */
432 	VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
433 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
434 	VMMDEV_IOCTL(VM_MMAP_MEMSEG,
435 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
436 	VMMDEV_IOCTL(VM_MUNMAP_MEMSEG,
437 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
438 	VMMDEV_IOCTL(VM_REINIT,
439 	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
440 
441 #ifdef __amd64__
442 #if defined(COMPAT_FREEBSD12)
443 	VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
444 #endif
445 #ifdef COMPAT_FREEBSD14
446 	VMMDEV_IOCTL(VM_GET_MEMSEG_14, VMMDEV_IOCTL_SLOCK_MEMSEGS),
447 #endif
448 #endif /* __amd64__ */
449 	VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
450 	VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),
451 
452 	VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
453 	VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
454 
455 	VMMDEV_IOCTL(VM_SUSPEND, 0),
456 	VMMDEV_IOCTL(VM_GET_CPUS, 0),
457 	VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0),
458 	VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0),
459 };
460 
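/*
 * Main ioctl handler for /dev/vmm/<name>: find the ioctl descriptor, acquire
 * the memory segment and vCPU locks it requires, dispatch the command, and
 * drop the locks again on the way out.
 */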
461 static int
462 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
463     struct thread *td)
464 {
465 	struct vmmdev_softc *sc;
466 	struct vcpu *vcpu;
467 	const struct vmmdev_ioctl *ioctl;
468 	struct vm_memseg *mseg;
469 	int error, vcpuid;
470 
471 	sc = vmmdev_lookup2(cdev);
472 	if (sc == NULL)
473 		return (ENXIO);
474 
475 	ioctl = NULL;
476 	for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) {
477 		if (vmmdev_ioctls[i].cmd == cmd) {
478 			ioctl = &vmmdev_ioctls[i];
479 			break;
480 		}
481 	}
482 	if (ioctl == NULL) {
483 		for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) {
484 			if (vmmdev_machdep_ioctls[i].cmd == cmd) {
485 				ioctl = &vmmdev_machdep_ioctls[i];
486 				break;
487 			}
488 		}
489 	}
490 	if (ioctl == NULL)
491 		return (ENOTTY);
492 
493 	if ((ioctl->flags & VMMDEV_IOCTL_PRIV_CHECK_DRIVER) != 0) {
494 		error = priv_check(td, PRIV_DRIVER);
495 		if (error != 0)
496 			return (error);
497 	}
498 
499 	if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0)
500 		vm_xlock_memsegs(sc->vm);
501 	else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0)
502 		vm_slock_memsegs(sc->vm);
503 
504 	vcpu = NULL;
505 	vcpuid = -1;
506 	if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU |
507 	    VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) {
508 		vcpuid = *(int *)data;
509 		if (vcpuid == -1) {
510 			if ((ioctl->flags &
511 			    VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) {
512 				error = EINVAL;
513 				goto lockfail;
514 			}
515 		} else {
516 			vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
517 			if (vcpu == NULL) {
518 				error = EINVAL;
519 				goto lockfail;
520 			}
521 			if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) {
522 				error = vcpu_lock_one(vcpu);
523 				if (error)
524 					goto lockfail;
525 			}
526 		}
527 	}
528 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) {
529 		error = vcpu_lock_all(sc);
530 		if (error)
531 			goto lockfail;
532 	}
533 
534 	switch (cmd) {
535 	case VM_SUSPEND: {
536 		struct vm_suspend *vmsuspend;
537 
538 		vmsuspend = (struct vm_suspend *)data;
539 		error = vm_suspend(sc->vm, vmsuspend->how);
540 		break;
541 	}
542 	case VM_REINIT:
543 		error = vm_reinit(sc->vm);
544 		break;
545 	case VM_STAT_DESC: {
546 		struct vm_stat_desc *statdesc;
547 
548 		statdesc = (struct vm_stat_desc *)data;
549 		error = vmm_stat_desc_copy(statdesc->index, statdesc->desc,
550 		    sizeof(statdesc->desc));
551 		break;
552 	}
553 	case VM_STATS: {
554 		struct vm_stats *vmstats;
555 
556 		vmstats = (struct vm_stats *)data;
557 		getmicrotime(&vmstats->tv);
558 		error = vmm_stat_copy(vcpu, vmstats->index,
559 		    nitems(vmstats->statbuf), &vmstats->num_entries,
560 		    vmstats->statbuf);
561 		break;
562 	}
563 	case VM_MMAP_GETNEXT: {
564 		struct vm_memmap *mm;
565 
566 		mm = (struct vm_memmap *)data;
567 		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
568 		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
569 		break;
570 	}
571 	case VM_MMAP_MEMSEG: {
572 		struct vm_memmap *mm;
573 
574 		mm = (struct vm_memmap *)data;
575 		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
576 		    mm->len, mm->prot, mm->flags);
577 		break;
578 	}
579 	case VM_MUNMAP_MEMSEG: {
580 		struct vm_munmap *mu;
581 
582 		mu = (struct vm_munmap *)data;
583 		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
584 		break;
585 	}
586 #ifdef __amd64__
587 #ifdef COMPAT_FREEBSD12
588 	case VM_ALLOC_MEMSEG_12:
589 		mseg = (struct vm_memseg *)data;
590 
591 		adjust_segid(mseg);
592 		error = alloc_memseg(sc, mseg,
593 		    sizeof(((struct vm_memseg_12 *)0)->name), NULL);
594 		break;
595 	case VM_GET_MEMSEG_12:
596 		mseg = (struct vm_memseg *)data;
597 
598 		adjust_segid(mseg);
599 		error = get_memseg(sc, mseg,
600 		    sizeof(((struct vm_memseg_12 *)0)->name));
601 		break;
602 #endif /* COMPAT_FREEBSD12 */
603 #ifdef COMPAT_FREEBSD14
604 	case VM_ALLOC_MEMSEG_14:
605 		mseg = (struct vm_memseg *)data;
606 
607 		adjust_segid(mseg);
608 		error = alloc_memseg(sc, mseg,
609 		    sizeof(((struct vm_memseg_14 *)0)->name), NULL);
610 		break;
611 	case VM_GET_MEMSEG_14:
612 		mseg = (struct vm_memseg *)data;
613 
614 		adjust_segid(mseg);
615 		error = get_memseg(sc, mseg,
616 		    sizeof(((struct vm_memseg_14 *)0)->name));
617 		break;
618 #endif /* COMPAT_FREEBSD14 */
619 #endif /* __amd64__ */
620 	case VM_ALLOC_MEMSEG: {
621 		domainset_t *mask;
622 		struct domainset *domainset, domain;
623 
624 		domainset = NULL;
625 		mseg = (struct vm_memseg *)data;
626 		if (mseg->ds_policy != DOMAINSET_POLICY_INVALID && mseg->ds_mask != NULL) {
627 			if (mseg->ds_mask_size < sizeof(domainset_t) ||
628 			    mseg->ds_mask_size > DOMAINSET_MAXSIZE / NBBY) {
629 				error = ERANGE;
630 				break;
631 			}
632 			memset(&domain, 0, sizeof(domain));
633 			mask = malloc(mseg->ds_mask_size, M_VMMDEV, M_WAITOK);
634 			error = copyin(mseg->ds_mask, mask, mseg->ds_mask_size);
635 			if (error) {
636 				free(mask, M_VMMDEV);
637 				break;
638 			}
639 			error = domainset_populate(&domain, mask, mseg->ds_policy,
640 			    mseg->ds_mask_size);
641 			free(mask, M_VMMDEV);
642 			if (error)
643 				break;
644 			domainset = domainset_create(&domain);
645 			if (domainset == NULL) {
646 				error = EINVAL;
647 				break;
648 			}
649 		}
650 		error = alloc_memseg(sc, mseg, sizeof(mseg->name), domainset);
651 		break;
652 	}
653 	case VM_GET_MEMSEG:
654 		error = get_memseg(sc, (struct vm_memseg *)data,
655 		    sizeof(((struct vm_memseg *)0)->name));
656 		break;
657 	case VM_GET_REGISTER: {
658 		struct vm_register *vmreg;
659 
660 		vmreg = (struct vm_register *)data;
661 		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
662 		break;
663 	}
664 	case VM_SET_REGISTER: {
665 		struct vm_register *vmreg;
666 
667 		vmreg = (struct vm_register *)data;
668 		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
669 		break;
670 	}
671 	case VM_GET_REGISTER_SET: {
672 		struct vm_register_set *vmregset;
673 		uint64_t *regvals;
674 		int *regnums;
675 
676 		vmregset = (struct vm_register_set *)data;
677 		if (vmregset->count > VM_REG_LAST) {
678 			error = EINVAL;
679 			break;
680 		}
681 		regvals = mallocarray(vmregset->count, sizeof(regvals[0]),
682 		    M_VMMDEV, M_WAITOK);
683 		regnums = mallocarray(vmregset->count, sizeof(regnums[0]),
684 		    M_VMMDEV, M_WAITOK);
685 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
686 		    vmregset->count);
687 		if (error == 0)
688 			error = vm_get_register_set(vcpu,
689 			    vmregset->count, regnums, regvals);
690 		if (error == 0)
691 			error = copyout(regvals, vmregset->regvals,
692 			    sizeof(regvals[0]) * vmregset->count);
693 		free(regvals, M_VMMDEV);
694 		free(regnums, M_VMMDEV);
695 		break;
696 	}
697 	case VM_SET_REGISTER_SET: {
698 		struct vm_register_set *vmregset;
699 		uint64_t *regvals;
700 		int *regnums;
701 
702 		vmregset = (struct vm_register_set *)data;
703 		if (vmregset->count > VM_REG_LAST) {
704 			error = EINVAL;
705 			break;
706 		}
707 		regvals = mallocarray(vmregset->count, sizeof(regvals[0]),
708 		    M_VMMDEV, M_WAITOK);
709 		regnums = mallocarray(vmregset->count, sizeof(regnums[0]),
710 		    M_VMMDEV, M_WAITOK);
711 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
712 		    vmregset->count);
713 		if (error == 0)
714 			error = copyin(vmregset->regvals, regvals,
715 			    sizeof(regvals[0]) * vmregset->count);
716 		if (error == 0)
717 			error = vm_set_register_set(vcpu,
718 			    vmregset->count, regnums, regvals);
719 		free(regvals, M_VMMDEV);
720 		free(regnums, M_VMMDEV);
721 		break;
722 	}
723 	case VM_GET_CAPABILITY: {
724 		struct vm_capability *vmcap;
725 
726 		vmcap = (struct vm_capability *)data;
727 		error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval);
728 		break;
729 	}
730 	case VM_SET_CAPABILITY: {
731 		struct vm_capability *vmcap;
732 
733 		vmcap = (struct vm_capability *)data;
734 		error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval);
735 		break;
736 	}
737 	case VM_ACTIVATE_CPU:
738 		error = vm_activate_cpu(vcpu);
739 		break;
740 	case VM_GET_CPUS: {
741 		struct vm_cpuset *vm_cpuset;
742 		cpuset_t *cpuset;
743 		int size;
744 
745 		error = 0;
746 		vm_cpuset = (struct vm_cpuset *)data;
747 		size = vm_cpuset->cpusetsize;
748 		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
749 			error = ERANGE;
750 			break;
751 		}
752 		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
753 		    M_WAITOK | M_ZERO);
754 		if (vm_cpuset->which == VM_ACTIVE_CPUS)
755 			*cpuset = vm_active_cpus(sc->vm);
756 		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
757 			*cpuset = vm_suspended_cpus(sc->vm);
758 		else if (vm_cpuset->which == VM_DEBUG_CPUS)
759 			*cpuset = vm_debug_cpus(sc->vm);
760 		else
761 			error = EINVAL;
762 		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
763 			error = ERANGE;
764 		if (error == 0)
765 			error = copyout(cpuset, vm_cpuset->cpus, size);
766 		free(cpuset, M_TEMP);
767 		break;
768 	}
769 	case VM_SUSPEND_CPU:
770 		error = vm_suspend_cpu(sc->vm, vcpu);
771 		break;
772 	case VM_RESUME_CPU:
773 		error = vm_resume_cpu(sc->vm, vcpu);
774 		break;
775 	case VM_SET_TOPOLOGY: {
776 		struct vm_cpu_topology *topology;
777 
778 		topology = (struct vm_cpu_topology *)data;
779 		error = vm_set_topology(sc->vm, topology->sockets,
780 		    topology->cores, topology->threads, topology->maxcpus);
781 		break;
782 	}
783 	case VM_GET_TOPOLOGY: {
784 		struct vm_cpu_topology *topology;
785 
786 		topology = (struct vm_cpu_topology *)data;
787 		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
788 		    &topology->threads, &topology->maxcpus);
789 		error = 0;
790 		break;
791 	}
792 	default:
793 		error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag,
794 		    td);
795 		break;
796 	}
797 
798 	if ((ioctl->flags &
799 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
800 		vm_unlock_memsegs(sc->vm);
801 	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0)
802 		vcpu_unlock_all(sc);
803 	else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0)
804 		vcpu_unlock_one(vcpu);
805 
806 	/*
807 	 * Make sure that no handler returns a kernel-internal
808 	 * error value to userspace.
809 	 */
810 	KASSERT(error == ERESTART || error >= 0,
811 	    ("vmmdev_ioctl: invalid error return %d", error));
812 	return (error);
813 
814 lockfail:
815 	if ((ioctl->flags &
816 	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
817 		vm_unlock_memsegs(sc->vm);
818 	return (error);
819 }
820 
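/*
 * mmap handler for /dev/vmm/<name>.  The file offset is interpreted as a
 * guest physical address; the request must fall entirely within a single
 * mapping of a system memory segment, whose backing VM object is returned.
 */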
821 static int
822 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
823     struct vm_object **objp, int nprot)
824 {
825 	struct vmmdev_softc *sc;
826 	vm_paddr_t gpa;
827 	size_t len;
828 	vm_ooffset_t segoff, first, last;
829 	int error, found, segid;
830 	bool sysmem;
831 
832 	first = *offset;
833 	last = first + mapsize;
834 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
835 		return (EINVAL);
836 
837 	sc = vmmdev_lookup2(cdev);
838 	if (sc == NULL) {
839 		/* virtual machine is in the process of being created */
840 		return (EINVAL);
841 	}
842 
843 	/*
844 	 * Get a read lock on the guest memory map.
845 	 */
846 	vm_slock_memsegs(sc->vm);
847 
848 	gpa = 0;
849 	found = 0;
850 	while (!found) {
851 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
852 		    NULL, NULL);
853 		if (error)
854 			break;
855 
856 		if (first >= gpa && last <= gpa + len)
857 			found = 1;
858 		else
859 			gpa += len;
860 	}
861 
862 	if (found) {
863 		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
864 		KASSERT(error == 0 && *objp != NULL,
865 		    ("%s: invalid memory segment %d", __func__, segid));
866 		if (sysmem) {
867 			vm_object_reference(*objp);
868 			*offset = segoff + (first - gpa);
869 		} else {
870 			error = EINVAL;
871 		}
872 	}
873 	vm_unlock_memsegs(sc->vm);
874 	return (error);
875 }
876 
877 static void
878 vmmdev_destroy(struct vmmdev_softc *sc)
879 {
880 	struct devmem_softc *dsc;
881 	int error __diagused;
882 
883 	KASSERT(sc->cdev == NULL, ("%s: cdev not free", __func__));
884 	KASSERT(sc->ucred != NULL, ("%s: missing ucred", __func__));
885 
886 	/*
887 	 * Destroy all cdevs:
888 	 *
889 	 * - any new operations on the 'cdev' will return an error (ENXIO).
890 	 *
891 	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
892 	 */
893 	SLIST_FOREACH(dsc, &sc->devmem, link) {
894 		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
895 		devmem_destroy(dsc);
896 	}
897 
898 	vm_disable_vcpu_creation(sc->vm);
899 	error = vcpu_lock_all(sc);
900 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
901 	vm_unlock_vcpus(sc->vm);
902 
903 	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
904 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
905 		SLIST_REMOVE_HEAD(&sc->devmem, link);
906 		free(dsc->name, M_VMMDEV);
907 		free(dsc, M_VMMDEV);
908 	}
909 
910 	if (sc->vm != NULL)
911 		vm_destroy(sc->vm);
912 
913 	chgvmmcnt(sc->ucred->cr_ruidinfo, -1, 0);
914 	crfree(sc->ucred);
915 
916 	sx_xlock(&vmmdev_mtx);
917 	SLIST_REMOVE(&head, sc, vmmdev_softc, link);
918 	if ((sc->flags & VMMCTL_CREATE_DESTROY_ON_CLOSE) != 0)
919 		LIST_REMOVE(sc, priv_link);
920 	sx_xunlock(&vmmdev_mtx);
921 	wakeup(sc);
922 	free(sc, M_VMMDEV);
923 }
924 
925 static int
926 vmmdev_lookup_and_destroy(const char *name, struct ucred *cred)
927 {
928 	struct cdev *cdev;
929 	struct vmmdev_softc *sc;
930 
931 	sx_xlock(&vmmdev_mtx);
932 	sc = vmmdev_lookup(name, cred);
933 	if (sc == NULL || sc->cdev == NULL) {
934 		sx_xunlock(&vmmdev_mtx);
935 		return (EINVAL);
936 	}
937 
938 	/*
939 	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
940 	 * is scheduled for destruction.
941 	 */
942 	cdev = sc->cdev;
943 	sc->cdev = NULL;
944 	sx_xunlock(&vmmdev_mtx);
945 
946 	(void)vm_suspend(sc->vm, VM_SUSPEND_DESTROY);
947 	destroy_dev(cdev);
948 	vmmdev_destroy(sc);
949 
950 	return (0);
951 }
952 
953 static int
954 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
955 {
956 	char *buf;
957 	int error, buflen;
958 
959 	error = vmm_priv_check(req->td->td_ucred);
960 	if (error)
961 		return (error);
962 
963 	buflen = VM_MAX_NAMELEN + 1;
964 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
965 	error = sysctl_handle_string(oidp, buf, buflen, req);
966 	if (error == 0 && req->newptr != NULL)
967 		error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred);
968 	free(buf, M_VMMDEV);
969 	return (error);
970 }
971 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
972     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
973     NULL, 0, sysctl_vmm_destroy, "A",
974     "Destroy a vmm(4) instance (legacy interface)");
975 
976 static struct cdevsw vmmdevsw = {
977 	.d_name		= "vmmdev",
978 	.d_version	= D_VERSION,
979 	.d_open		= vmmdev_open,
980 	.d_ioctl	= vmmdev_ioctl,
981 	.d_mmap_single	= vmmdev_mmap_single,
982 	.d_read		= vmmdev_rw,
983 	.d_write	= vmmdev_rw,
984 };
985 
986 static struct vmmdev_softc *
987 vmmdev_alloc(struct vm *vm, struct ucred *cred)
988 {
989 	struct vmmdev_softc *sc;
990 
991 	sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO);
992 	SLIST_INIT(&sc->devmem);
993 	sc->vm = vm;
994 	sc->ucred = crhold(cred);
995 	return (sc);
996 }
997 
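/*
 * Create a new VM: allocate the vm instance and softc, create the
 * /dev/vmm/<name> node owned by the creating credential, and charge the
 * instance against the creator's per-uid VM limit (hw.vmm.maxvmms).
 */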
998 static int
999 vmmdev_create(const char *name, uint32_t flags, struct ucred *cred)
1000 {
1001 	struct make_dev_args mda;
1002 	struct cdev *cdev;
1003 	struct vmmdev_softc *sc;
1004 	struct vmmctl_priv *priv;
1005 	struct vm *vm;
1006 	int error;
1007 
1008 	if (name == NULL || strlen(name) > VM_MAX_NAMELEN)
1009 		return (EINVAL);
1010 
1011 	if ((flags & ~VMMCTL_FLAGS_MASK) != 0)
1012 		return (EINVAL);
1013 	error = devfs_get_cdevpriv((void **)&priv);
1014 	if (error)
1015 		return (error);
1016 
1017 	sx_xlock(&vmmdev_mtx);
1018 	sc = vmmdev_lookup(name, cred);
1019 	if (sc != NULL) {
1020 		sx_xunlock(&vmmdev_mtx);
1021 		return (EEXIST);
1022 	}
1023 
1024 	error = vm_create(name, &vm);
1025 	if (error != 0) {
1026 		sx_xunlock(&vmmdev_mtx);
1027 		return (error);
1028 	}
1029 	sc = vmmdev_alloc(vm, cred);
1030 	SLIST_INSERT_HEAD(&head, sc, link);
1031 	sc->flags = flags;
1032 	if ((flags & VMMCTL_CREATE_DESTROY_ON_CLOSE) != 0)
1033 		LIST_INSERT_HEAD(&priv->softcs, sc, priv_link);
1034 
1035 	make_dev_args_init(&mda);
1036 	mda.mda_devsw = &vmmdevsw;
1037 	mda.mda_cr = sc->ucred;
1038 	mda.mda_uid = UID_ROOT;
1039 	mda.mda_gid = GID_WHEEL;
1040 	mda.mda_mode = 0600;
1041 	mda.mda_si_drv1 = sc;
1042 	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1043 	error = make_dev_s(&mda, &cdev, "vmm/%s", name);
1044 	if (error != 0) {
1045 		sx_xunlock(&vmmdev_mtx);
1046 		vmmdev_destroy(sc);
1047 		return (error);
1048 	}
1049 	if (!chgvmmcnt(cred->cr_ruidinfo, 1, vm_maxvmms)) {
1050 		sx_xunlock(&vmmdev_mtx);
1051 		destroy_dev(cdev);
1052 		vmmdev_destroy(sc);
1053 		return (ENOMEM);
1054 	}
1055 	sc->cdev = cdev;
1056 	sx_xunlock(&vmmdev_mtx);
1057 	return (0);
1058 }
1059 
1060 static int
1061 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
1062 {
1063 	char *buf;
1064 	int error, buflen;
1065 
1066 	if (!vmm_initialized)
1067 		return (ENXIO);
1068 
1069 	error = vmm_priv_check(req->td->td_ucred);
1070 	if (error != 0)
1071 		return (error);
1072 
1073 	buflen = VM_MAX_NAMELEN + 1;
1074 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
1075 	error = sysctl_handle_string(oidp, buf, buflen, req);
1076 	if (error == 0 && req->newptr != NULL)
1077 		error = vmmdev_create(buf, 0, req->td->td_ucred);
1078 	free(buf, M_VMMDEV);
1079 	return (error);
1080 }
1081 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
1082     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
1083     NULL, 0, sysctl_vmm_create, "A",
1084     "Create a vmm(4) instance (legacy interface)");
1085 
1086 static void
1087 vmmctl_dtor(void *arg)
1088 {
1089 	struct cdev *sc_cdev;
1090 	struct vmmdev_softc *sc;
1091 	struct vmmctl_priv *priv = arg;
1092 
1093 	/*
1094 	 * Scan the softc list for any VMs associated with
1095 	 * the current descriptor and destroy them.
1096 	 */
1097 	sx_xlock(&vmmdev_mtx);
1098 	while (!LIST_EMPTY(&priv->softcs)) {
1099 		sc = LIST_FIRST(&priv->softcs);
1100 		sc_cdev = sc->cdev;
1101 		if (sc_cdev != NULL) {
1102 			sc->cdev = NULL;
1103 		} else {
1104 			/*
1105 			 * Another thread has already
1106 			 * started the removal process.
1107 			 * Sleep until 'vmmdev_destroy' notifies us
1108 			 * that the removal has finished.
1109 			 */
1110 			sx_sleep(sc, &vmmdev_mtx, 0, "vmmctl_dtor", 0);
1111 			continue;
1112 		}
1113 		/*
1114 		 * Temporarily drop the lock to allow vmmdev_destroy to run.
1115 		 */
1116 		sx_xunlock(&vmmdev_mtx);
1117 		(void)vm_suspend(sc->vm, VM_SUSPEND_DESTROY);
1118 		destroy_dev(sc_cdev);
1119 		/* vmmdev_destroy will unlink the 'priv_link' entry. */
1120 		vmmdev_destroy(sc);
1121 		sx_xlock(&vmmdev_mtx);
1122 	}
1123 	sx_xunlock(&vmmdev_mtx);
1124 
1125 	free(priv, M_VMMDEV);
1126 }
1127 
1128 static int
1129 vmmctl_open(struct cdev *cdev, int flags, int fmt, struct thread *td)
1130 {
1131 	int error;
1132 	struct vmmctl_priv *priv;
1133 
1134 	error = vmm_priv_check(td->td_ucred);
1135 	if (error != 0)
1136 		return (error);
1137 
1138 	if ((flags & FWRITE) == 0)
1139 		return (EPERM);
1140 
1141 	priv = malloc(sizeof(*priv), M_VMMDEV, M_WAITOK | M_ZERO);
1142 	LIST_INIT(&priv->softcs);
1143 	error = devfs_set_cdevpriv(priv, vmmctl_dtor);
1144 	if (error != 0) {
1145 		free(priv, M_VMMDEV);
1146 		return (error);
1147 	}
1148 
1149 	return (0);
1150 }
1151 
1152 static int
1153 vmmctl_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
1154     struct thread *td)
1155 {
1156 	int error;
1157 
1158 	switch (cmd) {
1159 	case VMMCTL_VM_CREATE: {
1160 		struct vmmctl_vm_create *vmc;
1161 
1162 		vmc = (struct vmmctl_vm_create *)data;
1163 		vmc->name[VM_MAX_NAMELEN] = '\0';
1164 		for (size_t i = 0; i < nitems(vmc->reserved); i++) {
1165 			if (vmc->reserved[i] != 0) {
1166 				error = EINVAL;
1167 				return (error);
1168 			}
1169 		}
1170 
1171 		error = vmmdev_create(vmc->name, vmc->flags, td->td_ucred);
1172 		break;
1173 	}
1174 	case VMMCTL_VM_DESTROY: {
1175 		struct vmmctl_vm_destroy *vmd;
1176 
1177 		vmd = (struct vmmctl_vm_destroy *)data;
1178 		vmd->name[VM_MAX_NAMELEN] = '\0';
1179 		for (size_t i = 0; i < nitems(vmd->reserved); i++) {
1180 			if (vmd->reserved[i] != 0) {
1181 				error = EINVAL;
1182 				return (error);
1183 			}
1184 		}
1185 
1186 		error = vmmdev_lookup_and_destroy(vmd->name, td->td_ucred);
1187 		break;
1188 	}
1189 	default:
1190 		error = ENOTTY;
1191 		break;
1192 	}
1193 
1194 	return (error);
1195 }
1196 
1197 static struct cdev *vmmctl_cdev;
1198 static struct cdevsw vmmctlsw = {
1199 	.d_name		= "vmmctl",
1200 	.d_version	= D_VERSION,
1201 	.d_open		= vmmctl_open,
1202 	.d_ioctl	= vmmctl_ioctl,
1203 };
1204 
1205 static int
1206 vmmdev_init(void)
1207 {
1208 	int error;
1209 
1210 	sx_xlock(&vmmdev_mtx);
1211 	error = make_dev_p(MAKEDEV_CHECKNAME, &vmmctl_cdev, &vmmctlsw, NULL,
1212 	    UID_ROOT, GID_WHEEL, 0600, "vmmctl");
1213 	if (error == 0)
1214 		pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
1215 		    "Allow use of vmm in a jail.");
1216 	sx_xunlock(&vmmdev_mtx);
1217 
1218 	return (error);
1219 }
1220 
1221 static int
1222 vmmdev_cleanup(void)
1223 {
1224 	sx_xlock(&vmmdev_mtx);
1225 	if (!SLIST_EMPTY(&head)) {
1226 		sx_xunlock(&vmmdev_mtx);
1227 		return (EBUSY);
1228 	}
1229 	if (vmmctl_cdev != NULL) {
1230 		destroy_dev(vmmctl_cdev);
1231 		vmmctl_cdev = NULL;
1232 	}
1233 	sx_xunlock(&vmmdev_mtx);
1234 
1235 	return (0);
1236 }
1237 
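/*
 * Module event handler.  On MOD_LOAD, create /dev/vmmctl, clamp vm_maxcpu to
 * VM_MAXCPU, and run the machine-dependent initialization; on MOD_UNLOAD,
 * refuse to unload while any VM exists and otherwise tear everything down.
 */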
1238 static int
1239 vmm_handler(module_t mod, int what, void *arg)
1240 {
1241 	int error;
1242 
1243 	switch (what) {
1244 	case MOD_LOAD:
1245 		error = vmmdev_init();
1246 		if (error != 0)
1247 			break;
1248 
1249 		vm_maxcpu = mp_ncpus;
1250 		TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
1251 		if (vm_maxcpu > VM_MAXCPU) {
1252 			printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
1253 			vm_maxcpu = VM_MAXCPU;
1254 		}
1255 		if (vm_maxcpu == 0)
1256 			vm_maxcpu = 1;
1257 		vm_maxvmms = 4 * mp_ncpus;
1258 		error = vmm_modinit();
1259 		if (error == 0)
1260 			vmm_initialized = true;
1261 		else {
1262 			error = vmmdev_cleanup();
1263 			KASSERT(error == 0,
1264 			    ("%s: vmmdev_cleanup failed: %d", __func__, error));
1265 		}
1266 		break;
1267 	case MOD_UNLOAD:
1268 		error = vmmdev_cleanup();
1269 		if (error == 0 && vmm_initialized) {
1270 			error = vmm_modcleanup();
1271 			if (error) {
1272 				/*
1273 				 * Something bad happened - prevent new
1274 				 * VMs from being created
1275 				 */
1276 				vmm_initialized = false;
1277 			}
1278 		}
1279 		break;
1280 	default:
1281 		error = 0;
1282 		break;
1283 	}
1284 	return (error);
1285 }
1286 
1287 static moduledata_t vmm_kmod = {
1288 	"vmm",
1289 	vmm_handler,
1290 	NULL
1291 };
1292 
1293 /*
1294  * vmm initialization has the following dependencies:
1295  *
1296  * - Initialization requires smp_rendezvous() and therefore must happen
1297  *   after SMP is fully functional (after SI_SUB_SMP).
1298  * - vmm device initialization requires an initialized devfs.
1299  */
1300 DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY);
1301 MODULE_VERSION(vmm, 1);
1302 
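/*
 * mmap handler for a devmem cdev (/dev/vmm.io/<vmname>.<segname>).  The
 * requested range must lie entirely within the device memory segment.
 */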
1303 static int
1304 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
1305     struct vm_object **objp, int nprot)
1306 {
1307 	struct devmem_softc *dsc;
1308 	vm_ooffset_t first, last;
1309 	size_t seglen;
1310 	int error;
1311 	bool sysmem;
1312 
1313 	dsc = cdev->si_drv1;
1314 	if (dsc == NULL) {
1315 		/* 'cdev' has been created but is not ready for use */
1316 		return (ENXIO);
1317 	}
1318 
1319 	first = *offset;
1320 	last = *offset + len;
1321 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
1322 		return (EINVAL);
1323 
1324 	vm_slock_memsegs(dsc->sc->vm);
1325 
1326 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
1327 	KASSERT(error == 0 && !sysmem && *objp != NULL,
1328 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
1329 
1330 	if (seglen >= last)
1331 		vm_object_reference(*objp);
1332 	else
1333 		error = EINVAL;
1334 
1335 	vm_unlock_memsegs(dsc->sc->vm);
1336 	return (error);
1337 }
1338 
1339 static struct cdevsw devmemsw = {
1340 	.d_name		= "devmem",
1341 	.d_version	= D_VERSION,
1342 	.d_mmap_single	= devmem_mmap_single,
1343 };
1344 
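/*
 * Create the /dev/vmm.io/<vmname>.<segname> cdev for a device memory
 * segment.  On success ownership of 'devname' passes to the devmem softc;
 * on failure the name and softc are freed here.
 */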
1345 static int
1346 devmem_create_cdev(struct vmmdev_softc *sc, int segid, char *devname)
1347 {
1348 	struct make_dev_args mda;
1349 	struct devmem_softc *dsc;
1350 	int error;
1351 
1352 	sx_xlock(&vmmdev_mtx);
1353 
1354 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1355 	dsc->segid = segid;
1356 	dsc->name = devname;
1357 	dsc->sc = sc;
1358 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
1359 
1360 	make_dev_args_init(&mda);
1361 	mda.mda_devsw = &devmemsw;
1362 	mda.mda_cr = sc->ucred;
1363 	mda.mda_uid = UID_ROOT;
1364 	mda.mda_gid = GID_WHEEL;
1365 	mda.mda_mode = 0600;
1366 	mda.mda_si_drv1 = dsc;
1367 	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1368 	error = make_dev_s(&mda, &dsc->cdev, "vmm.io/%s.%s", vm_name(sc->vm),
1369 	    devname);
1370 	if (error != 0) {
1371 		SLIST_REMOVE(&sc->devmem, dsc, devmem_softc, link);
1372 		free(dsc->name, M_VMMDEV);
1373 		free(dsc, M_VMMDEV);
1374 	}
1375 
1376 	sx_xunlock(&vmmdev_mtx);
1377 
1378 	return (error);
1379 }
1380 
1381 static void
1382 devmem_destroy(void *arg)
1383 {
1384 	struct devmem_softc *dsc = arg;
1385 
1386 	destroy_dev(dsc->cdev);
1387 	dsc->cdev = NULL;
1388 	dsc->sc = NULL;
1389 }
1390