/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
 * All rights reserved.
 */

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/ioccom.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/ucred.h>
#include <sys/uio.h>

#include <machine/vmm.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <dev/vmm/vmm_dev.h>
#include <dev/vmm/vmm_mem.h>
#include <dev/vmm/vmm_stat.h>

#ifdef __amd64__
#ifdef COMPAT_FREEBSD12
struct vm_memseg_12 {
	int		segid;
	size_t		len;
	char		name[64];
};
_Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI");

#define	VM_ALLOC_MEMSEG_12	\
	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12)
#define	VM_GET_MEMSEG_12	\
	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12)
#endif /* COMPAT_FREEBSD12 */
#ifdef COMPAT_FREEBSD14
struct vm_memseg_14 {
	int		segid;
	size_t		len;
	char		name[VM_MAX_SUFFIXLEN + 1];
};
_Static_assert(sizeof(struct vm_memseg_14) == (VM_MAX_SUFFIXLEN + 1 + 16),
    "COMPAT_FREEBSD14 ABI");

#define	VM_ALLOC_MEMSEG_14	\
	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_14)
#define	VM_GET_MEMSEG_14	\
	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_14)
#endif /* COMPAT_FREEBSD14 */
#endif /* __amd64__ */

struct devmem_softc {
	int	segid;
	char	*name;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	struct ucred	*ucred;
	SLIST_ENTRY(vmmdev_softc) link;
	SLIST_HEAD(, devmem_softc) devmem;
	int		flags;
};

static bool vmm_initialized = false;

static SLIST_HEAD(, vmmdev_softc) head;

static unsigned pr_allow_flag;
static struct sx vmmdev_mtx;
SX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex");

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

u_int vm_maxcpu;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &vm_maxcpu, 0, "Maximum number of vCPUs");

static void devmem_destroy(void *arg);
static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem);

static int
vmm_priv_check(struct ucred *ucred)
{
	if (jailed(ucred) &&
	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
		return (EPERM);

	return (0);
}

static int
vcpu_lock_one(struct vcpu *vcpu)
{
	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
}

static void
vcpu_unlock_one(struct vcpu *vcpu)
{
	enum vcpu_state state;

	state = vcpu_get_state(vcpu, NULL);
	if (state != VCPU_FROZEN) {
		panic("vcpu %s(%d) has invalid state %d",
		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
	}

	vcpu_set_state(vcpu, VCPU_IDLE, false);
}

#ifndef __amd64__
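/*
 * Freeze every vCPU in the VM.  If any vCPU cannot be locked, the vCPUs that
 * were already frozen are unlocked again and the error is returned.
 */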
static int
vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate)
{
	struct vcpu *vcpu;
	int error;
	uint16_t i, j, maxcpus;

	error = 0;
	maxcpus = vm_get_maxcpus(vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(vm, i);
		if (vcpu == NULL)
			continue;
		error = vcpu_lock_one(vcpu);
		if (error)
			break;
	}

	if (error) {
		for (j = 0; j < i; j++) {
			vcpu = vm_vcpu(vm, j);
			if (vcpu == NULL)
				continue;
			vcpu_unlock_one(vcpu);
		}
	}

	return (error);
}
#endif

static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
	int error;

	/*
	 * Serialize vcpu_lock_all() callers.  Individual vCPUs are not locked
	 * in a consistent order so we need to serialize to avoid deadlocks.
	 */
	vm_lock_vcpus(sc->vm);
	error = vcpu_set_state_all(sc->vm, VCPU_FROZEN);
	if (error != 0)
		vm_unlock_vcpus(sc->vm);
	return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
	struct vcpu *vcpu;
	uint16_t i, maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(sc->vm, i);
		if (vcpu == NULL)
			continue;
		vcpu_unlock_one(vcpu);
	}
	vm_unlock_vcpus(sc->vm);
}

static struct vmmdev_softc *
vmmdev_lookup(const char *name, struct ucred *cred)
{
	struct vmmdev_softc *sc;

	sx_assert(&vmmdev_mtx, SA_XLOCKED);

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	if (sc == NULL)
		return (NULL);

	if (cr_cansee(cred, sc->ucred))
		return (NULL);

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{
	return (cdev->si_drv1);
}

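/*
 * read(2)/write(2) handler for /dev/vmm/<name>.  The file offset is
 * interpreted as a guest physical address; data is copied at most one page
 * at a time, and reads of unbacked addresses below the top of guest system
 * memory return zeroes.
 */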
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa, maxaddr;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	error = 0;
	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	maxaddr = vmm_sysmem_maxaddr(sc->vm);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map.  If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
				error = uiomove(__DECONST(void *, zero_region),
				    c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);

static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		SLIST_FOREACH(dsc, &sc->devmem, link) {
			if (dsc->segid == mseg->segid)
				break;
		}
		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
		    __func__, mseg->segid));
		error = copystr(dsc->name, mseg->name, len, NULL);
	} else {
		bzero(mseg->name, len);
	}

	return (error);
}

static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len,
    struct domainset *domainset)
{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	/*
	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
	 * be stripped off when devfs processes the full string.
	 */
	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
		name = malloc(len, M_VMMDEV, M_WAITOK);
		error = copystr(mseg->name, name, len, NULL);
		if (error)
			goto done;
	}
	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem,
	    domainset);
	if (error)
		goto done;

	if (VM_MEMSEG_NAME(mseg)) {
		error = devmem_create_cdev(sc, mseg->segid, name);
		if (error)
			vm_free_memseg(sc->vm, mseg->segid);
		else
			name = NULL;	/* freed when 'cdev' is destroyed */
	}
done:
	free(name, M_VMMDEV);
	return (error);
}

#if defined(__amd64__) && \
    (defined(COMPAT_FREEBSD14) || defined(COMPAT_FREEBSD12))
/*
 * Translate pre-15.0 memory segment identifiers into their 15.0 counterparts.
 */
static void
adjust_segid(struct vm_memseg *mseg)
{
	if (mseg->segid != VM_SYSMEM) {
		mseg->segid += (VM_BOOTROM - 1);
	}
}
#endif

static int
vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_get_register(vcpu, regnum[i], &regval[i]);
		if (error)
			break;
	}
	return (error);
}

static int
vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_set_register(vcpu, regnum[i], regval[i]);
		if (error)
			break;
	}
	return (error);
}

static int
vmmdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	int error;

	/*
	 * A jail without vmm access shouldn't be able to access vmm device
	 * files at all, but check here just to be thorough.
	 */
	error = vmm_priv_check(td->td_ucred);
	if (error != 0)
		return (error);

	return (0);
}

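/*
 * Machine-independent ioctl handlers, together with the locking that each
 * command requires.  Commands not found here are looked up in the
 * machine-dependent vmmdev_machdep_ioctls[] table.
 */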
static const struct vmmdev_ioctl vmmdev_ioctls[] = {
	VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
	VMMDEV_IOCTL(VM_STAT_DESC, 0),

#ifdef __amd64__
#ifdef COMPAT_FREEBSD12
	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
#endif
#ifdef COMPAT_FREEBSD14
	VMMDEV_IOCTL(VM_ALLOC_MEMSEG_14,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
#endif
#endif /* __amd64__ */
	VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
	VMMDEV_IOCTL(VM_MMAP_MEMSEG,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
	VMMDEV_IOCTL(VM_MUNMAP_MEMSEG,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
	VMMDEV_IOCTL(VM_REINIT,
	    VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),

#ifdef __amd64__
#if defined(COMPAT_FREEBSD12)
	VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
#endif
#ifdef COMPAT_FREEBSD14
	VMMDEV_IOCTL(VM_GET_MEMSEG_14, VMMDEV_IOCTL_SLOCK_MEMSEGS),
#endif
#endif /* __amd64__ */
	VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
	VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),

	VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
	VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),

	VMMDEV_IOCTL(VM_SUSPEND, 0),
	VMMDEV_IOCTL(VM_GET_CPUS, 0),
	VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0),
	VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0),
};

static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{
	struct vmmdev_softc *sc;
	struct vcpu *vcpu;
	const struct vmmdev_ioctl *ioctl;
	struct vm_memseg *mseg;
	int error, vcpuid;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	ioctl = NULL;
	for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) {
		if (vmmdev_ioctls[i].cmd == cmd) {
			ioctl = &vmmdev_ioctls[i];
			break;
		}
	}
	if (ioctl == NULL) {
		for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) {
			if (vmmdev_machdep_ioctls[i].cmd == cmd) {
				ioctl = &vmmdev_machdep_ioctls[i];
				break;
			}
		}
	}
	if (ioctl == NULL)
		return (ENOTTY);

	if ((ioctl->flags & VMMDEV_IOCTL_PRIV_CHECK_DRIVER) != 0) {
		error = priv_check(td, PRIV_DRIVER);
		if (error != 0)
			return (error);
	}

	if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0)
		vm_xlock_memsegs(sc->vm);
	else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0)
		vm_slock_memsegs(sc->vm);

	vcpu = NULL;
	vcpuid = -1;
	if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU |
	    VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) {
		vcpuid = *(int *)data;
		if (vcpuid == -1) {
			if ((ioctl->flags &
			    VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) {
				error = EINVAL;
				goto lockfail;
			}
		} else {
			vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
			if (vcpu == NULL) {
				error = EINVAL;
				goto lockfail;
			}
			if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) {
				error = vcpu_lock_one(vcpu);
				if (error)
					goto lockfail;
			}
		}
	}
	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) {
		error = vcpu_lock_all(sc);
		if (error)
			goto lockfail;
	}

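	/*
	 * The locks required by this command are now held; dispatch it.
	 * Commands without a handler below fall through to the
	 * machine-dependent handler in the default case.
	 */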
	switch (cmd) {
	case VM_SUSPEND: {
		struct vm_suspend *vmsuspend;

		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	}
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	case VM_STAT_DESC: {
		struct vm_stat_desc *statdesc;

		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index, statdesc->desc,
		    sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		struct vm_stats *vmstats;

		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(vcpu, vmstats->index,
		    nitems(vmstats->statbuf), &vmstats->num_entries,
		    vmstats->statbuf);
		break;
	}
	case VM_MMAP_GETNEXT: {
		struct vm_memmap *mm;

		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	}
	case VM_MMAP_MEMSEG: {
		struct vm_memmap *mm;

		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
	}
	case VM_MUNMAP_MEMSEG: {
		struct vm_munmap *mu;

		mu = (struct vm_munmap *)data;
		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
		break;
	}
#ifdef __amd64__
#ifdef COMPAT_FREEBSD12
	case VM_ALLOC_MEMSEG_12:
		mseg = (struct vm_memseg *)data;

		adjust_segid(mseg);
		error = alloc_memseg(sc, mseg,
		    sizeof(((struct vm_memseg_12 *)0)->name), NULL);
		break;
	case VM_GET_MEMSEG_12:
		mseg = (struct vm_memseg *)data;

		adjust_segid(mseg);
		error = get_memseg(sc, mseg,
		    sizeof(((struct vm_memseg_12 *)0)->name));
		break;
#endif /* COMPAT_FREEBSD12 */
#ifdef COMPAT_FREEBSD14
	case VM_ALLOC_MEMSEG_14:
		mseg = (struct vm_memseg *)data;

		adjust_segid(mseg);
		error = alloc_memseg(sc, mseg,
		    sizeof(((struct vm_memseg_14 *)0)->name), NULL);
		break;
	case VM_GET_MEMSEG_14:
		mseg = (struct vm_memseg *)data;

		adjust_segid(mseg);
		error = get_memseg(sc, mseg,
		    sizeof(((struct vm_memseg_14 *)0)->name));
		break;
#endif /* COMPAT_FREEBSD14 */
#endif /* __amd64__ */
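	/*
	 * An optional NUMA allocation policy may accompany the segment: when
	 * ds_policy and ds_mask are supplied, the mask is copied in,
	 * validated and converted into a struct domainset that is used for
	 * the allocation.
	 */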
	case VM_ALLOC_MEMSEG: {
		domainset_t *mask;
		struct domainset *domainset, domain;

		domainset = NULL;
		mseg = (struct vm_memseg *)data;
		if (mseg->ds_policy != DOMAINSET_POLICY_INVALID &&
		    mseg->ds_mask != NULL) {
			if (mseg->ds_mask_size < sizeof(domainset_t) ||
			    mseg->ds_mask_size > DOMAINSET_MAXSIZE / NBBY) {
				error = ERANGE;
				break;
			}
			memset(&domain, 0, sizeof(domain));
			mask = malloc(mseg->ds_mask_size, M_VMMDEV, M_WAITOK);
			error = copyin(mseg->ds_mask, mask, mseg->ds_mask_size);
			if (error) {
				free(mask, M_VMMDEV);
				break;
			}
			error = domainset_populate(&domain, mask,
			    mseg->ds_policy, mseg->ds_mask_size);
			free(mask, M_VMMDEV);
			if (error)
				break;
			domainset = domainset_create(&domain);
			if (domainset == NULL) {
				error = EINVAL;
				break;
			}
		}
		error = alloc_memseg(sc, mseg, sizeof(mseg->name), domainset);
		break;
	}
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
	case VM_GET_REGISTER: {
		struct vm_register *vmreg;

		vmreg = (struct vm_register *)data;
		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
		break;
	}
	case VM_SET_REGISTER: {
		struct vm_register *vmreg;

		vmreg = (struct vm_register *)data;
		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
		break;
	}
	case VM_GET_REGISTER_SET: {
		struct vm_register_set *vmregset;
		uint64_t *regvals;
		int *regnums;

		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = mallocarray(vmregset->count, sizeof(regvals[0]),
		    M_VMMDEV, M_WAITOK);
		regnums = mallocarray(vmregset->count, sizeof(regnums[0]),
		    M_VMMDEV, M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = vm_get_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		if (error == 0)
			error = copyout(regvals, vmregset->regvals,
			    sizeof(regvals[0]) * vmregset->count);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	}
	case VM_SET_REGISTER_SET: {
		struct vm_register_set *vmregset;
		uint64_t *regvals;
		int *regnums;

		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = mallocarray(vmregset->count, sizeof(regvals[0]),
		    M_VMMDEV, M_WAITOK);
		regnums = mallocarray(vmregset->count, sizeof(regnums[0]),
		    M_VMMDEV, M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = copyin(vmregset->regvals, regvals,
			    sizeof(regvals[0]) * vmregset->count);
		if (error == 0)
			error = vm_set_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	}
	case VM_GET_CAPABILITY: {
		struct vm_capability *vmcap;

		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval);
		break;
	}
	case VM_SET_CAPABILITY: {
		struct vm_capability *vmcap;

		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval);
		break;
	}
	case VM_ACTIVATE_CPU:
		error = vm_activate_cpu(vcpu);
		break;
	case VM_GET_CPUS: {
		struct vm_cpuset *vm_cpuset;
		cpuset_t *cpuset;
		int size;

		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
		    M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else if (vm_cpuset->which == VM_DEBUG_CPUS)
			*cpuset = vm_debug_cpus(sc->vm);
		else
			error = EINVAL;
		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
			error = ERANGE;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	}
	case VM_SUSPEND_CPU:
		error = vm_suspend_cpu(sc->vm, vcpu);
		break;
	case VM_RESUME_CPU:
		error = vm_resume_cpu(sc->vm, vcpu);
		break;
	case VM_SET_TOPOLOGY: {
		struct vm_cpu_topology *topology;

		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	}
	case VM_GET_TOPOLOGY: {
		struct vm_cpu_topology *topology;

		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
	}
	default:
		error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag,
		    td);
		break;
	}

	if ((ioctl->flags &
	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
		vm_unlock_memsegs(sc->vm);
	if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0)
		vcpu_unlock_all(sc);
	else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0)
		vcpu_unlock_one(vcpu);

	/*
	 * Make sure that no handler returns a kernel-internal
	 * error value to userspace.
	 */
	KASSERT(error == ERESTART || error >= 0,
	    ("vmmdev_ioctl: invalid error return %d", error));
	return (error);

lockfail:
	if ((ioctl->flags &
	    (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0)
		vm_unlock_memsegs(sc->vm);
	return (error);
}

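/*
 * d_mmap_single handler for /dev/vmm/<name>: find the system memory segment
 * backing the requested range of guest physical addresses and return a
 * reference to its VM object.
 */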
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
	struct vmmdev_softc *sc;
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff, first, last;
	int error, found, segid;
	bool sysmem;

	first = *offset;
	last = first + mapsize;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		/* virtual machine is in the process of being created */
		return (EINVAL);
	}

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	gpa = 0;
	found = 0;
	while (!found) {
		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
		    NULL, NULL);
		if (error)
			break;

		if (first >= gpa && last <= gpa + len)
			found = 1;
		else
			gpa += len;
	}

	if (found) {
		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
		KASSERT(error == 0 && *objp != NULL,
		    ("%s: invalid memory segment %d", __func__, segid));
		if (sysmem) {
			vm_object_reference(*objp);
			*offset = segoff + (first - gpa);
		} else {
			error = EINVAL;
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}

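/*
 * Tear down a virtual machine instance: destroy the devmem cdevs, freeze the
 * vCPUs, destroy the VM itself and finally unlink and free the softc.
 */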
static void
vmmdev_destroy(struct vmmdev_softc *sc)
{
	struct devmem_softc *dsc;
	int error __diagused;

	KASSERT(sc->cdev == NULL, ("%s: cdev not free", __func__));

	/*
	 * Destroy all cdevs:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		devmem_destroy(dsc);
	}

	vm_disable_vcpu_creation(sc->vm);
	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
	vm_unlock_vcpus(sc->vm);

	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if (sc->ucred != NULL)
		crfree(sc->ucred);

	sx_xlock(&vmmdev_mtx);
	SLIST_REMOVE(&head, sc, vmmdev_softc, link);
	sx_xunlock(&vmmdev_mtx);
	free(sc, M_VMMDEV);
}

static int
vmmdev_lookup_and_destroy(const char *name, struct ucred *cred)
{
	struct cdev *cdev;
	struct vmmdev_softc *sc;

	sx_xlock(&vmmdev_mtx);
	sc = vmmdev_lookup(name, cred);
	if (sc == NULL || sc->cdev == NULL) {
		sx_xunlock(&vmmdev_mtx);
		return (EINVAL);
	}

	/*
	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
	 * is scheduled for destruction.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	sx_xunlock(&vmmdev_mtx);

	vm_suspend(sc->vm, VM_SUSPEND_DESTROY);
	destroy_dev(cdev);
	vmmdev_destroy(sc);

	return (0);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error == 0 && req->newptr != NULL)
		error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred);
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_destroy, "A",
    "Destroy a vmm(4) instance (legacy interface)");

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_open		= vmmdev_open,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

static struct vmmdev_softc *
vmmdev_alloc(struct vm *vm, struct ucred *cred)
{
	struct vmmdev_softc *sc;

	sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO);
	SLIST_INIT(&sc->devmem);
	sc->vm = vm;
	sc->ucred = crhold(cred);
	return (sc);
}

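/*
 * Create a new virtual machine named 'name' along with its /dev/vmm/<name>
 * device node.  Returns EEXIST if a VM of the same name is already visible
 * to the caller.
 */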
static int
vmmdev_create(const char *name, struct ucred *cred)
{
	struct make_dev_args mda;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	struct vm *vm;
	int error;

	if (name == NULL || strlen(name) > VM_MAX_NAMELEN)
		return (EINVAL);

	sx_xlock(&vmmdev_mtx);
	sc = vmmdev_lookup(name, cred);
	if (sc != NULL) {
		sx_xunlock(&vmmdev_mtx);
		return (EEXIST);
	}

	error = vm_create(name, &vm);
	if (error != 0) {
		sx_xunlock(&vmmdev_mtx);
		return (error);
	}
	sc = vmmdev_alloc(vm, cred);
	SLIST_INSERT_HEAD(&head, sc, link);

	make_dev_args_init(&mda);
	mda.mda_devsw = &vmmdevsw;
	mda.mda_cr = sc->ucred;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
	error = make_dev_s(&mda, &cdev, "vmm/%s", name);
	if (error != 0) {
		sx_xunlock(&vmmdev_mtx);
		vmmdev_destroy(sc);
		return (error);
	}
	sc->cdev = cdev;
	sx_xunlock(&vmmdev_mtx);
	return (0);
}

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	char *buf;
	int error, buflen;

	if (!vmm_initialized)
		return (ENXIO);

	error = vmm_priv_check(req->td->td_ucred);
	if (error != 0)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error == 0 && req->newptr != NULL)
		error = vmmdev_create(buf, req->td->td_ucred);
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_create, "A",
    "Create a vmm(4) instance (legacy interface)");

static int
vmmctl_open(struct cdev *cdev, int flags, int fmt, struct thread *td)
{
	int error;

	error = vmm_priv_check(td->td_ucred);
	if (error != 0)
		return (error);

	if ((flags & FWRITE) == 0)
		return (EPERM);

	return (0);
}

static int
vmmctl_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{
	int error;

	switch (cmd) {
	case VMMCTL_VM_CREATE: {
		struct vmmctl_vm_create *vmc;

		vmc = (struct vmmctl_vm_create *)data;
		vmc->name[VM_MAX_NAMELEN] = '\0';
		for (size_t i = 0; i < nitems(vmc->reserved); i++) {
			if (vmc->reserved[i] != 0) {
				error = EINVAL;
				return (error);
			}
		}

		error = vmmdev_create(vmc->name, td->td_ucred);
		break;
	}
	case VMMCTL_VM_DESTROY: {
		struct vmmctl_vm_destroy *vmd;

		vmd = (struct vmmctl_vm_destroy *)data;
		vmd->name[VM_MAX_NAMELEN] = '\0';
		for (size_t i = 0; i < nitems(vmd->reserved); i++) {
			if (vmd->reserved[i] != 0) {
				error = EINVAL;
				return (error);
			}
		}

		error = vmmdev_lookup_and_destroy(vmd->name, td->td_ucred);
		break;
	}
	default:
		error = ENOTTY;
		break;
	}

	return (error);
}

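/*
 * /dev/vmmctl is the control device used to create and destroy VMs by name.
 * A rough userspace sketch (illustrative only; the VM name is hypothetical
 * and error handling is omitted):
 *
 *	struct vmmctl_vm_create create;
 *	int fd;
 *
 *	memset(&create, 0, sizeof(create));
 *	strlcpy(create.name, "guest0", sizeof(create.name));
 *	fd = open("/dev/vmmctl", O_RDWR);
 *	ioctl(fd, VMMCTL_VM_CREATE, &create);
 */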
static struct cdev *vmmctl_cdev;
static struct cdevsw vmmctlsw = {
	.d_name		= "vmmctl",
	.d_version	= D_VERSION,
	.d_open		= vmmctl_open,
	.d_ioctl	= vmmctl_ioctl,
};

static int
vmmdev_init(void)
{
	int error;

	sx_xlock(&vmmdev_mtx);
	error = make_dev_p(MAKEDEV_CHECKNAME, &vmmctl_cdev, &vmmctlsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmmctl");
	if (error == 0)
		pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
		    "Allow use of vmm in a jail.");
	sx_xunlock(&vmmdev_mtx);

	return (error);
}

static int
vmmdev_cleanup(void)
{
	sx_xlock(&vmmdev_mtx);
	if (!SLIST_EMPTY(&head)) {
		sx_xunlock(&vmmdev_mtx);
		return (EBUSY);
	}
	if (vmmctl_cdev != NULL) {
		destroy_dev(vmmctl_cdev);
		vmmctl_cdev = NULL;
	}
	sx_xunlock(&vmmdev_mtx);

	return (0);
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		error = vmmdev_init();
		if (error != 0)
			break;

		vm_maxcpu = mp_ncpus;
		TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
		if (vm_maxcpu > VM_MAXCPU) {
			printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
			vm_maxcpu = VM_MAXCPU;
		}
		if (vm_maxcpu == 0)
			vm_maxcpu = 1;

		error = vmm_modinit();
		if (error == 0)
			vmm_initialized = true;
		else {
			error = vmmdev_cleanup();
			KASSERT(error == 0,
			    ("%s: vmmdev_cleanup failed: %d", __func__,
			    error));
		}
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0 && vmm_initialized) {
			error = vmm_modcleanup();
			if (error) {
				/*
				 * Something bad happened - prevent new
				 * VMs from being created
				 */
				vmm_initialized = false;
			}
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - Initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 * - vmm device initialization requires an initialized devfs.
 */
DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

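/*
 * d_mmap_single handler for a devmem cdev (/dev/vmm.io/<vm>.<segname>):
 * offsets map directly onto the corresponding device memory segment's
 * VM object.
 */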
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	vm_slock_memsegs(dsc->sc->vm);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	if (seglen >= last)
		vm_object_reference(*objp);
	else
		error = EINVAL;

	vm_unlock_memsegs(dsc->sc->vm);
	return (error);
}

static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};

static int
devmem_create_cdev(struct vmmdev_softc *sc, int segid, char *devname)
{
	struct make_dev_args mda;
	struct devmem_softc *dsc;
	int error;

	sx_xlock(&vmmdev_mtx);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	dsc->segid = segid;
	dsc->name = devname;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);

	make_dev_args_init(&mda);
	mda.mda_devsw = &devmemsw;
	mda.mda_cr = sc->ucred;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = dsc;
	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
	error = make_dev_s(&mda, &dsc->cdev, "vmm.io/%s.%s", vm_name(sc->vm),
	    devname);
	if (error != 0) {
		SLIST_REMOVE(&sc->devmem, dsc, devmem_softc, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	sx_xunlock(&vmmdev_mtx);

	return (error);
}

static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	destroy_dev(dsc->cdev);
	dsc->cdev = NULL;
	dsc->sc = NULL;
}