/* * Copyright (c) 2014 Roger Pau Monné * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_PRIVCMD, "privcmd_dev", "Xen privcmd user-space device"); #define MAX_DMOP_BUFFERS 16 struct privcmd_map { vm_object_t mem; vm_size_t size; struct resource *pseudo_phys_res; int pseudo_phys_res_id; vm_paddr_t phys_base_addr; boolean_t mapped; BITSET_DEFINE_VAR() *err; }; static d_ioctl_t privcmd_ioctl; static d_open_t privcmd_open; static d_mmap_single_t privcmd_mmap_single; static struct cdevsw privcmd_devsw = { .d_version = D_VERSION, .d_ioctl = privcmd_ioctl, .d_mmap_single = privcmd_mmap_single, .d_open = privcmd_open, .d_name = "privcmd", }; static int privcmd_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color); static void privcmd_pg_dtor(void *handle); static int privcmd_pg_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres); static struct cdev_pager_ops privcmd_pg_ops = { .cdev_pg_fault = privcmd_pg_fault, .cdev_pg_ctor = privcmd_pg_ctor, .cdev_pg_dtor = privcmd_pg_dtor, }; struct per_user_data { domid_t dom; }; static device_t privcmd_dev = NULL; /*------------------------- Privcmd Pager functions --------------------------*/ static int privcmd_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { return (0); } static void privcmd_pg_dtor(void *handle) { struct xen_remove_from_physmap rm = { .domid = DOMID_SELF }; struct privcmd_map *map = handle; int error __diagused; vm_size_t i; vm_page_t m; /* * Remove the mappings from the used pages. This will remove the * underlying p2m bindings in Xen second stage translation. */ if (map->mapped == true) { VM_OBJECT_WLOCK(map->mem); retry: for (i = 0; i < map->size; i++) { m = vm_page_lookup(map->mem, i); if (m == NULL) continue; if (vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL) == 0) goto retry; cdev_pager_free_page(map->mem, m); } VM_OBJECT_WUNLOCK(map->mem); for (i = 0; i < map->size; i++) { rm.gpfn = atop(map->phys_base_addr) + i; HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &rm); } free(map->err, M_PRIVCMD); } error = xenmem_free(privcmd_dev, map->pseudo_phys_res_id, map->pseudo_phys_res); KASSERT(error == 0, ("Unable to release memory resource: %d", error)); free(map, M_PRIVCMD); } static int privcmd_pg_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres) { struct privcmd_map *map = object->handle; vm_pindex_t pidx; vm_page_t page; if (map->mapped != true) return (VM_PAGER_FAIL); pidx = OFF_TO_IDX(offset); if (pidx >= map->size || BIT_ISSET(map->size, pidx, map->err)) return (VM_PAGER_FAIL); page = PHYS_TO_VM_PAGE(map->phys_base_addr + offset); if (page == NULL) return (VM_PAGER_FAIL); KASSERT((page->flags & PG_FICTITIOUS) != 0, ("not fictitious %p", page)); KASSERT(vm_page_wired(page), ("page %p not wired", page)); KASSERT(!vm_page_busied(page), ("page %p is busy", page)); vm_page_busy_acquire(page, 0); vm_page_valid(page); if (*mres != NULL) vm_page_replace(page, object, pidx, *mres); else vm_page_insert(page, object, pidx); *mres = page; return (VM_PAGER_OK); } /*----------------------- Privcmd char device methods ------------------------*/ static int privcmd_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size, vm_object_t *object, int nprot) { struct privcmd_map *map; map = malloc(sizeof(*map), M_PRIVCMD, M_WAITOK | M_ZERO); map->size = OFF_TO_IDX(size); map->pseudo_phys_res_id = 0; map->pseudo_phys_res = xenmem_alloc(privcmd_dev, &map->pseudo_phys_res_id, size); if (map->pseudo_phys_res == NULL) { free(map, M_PRIVCMD); return (ENOMEM); } map->phys_base_addr = rman_get_start(map->pseudo_phys_res); map->mem = cdev_pager_allocate(map, OBJT_MGTDEVICE, &privcmd_pg_ops, size, nprot, *offset, NULL); if (map->mem == NULL) { xenmem_free(privcmd_dev, map->pseudo_phys_res_id, map->pseudo_phys_res); free(map, M_PRIVCMD); return (ENOMEM); } *object = map->mem; return (0); } static struct privcmd_map * setup_virtual_area(struct thread *td, unsigned long addr, unsigned long num) { vm_map_t map; vm_map_entry_t entry; vm_object_t mem; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; struct privcmd_map *umap; int error; if ((num == 0) || ((addr & PAGE_MASK) != 0)) return NULL; map = &td->td_proc->p_vmspace->vm_map; error = vm_map_lookup(&map, addr, VM_PROT_NONE, &entry, &mem, &pindex, &prot, &wired); if (error != KERN_SUCCESS || (entry->start != addr) || (entry->end != addr + (num * PAGE_SIZE))) return NULL; vm_map_lookup_done(map, entry); if ((mem->type != OBJT_MGTDEVICE) || (mem->un_pager.devp.ops != &privcmd_pg_ops)) return NULL; umap = mem->handle; /* Allocate a bitset to store broken page mappings. */ umap->err = BITSET_ALLOC(num, M_PRIVCMD, M_WAITOK | M_ZERO); return umap; } static int privcmd_ioctl(struct cdev *dev, unsigned long cmd, caddr_t arg, int mode, struct thread *td) { int error; unsigned int i; void *data; const struct per_user_data *u; error = devfs_get_cdevpriv(&data); if (error != 0) return (EINVAL); /* * Constify user-data to prevent unintended changes to the restriction * limits. */ u = data; switch (cmd) { case IOCTL_PRIVCMD_HYPERCALL: { struct ioctl_privcmd_hypercall *hcall; hcall = (struct ioctl_privcmd_hypercall *)arg; /* Forbid hypercalls if restricted. */ if (u->dom != DOMID_INVALID) { error = EPERM; break; } #ifdef __amd64__ /* * The hypervisor page table walker will refuse to access * user-space pages if SMAP is enabled, so temporary disable it * while performing the hypercall. */ if (cpu_stdext_feature & CPUID_STDEXT_SMAP) stac(); #endif error = privcmd_hypercall(hcall->op, hcall->arg[0], hcall->arg[1], hcall->arg[2], hcall->arg[3], hcall->arg[4]); #ifdef __amd64__ if (cpu_stdext_feature & CPUID_STDEXT_SMAP) clac(); #endif if (error >= 0) { hcall->retval = error; error = 0; } else { error = xen_translate_error(error); hcall->retval = 0; } break; } case IOCTL_PRIVCMD_MMAPBATCH: { struct ioctl_privcmd_mmapbatch *mmap; struct xen_add_to_physmap_batch add; xen_ulong_t *idxs; xen_pfn_t *gpfns; int *errs; unsigned int index; struct privcmd_map *umap; uint16_t num; mmap = (struct ioctl_privcmd_mmapbatch *)arg; if (u->dom != DOMID_INVALID && u->dom != mmap->dom) { error = EPERM; break; } umap = setup_virtual_area(td, mmap->addr, mmap->num); if (umap == NULL) { error = EINVAL; break; } add.domid = DOMID_SELF; add.space = XENMAPSPACE_gmfn_foreign; add.u.foreign_domid = mmap->dom; /* * The 'size' field in the xen_add_to_physmap_range only * allows for UINT16_MAX mappings in a single hypercall. */ num = MIN(mmap->num, UINT16_MAX); idxs = malloc(sizeof(*idxs) * num, M_PRIVCMD, M_WAITOK); gpfns = malloc(sizeof(*gpfns) * num, M_PRIVCMD, M_WAITOK); errs = malloc(sizeof(*errs) * num, M_PRIVCMD, M_WAITOK); set_xen_guest_handle(add.idxs, idxs); set_xen_guest_handle(add.gpfns, gpfns); set_xen_guest_handle(add.errs, errs); for (index = 0; index < mmap->num; index += num) { num = MIN(mmap->num - index, UINT16_MAX); add.size = num; error = copyin(&mmap->arr[index], idxs, sizeof(idxs[0]) * num); if (error != 0) goto mmap_out; for (i = 0; i < num; i++) gpfns[i] = atop(umap->phys_base_addr + (i + index) * PAGE_SIZE); bzero(errs, sizeof(*errs) * num); error = HYPERVISOR_memory_op( XENMEM_add_to_physmap_batch, &add); if (error != 0) { error = xen_translate_error(error); goto mmap_out; } for (i = 0; i < num; i++) { if (errs[i] != 0) { errs[i] = xen_translate_error(errs[i]); /* Mark the page as invalid. */ BIT_SET(mmap->num, index + i, umap->err); } } error = copyout(errs, &mmap->err[index], sizeof(errs[0]) * num); if (error != 0) goto mmap_out; } umap->mapped = true; mmap_out: free(idxs, M_PRIVCMD); free(gpfns, M_PRIVCMD); free(errs, M_PRIVCMD); if (!umap->mapped) free(umap->err, M_PRIVCMD); break; } case IOCTL_PRIVCMD_MMAP_RESOURCE: { struct ioctl_privcmd_mmapresource *mmap; struct xen_mem_acquire_resource adq; xen_pfn_t *gpfns; struct privcmd_map *umap; mmap = (struct ioctl_privcmd_mmapresource *)arg; if (u->dom != DOMID_INVALID && u->dom != mmap->dom) { error = EPERM; break; } bzero(&adq, sizeof(adq)); adq.domid = mmap->dom; adq.type = mmap->type; adq.id = mmap->id; /* Shortcut for getting the resource size. */ if (mmap->addr == 0 && mmap->num == 0) { error = HYPERVISOR_memory_op(XENMEM_acquire_resource, &adq); if (error != 0) error = xen_translate_error(error); else mmap->num = adq.nr_frames; break; } umap = setup_virtual_area(td, mmap->addr, mmap->num); if (umap == NULL) { error = EINVAL; break; } adq.nr_frames = mmap->num; adq.frame = mmap->idx; gpfns = malloc(sizeof(*gpfns) * mmap->num, M_PRIVCMD, M_WAITOK); for (i = 0; i < mmap->num; i++) gpfns[i] = atop(umap->phys_base_addr) + i; set_xen_guest_handle(adq.frame_list, gpfns); error = HYPERVISOR_memory_op(XENMEM_acquire_resource, &adq); if (error != 0) error = xen_translate_error(error); else umap->mapped = true; free(gpfns, M_PRIVCMD); if (!umap->mapped) free(umap->err, M_PRIVCMD); break; } case IOCTL_PRIVCMD_DM_OP: { const struct ioctl_privcmd_dmop *dmop; struct privcmd_dmop_buf *bufs; struct xen_dm_op_buf *hbufs; dmop = (struct ioctl_privcmd_dmop *)arg; if (u->dom != DOMID_INVALID && u->dom != dmop->dom) { error = EPERM; break; } if (dmop->num == 0) break; if (dmop->num > MAX_DMOP_BUFFERS) { error = E2BIG; break; } bufs = malloc(sizeof(*bufs) * dmop->num, M_PRIVCMD, M_WAITOK); error = copyin(dmop->ubufs, bufs, sizeof(*bufs) * dmop->num); if (error != 0) { free(bufs, M_PRIVCMD); break; } hbufs = malloc(sizeof(*hbufs) * dmop->num, M_PRIVCMD, M_WAITOK); for (i = 0; i < dmop->num; i++) { set_xen_guest_handle(hbufs[i].h, bufs[i].uptr); hbufs[i].size = bufs[i].size; } #ifdef __amd64__ if (cpu_stdext_feature & CPUID_STDEXT_SMAP) stac(); #endif error = HYPERVISOR_dm_op(dmop->dom, dmop->num, hbufs); #ifdef __amd64__ if (cpu_stdext_feature & CPUID_STDEXT_SMAP) clac(); #endif if (error != 0) error = xen_translate_error(error); free(bufs, M_PRIVCMD); free(hbufs, M_PRIVCMD); break; } case IOCTL_PRIVCMD_RESTRICT: { struct per_user_data *u; domid_t dom; dom = *(domid_t *)arg; error = devfs_get_cdevpriv((void **)&u); if (error != 0) break; if (u->dom != DOMID_INVALID && u->dom != dom) { error = -EINVAL; break; } u->dom = dom; break; } default: error = ENOSYS; break; } return (error); } static void user_release(void *arg) { free(arg, M_PRIVCMD); } static int privcmd_open(struct cdev *dev, int flag, int otyp, struct thread *td) { struct per_user_data *u; int error; u = malloc(sizeof(*u), M_PRIVCMD, M_WAITOK); u->dom = DOMID_INVALID; /* Assign the allocated per_user_data to this open instance. */ error = devfs_set_cdevpriv(u, user_release); if (error != 0) { free(u, M_PRIVCMD); } return (error); } /*------------------ Private Device Attachment Functions --------------------*/ static void privcmd_identify(driver_t *driver, device_t parent) { KASSERT(xen_domain(), ("Trying to attach privcmd device on non Xen domain")); if (BUS_ADD_CHILD(parent, 0, "privcmd", 0) == NULL) panic("unable to attach privcmd user-space device"); } static int privcmd_probe(device_t dev) { privcmd_dev = dev; device_set_desc(dev, "Xen privileged interface user-space device"); return (BUS_PROBE_NOWILDCARD); } static int privcmd_attach(device_t dev) { make_dev_credf(MAKEDEV_ETERNAL, &privcmd_devsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "xen/privcmd"); return (0); } /*-------------------- Private Device Attachment Data -----------------------*/ static device_method_t privcmd_methods[] = { DEVMETHOD(device_identify, privcmd_identify), DEVMETHOD(device_probe, privcmd_probe), DEVMETHOD(device_attach, privcmd_attach), DEVMETHOD_END }; static driver_t privcmd_driver = { "privcmd", privcmd_methods, 0, }; DRIVER_MODULE(privcmd, xenpv, privcmd_driver, 0, 0); MODULE_DEPEND(privcmd, xenpv, 1, 1, 1);