1 /*
2 * Copyright (c) 2014 Roger Pau Monné <roger.pau@citrix.com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/uio.h>
30 #include <sys/bus.h>
31 #include <sys/malloc.h>
32 #include <sys/kernel.h>
33 #include <sys/lock.h>
34 #include <sys/mutex.h>
35 #include <sys/rwlock.h>
36 #include <sys/selinfo.h>
37 #include <sys/poll.h>
38 #include <sys/conf.h>
39 #include <sys/fcntl.h>
40 #include <sys/ioccom.h>
41 #include <sys/rman.h>
42 #include <sys/tree.h>
43 #include <sys/module.h>
44 #include <sys/proc.h>
45 #include <sys/bitset.h>
46
47 #include <vm/vm.h>
48 #include <vm/vm_param.h>
49 #include <vm/vm_extern.h>
50 #include <vm/vm_kern.h>
51 #include <vm/vm_page.h>
52 #include <vm/vm_map.h>
53 #include <vm/vm_object.h>
54 #include <vm/vm_pager.h>
55
56 #include <machine/md_var.h>
57
58 #include <xen/xen-os.h>
59 #include <xen/hypervisor.h>
60 #include <xen/privcmd.h>
61 #include <xen/error.h>
62
63 MALLOC_DEFINE(M_PRIVCMD, "privcmd_dev", "Xen privcmd user-space device");
64
65 #define MAX_DMOP_BUFFERS 16
66
67 struct privcmd_map {
68 vm_object_t mem;
69 vm_size_t size;
70 struct resource *pseudo_phys_res;
71 int pseudo_phys_res_id;
72 vm_paddr_t phys_base_addr;
73 boolean_t mapped;
74 BITSET_DEFINE_VAR() *err;
75 };
76
77 static d_ioctl_t privcmd_ioctl;
78 static d_open_t privcmd_open;
79 static d_mmap_single_t privcmd_mmap_single;
80
81 static struct cdevsw privcmd_devsw = {
82 .d_version = D_VERSION,
83 .d_ioctl = privcmd_ioctl,
84 .d_mmap_single = privcmd_mmap_single,
85 .d_open = privcmd_open,
86 .d_name = "privcmd",
87 };
88
89 static int privcmd_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
90 vm_ooffset_t foff, struct ucred *cred, u_short *color);
91 static void privcmd_pg_dtor(void *handle);
92 static int privcmd_pg_fault(vm_object_t object, vm_ooffset_t offset,
93 int prot, vm_page_t *mres);
94
95 static struct cdev_pager_ops privcmd_pg_ops = {
96 .cdev_pg_fault = privcmd_pg_fault,
97 .cdev_pg_ctor = privcmd_pg_ctor,
98 .cdev_pg_dtor = privcmd_pg_dtor,
99 };
100
101 struct per_user_data {
102 domid_t dom;
103 };
104
105 static device_t privcmd_dev = NULL;
106
107 /*------------------------- Privcmd Pager functions --------------------------*/
108 static int
privcmd_pg_ctor(void * handle,vm_ooffset_t size,vm_prot_t prot,vm_ooffset_t foff,struct ucred * cred,u_short * color)109 privcmd_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
110 vm_ooffset_t foff, struct ucred *cred, u_short *color)
111 {
112
113 return (0);
114 }
115
116 static void
privcmd_pg_dtor(void * handle)117 privcmd_pg_dtor(void *handle)
118 {
119 struct xen_remove_from_physmap rm = { .domid = DOMID_SELF };
120 struct privcmd_map *map = handle;
121 int error __diagused;
122 vm_size_t i;
123 vm_page_t m;
124
125 /*
126 * Remove the mappings from the used pages. This will remove the
127 * underlying p2m bindings in Xen second stage translation.
128 */
129 if (map->mapped == true) {
130 VM_OBJECT_WLOCK(map->mem);
131 retry:
132 for (i = 0; i < map->size; i++) {
133 m = vm_page_lookup(map->mem, i);
134 if (m == NULL)
135 continue;
136 if (vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL) == 0)
137 goto retry;
138 cdev_mgtdev_pager_free_page(map->mem, m);
139 }
140 VM_OBJECT_WUNLOCK(map->mem);
141
142 for (i = 0; i < map->size; i++) {
143 rm.gpfn = atop(map->phys_base_addr) + i;
144 HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &rm);
145 }
146 free(map->err, M_PRIVCMD);
147 }
148
149 error = xenmem_free(privcmd_dev, map->pseudo_phys_res_id,
150 map->pseudo_phys_res);
151 KASSERT(error == 0, ("Unable to release memory resource: %d", error));
152
153 free(map, M_PRIVCMD);
154 }
155
156 static int
privcmd_pg_fault(vm_object_t object,vm_ooffset_t offset,int prot,vm_page_t * mres)157 privcmd_pg_fault(vm_object_t object, vm_ooffset_t offset,
158 int prot, vm_page_t *mres)
159 {
160 struct privcmd_map *map = object->handle;
161 vm_pindex_t pidx;
162 vm_page_t page;
163
164 if (map->mapped != true)
165 return (VM_PAGER_FAIL);
166
167 pidx = OFF_TO_IDX(offset);
168 if (pidx >= map->size || BIT_ISSET(map->size, pidx, map->err))
169 return (VM_PAGER_FAIL);
170
171 page = PHYS_TO_VM_PAGE(map->phys_base_addr + offset);
172 if (page == NULL)
173 return (VM_PAGER_FAIL);
174
175 KASSERT((page->flags & PG_FICTITIOUS) != 0,
176 ("not fictitious %p", page));
177 KASSERT(vm_page_wired(page), ("page %p not wired", page));
178 KASSERT(!vm_page_busied(page), ("page %p is busy", page));
179
180 vm_page_busy_acquire(page, 0);
181 vm_page_valid(page);
182
183 if (*mres != NULL)
184 vm_page_replace(page, object, pidx, *mres);
185 else
186 vm_page_insert(page, object, pidx);
187 *mres = page;
188 return (VM_PAGER_OK);
189 }
190
191 /*----------------------- Privcmd char device methods ------------------------*/
192 static int
privcmd_mmap_single(struct cdev * cdev,vm_ooffset_t * offset,vm_size_t size,vm_object_t * object,int nprot)193 privcmd_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size,
194 vm_object_t *object, int nprot)
195 {
196 struct privcmd_map *map;
197
198 map = malloc(sizeof(*map), M_PRIVCMD, M_WAITOK | M_ZERO);
199
200 map->size = OFF_TO_IDX(size);
201 map->pseudo_phys_res_id = 0;
202
203 map->pseudo_phys_res = xenmem_alloc(privcmd_dev,
204 &map->pseudo_phys_res_id, size);
205 if (map->pseudo_phys_res == NULL) {
206 free(map, M_PRIVCMD);
207 return (ENOMEM);
208 }
209
210 map->phys_base_addr = rman_get_start(map->pseudo_phys_res);
211 map->mem = cdev_pager_allocate(map, OBJT_MGTDEVICE, &privcmd_pg_ops,
212 size, nprot, *offset, NULL);
213 if (map->mem == NULL) {
214 xenmem_free(privcmd_dev, map->pseudo_phys_res_id,
215 map->pseudo_phys_res);
216 free(map, M_PRIVCMD);
217 return (ENOMEM);
218 }
219
220 *object = map->mem;
221
222 return (0);
223 }
224
225 static struct privcmd_map *
setup_virtual_area(struct thread * td,unsigned long addr,unsigned long num)226 setup_virtual_area(struct thread *td, unsigned long addr, unsigned long num)
227 {
228 vm_map_t map;
229 vm_map_entry_t entry;
230 vm_object_t mem;
231 vm_pindex_t pindex;
232 vm_prot_t prot;
233 boolean_t wired;
234 struct privcmd_map *umap;
235 int error;
236
237 if ((num == 0) || ((addr & PAGE_MASK) != 0))
238 return NULL;
239
240 map = &td->td_proc->p_vmspace->vm_map;
241 error = vm_map_lookup(&map, addr, VM_PROT_NONE, &entry, &mem, &pindex,
242 &prot, &wired);
243 if (error != KERN_SUCCESS || (entry->start != addr) ||
244 (entry->end != addr + (num * PAGE_SIZE)))
245 return NULL;
246
247 vm_map_lookup_done(map, entry);
248 if ((mem->type != OBJT_MGTDEVICE) ||
249 (mem->un_pager.devp.ops != &privcmd_pg_ops))
250 return NULL;
251
252 umap = mem->handle;
253 /* Allocate a bitset to store broken page mappings. */
254 umap->err = BITSET_ALLOC(num, M_PRIVCMD, M_WAITOK | M_ZERO);
255
256 return umap;
257 }
258
259 static int
privcmd_ioctl(struct cdev * dev,unsigned long cmd,caddr_t arg,int mode,struct thread * td)260 privcmd_ioctl(struct cdev *dev, unsigned long cmd, caddr_t arg,
261 int mode, struct thread *td)
262 {
263 int error;
264 unsigned int i;
265 void *data;
266 const struct per_user_data *u;
267
268 error = devfs_get_cdevpriv(&data);
269 if (error != 0)
270 return (EINVAL);
271 /*
272 * Constify user-data to prevent unintended changes to the restriction
273 * limits.
274 */
275 u = data;
276
277 switch (cmd) {
278 case IOCTL_PRIVCMD_HYPERCALL: {
279 struct ioctl_privcmd_hypercall *hcall;
280
281 hcall = (struct ioctl_privcmd_hypercall *)arg;
282
283 /* Forbid hypercalls if restricted. */
284 if (u->dom != DOMID_INVALID) {
285 error = EPERM;
286 break;
287 }
288
289 #ifdef __amd64__
290 /*
291 * The hypervisor page table walker will refuse to access
292 * user-space pages if SMAP is enabled, so temporary disable it
293 * while performing the hypercall.
294 */
295 if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
296 stac();
297 #endif
298 error = privcmd_hypercall(hcall->op, hcall->arg[0],
299 hcall->arg[1], hcall->arg[2], hcall->arg[3], hcall->arg[4]);
300 #ifdef __amd64__
301 if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
302 clac();
303 #endif
304 if (error >= 0) {
305 hcall->retval = error;
306 error = 0;
307 } else {
308 error = xen_translate_error(error);
309 hcall->retval = 0;
310 }
311 break;
312 }
313 case IOCTL_PRIVCMD_MMAPBATCH: {
314 struct ioctl_privcmd_mmapbatch *mmap;
315 struct xen_add_to_physmap_batch add;
316 xen_ulong_t *idxs;
317 xen_pfn_t *gpfns;
318 int *errs;
319 unsigned int index;
320 struct privcmd_map *umap;
321 uint16_t num;
322
323 mmap = (struct ioctl_privcmd_mmapbatch *)arg;
324
325 if (u->dom != DOMID_INVALID && u->dom != mmap->dom) {
326 error = EPERM;
327 break;
328 }
329
330 umap = setup_virtual_area(td, mmap->addr, mmap->num);
331 if (umap == NULL) {
332 error = EINVAL;
333 break;
334 }
335
336 add.domid = DOMID_SELF;
337 add.space = XENMAPSPACE_gmfn_foreign;
338 add.u.foreign_domid = mmap->dom;
339
340 /*
341 * The 'size' field in the xen_add_to_physmap_range only
342 * allows for UINT16_MAX mappings in a single hypercall.
343 */
344 num = MIN(mmap->num, UINT16_MAX);
345
346 idxs = malloc(sizeof(*idxs) * num, M_PRIVCMD, M_WAITOK);
347 gpfns = malloc(sizeof(*gpfns) * num, M_PRIVCMD, M_WAITOK);
348 errs = malloc(sizeof(*errs) * num, M_PRIVCMD, M_WAITOK);
349
350 set_xen_guest_handle(add.idxs, idxs);
351 set_xen_guest_handle(add.gpfns, gpfns);
352 set_xen_guest_handle(add.errs, errs);
353
354 for (index = 0; index < mmap->num; index += num) {
355 num = MIN(mmap->num - index, UINT16_MAX);
356 add.size = num;
357
358 error = copyin(&mmap->arr[index], idxs,
359 sizeof(idxs[0]) * num);
360 if (error != 0)
361 goto mmap_out;
362
363 for (i = 0; i < num; i++)
364 gpfns[i] = atop(umap->phys_base_addr +
365 (i + index) * PAGE_SIZE);
366
367 bzero(errs, sizeof(*errs) * num);
368
369 error = HYPERVISOR_memory_op(
370 XENMEM_add_to_physmap_batch, &add);
371 if (error != 0) {
372 error = xen_translate_error(error);
373 goto mmap_out;
374 }
375
376 for (i = 0; i < num; i++) {
377 if (errs[i] != 0) {
378 errs[i] = xen_translate_error(errs[i]);
379
380 /* Mark the page as invalid. */
381 BIT_SET(mmap->num, index + i,
382 umap->err);
383 }
384 }
385
386 error = copyout(errs, &mmap->err[index],
387 sizeof(errs[0]) * num);
388 if (error != 0)
389 goto mmap_out;
390 }
391
392 umap->mapped = true;
393
394 mmap_out:
395 free(idxs, M_PRIVCMD);
396 free(gpfns, M_PRIVCMD);
397 free(errs, M_PRIVCMD);
398 if (!umap->mapped)
399 free(umap->err, M_PRIVCMD);
400
401 break;
402 }
403 case IOCTL_PRIVCMD_MMAP_RESOURCE: {
404 struct ioctl_privcmd_mmapresource *mmap;
405 struct xen_mem_acquire_resource adq;
406 xen_pfn_t *gpfns;
407 struct privcmd_map *umap;
408
409 mmap = (struct ioctl_privcmd_mmapresource *)arg;
410
411 if (u->dom != DOMID_INVALID && u->dom != mmap->dom) {
412 error = EPERM;
413 break;
414 }
415
416 bzero(&adq, sizeof(adq));
417
418 adq.domid = mmap->dom;
419 adq.type = mmap->type;
420 adq.id = mmap->id;
421
422 /* Shortcut for getting the resource size. */
423 if (mmap->addr == 0 && mmap->num == 0) {
424 error = HYPERVISOR_memory_op(XENMEM_acquire_resource,
425 &adq);
426 if (error != 0)
427 error = xen_translate_error(error);
428 else
429 mmap->num = adq.nr_frames;
430 break;
431 }
432
433 umap = setup_virtual_area(td, mmap->addr, mmap->num);
434 if (umap == NULL) {
435 error = EINVAL;
436 break;
437 }
438
439 adq.nr_frames = mmap->num;
440 adq.frame = mmap->idx;
441
442 gpfns = malloc(sizeof(*gpfns) * mmap->num, M_PRIVCMD, M_WAITOK);
443 for (i = 0; i < mmap->num; i++)
444 gpfns[i] = atop(umap->phys_base_addr) + i;
445 set_xen_guest_handle(adq.frame_list, gpfns);
446
447 error = HYPERVISOR_memory_op(XENMEM_acquire_resource, &adq);
448 if (error != 0)
449 error = xen_translate_error(error);
450 else
451 umap->mapped = true;
452
453 free(gpfns, M_PRIVCMD);
454 if (!umap->mapped)
455 free(umap->err, M_PRIVCMD);
456
457 break;
458 }
459 case IOCTL_PRIVCMD_DM_OP: {
460 const struct ioctl_privcmd_dmop *dmop;
461 struct privcmd_dmop_buf *bufs;
462 struct xen_dm_op_buf *hbufs;
463
464 dmop = (struct ioctl_privcmd_dmop *)arg;
465
466 if (u->dom != DOMID_INVALID && u->dom != dmop->dom) {
467 error = EPERM;
468 break;
469 }
470
471 if (dmop->num == 0)
472 break;
473
474 if (dmop->num > MAX_DMOP_BUFFERS) {
475 error = E2BIG;
476 break;
477 }
478
479 bufs = malloc(sizeof(*bufs) * dmop->num, M_PRIVCMD, M_WAITOK);
480
481 error = copyin(dmop->ubufs, bufs, sizeof(*bufs) * dmop->num);
482 if (error != 0) {
483 free(bufs, M_PRIVCMD);
484 break;
485 }
486
487 hbufs = malloc(sizeof(*hbufs) * dmop->num, M_PRIVCMD, M_WAITOK);
488 for (i = 0; i < dmop->num; i++) {
489 set_xen_guest_handle(hbufs[i].h, bufs[i].uptr);
490 hbufs[i].size = bufs[i].size;
491 }
492
493 #ifdef __amd64__
494 if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
495 stac();
496 #endif
497 error = HYPERVISOR_dm_op(dmop->dom, dmop->num, hbufs);
498 #ifdef __amd64__
499 if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
500 clac();
501 #endif
502 if (error != 0)
503 error = xen_translate_error(error);
504
505 free(bufs, M_PRIVCMD);
506 free(hbufs, M_PRIVCMD);
507
508
509 break;
510 }
511 case IOCTL_PRIVCMD_RESTRICT: {
512 struct per_user_data *u;
513 domid_t dom;
514
515 dom = *(domid_t *)arg;
516
517 error = devfs_get_cdevpriv((void **)&u);
518 if (error != 0)
519 break;
520
521 if (u->dom != DOMID_INVALID && u->dom != dom) {
522 error = -EINVAL;
523 break;
524 }
525 u->dom = dom;
526
527 break;
528 }
529 default:
530 error = ENOSYS;
531 break;
532 }
533
534 return (error);
535 }
536
537 static void
user_release(void * arg)538 user_release(void *arg)
539 {
540
541 free(arg, M_PRIVCMD);
542 }
543
544 static int
privcmd_open(struct cdev * dev,int flag,int otyp,struct thread * td)545 privcmd_open(struct cdev *dev, int flag, int otyp, struct thread *td)
546 {
547 struct per_user_data *u;
548 int error;
549
550 u = malloc(sizeof(*u), M_PRIVCMD, M_WAITOK);
551 u->dom = DOMID_INVALID;
552
553 /* Assign the allocated per_user_data to this open instance. */
554 error = devfs_set_cdevpriv(u, user_release);
555 if (error != 0) {
556 free(u, M_PRIVCMD);
557 }
558
559 return (error);
560 }
561
562 /*------------------ Private Device Attachment Functions --------------------*/
563 static void
privcmd_identify(driver_t * driver,device_t parent)564 privcmd_identify(driver_t *driver, device_t parent)
565 {
566
567 KASSERT(xen_domain(),
568 ("Trying to attach privcmd device on non Xen domain"));
569
570 if (BUS_ADD_CHILD(parent, 0, "privcmd", 0) == NULL)
571 panic("unable to attach privcmd user-space device");
572 }
573
574 static int
privcmd_probe(device_t dev)575 privcmd_probe(device_t dev)
576 {
577
578 privcmd_dev = dev;
579 device_set_desc(dev, "Xen privileged interface user-space device");
580 return (BUS_PROBE_NOWILDCARD);
581 }
582
583 static int
privcmd_attach(device_t dev)584 privcmd_attach(device_t dev)
585 {
586
587 make_dev_credf(MAKEDEV_ETERNAL, &privcmd_devsw, 0, NULL, UID_ROOT,
588 GID_WHEEL, 0600, "xen/privcmd");
589 return (0);
590 }
591
592 /*-------------------- Private Device Attachment Data -----------------------*/
593 static device_method_t privcmd_methods[] = {
594 DEVMETHOD(device_identify, privcmd_identify),
595 DEVMETHOD(device_probe, privcmd_probe),
596 DEVMETHOD(device_attach, privcmd_attach),
597
598 DEVMETHOD_END
599 };
600
601 static driver_t privcmd_driver = {
602 "privcmd",
603 privcmd_methods,
604 0,
605 };
606
607 DRIVER_MODULE(privcmd, xenpv, privcmd_driver, 0, 0);
608 MODULE_DEPEND(privcmd, xenpv, 1, 1, 1);
609