// SPDX-License-Identifier: GPL-2.0-only
/******************************************************************************
 * privcmd.c
 *
 * Interface to privileged domain-0 commands.
 *
 * Copyright (c) 2002-2004, K A Fraser, B Dragovic
 */

#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/string.h>
#include <linux/workqueue.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/uaccess.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/miscdevice.h>
#include <linux/moduleparam.h>
#include <linux/virtio_mmio.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/events.h>
#include <xen/privcmd.h>
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>
#include <xen/interface/hvm/dm_op.h>
#include <xen/interface/hvm/ioreq.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
#include <xen/balloon.h>

#include "privcmd.h"

MODULE_DESCRIPTION("Xen hypercall passthrough driver");
MODULE_LICENSE("GPL");

#define PRIV_VMA_LOCKED ((void *)1)

static unsigned int privcmd_dm_op_max_num = 16;
module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644);
MODULE_PARM_DESC(dm_op_max_nr_bufs,
		 "Maximum number of buffers per dm_op hypercall");

static unsigned int privcmd_dm_op_buf_max_size = 4096;
module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint,
		   0644);
MODULE_PARM_DESC(dm_op_buf_max_size,
		 "Maximum size of a dm_op hypercall buffer");

struct privcmd_data {
	domid_t domid;
};

static int privcmd_vma_range_is_mapped(
	       struct vm_area_struct *vma,
	       unsigned long addr,
	       unsigned long nr_pages);

static long privcmd_ioctl_hypercall(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_hypercall hypercall;
	long ret;

	/* Disallow arbitrary hypercalls if restricted */
	if (data->domid != DOMID_INVALID)
		return -EPERM;

	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
		return -EFAULT;

	xen_preemptible_hcall_begin();
	ret = privcmd_call(hypercall.op,
			   hypercall.arg[0], hypercall.arg[1],
			   hypercall.arg[2], hypercall.arg[3],
			   hypercall.arg[4]);
	xen_preemptible_hcall_end();

	return ret;
}

static void free_page_list(struct list_head *pages)
{
	struct page *p, *n;

	list_for_each_entry_safe(p, n, pages, lru)
		__free_page(p);

	INIT_LIST_HEAD(pages);
}

/*
 * Given an array of items in userspace, return a list of pages
 * containing the data.  If copying fails, either because of memory
 * allocation failure or a problem reading user memory, return an
 * error code; it's up to the caller to dispose of any partial list.
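 *
 * Each element is copied whole into a page, so elements never straddle
 * a page boundary; any unused tail of a page is simply skipped.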
 */
static int gather_array(struct list_head *pagelist,
			unsigned nelem, size_t size,
			const void __user *data)
{
	unsigned pageidx;
	void *pagedata;
	int ret;

	if (size > PAGE_SIZE)
		return 0;

	pageidx = PAGE_SIZE;
	pagedata = NULL;	/* quiet, gcc */
	while (nelem--) {
		if (pageidx > PAGE_SIZE-size) {
			struct page *page = alloc_page(GFP_KERNEL);

			ret = -ENOMEM;
			if (page == NULL)
				goto fail;

			pagedata = page_address(page);

			list_add_tail(&page->lru, pagelist);
			pageidx = 0;
		}

		ret = -EFAULT;
		if (copy_from_user(pagedata + pageidx, data, size))
			goto fail;

		data += size;
		pageidx += size;
	}

	ret = 0;

fail:
	return ret;
}

/*
 * Call function "fn" on each element of the array fragmented
 * over a list of pages.
 */
static int traverse_pages(unsigned nelem, size_t size,
			  struct list_head *pos,
			  int (*fn)(void *data, void *state),
			  void *state)
{
	void *pagedata;
	unsigned pageidx;
	int ret = 0;

	BUG_ON(size > PAGE_SIZE);

	pageidx = PAGE_SIZE;
	pagedata = NULL;	/* hush, gcc */

	while (nelem--) {
		if (pageidx > PAGE_SIZE-size) {
			struct page *page;

			pos = pos->next;
			page = list_entry(pos, struct page, lru);
			pagedata = page_address(page);
			pageidx = 0;
		}

		ret = (*fn)(pagedata + pageidx, state);
		if (ret)
			break;
		pageidx += size;
	}

	return ret;
}

/*
 * Similar to traverse_pages, but use each page as a "block" of
 * data to be processed as one unit.
 */
static int traverse_pages_block(unsigned nelem, size_t size,
				struct list_head *pos,
				int (*fn)(void *data, int nr, void *state),
				void *state)
{
	void *pagedata;
	int ret = 0;

	BUG_ON(size > PAGE_SIZE);

	while (nelem) {
		int nr = (PAGE_SIZE/size);
		struct page *page;

		if (nr > nelem)
			nr = nelem;
		pos = pos->next;
		page = list_entry(pos, struct page, lru);
		pagedata = page_address(page);
		ret = (*fn)(pagedata, nr, state);
		if (ret)
			break;
		nelem -= nr;
	}

	return ret;
}

struct mmap_gfn_state {
	unsigned long va;
	struct vm_area_struct *vma;
	domid_t domain;
};

static int mmap_gfn_range(void *data, void *state)
{
	struct privcmd_mmap_entry *msg = data;
	struct mmap_gfn_state *st = state;
	struct vm_area_struct *vma = st->vma;
	int rc;

	/* Do not allow range to wrap the address space. */
	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
		return -EINVAL;

	/* Range chunks must be contiguous in va space. */
	if ((msg->va != st->va) ||
	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
		return -EINVAL;

	rc = xen_remap_domain_gfn_range(vma,
					msg->va & PAGE_MASK,
					msg->mfn, msg->npages,
					vma->vm_page_prot,
					st->domain, NULL);
	if (rc < 0)
		return rc;

	st->va += msg->npages << PAGE_SHIFT;

	return 0;
}

static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_mmap mmapcmd;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int rc;
	LIST_HEAD(pagelist);
	struct mmap_gfn_state state;

	/*
	 * We only support privcmd_ioctl_mmap_batch for auto-translated
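	 * callers: this legacy interface deals in raw MFNs, which an
	 * auto-translated toolstack domain cannot map directly.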
	 */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return -ENOSYS;

	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom)
		return -EPERM;

	rc = gather_array(&pagelist,
			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
			  mmapcmd.entry);

	if (rc || list_empty(&pagelist))
		goto out;

	mmap_write_lock(mm);

	{
		struct page *page = list_first_entry(&pagelist,
						     struct page, lru);
		struct privcmd_mmap_entry *msg = page_address(page);

		vma = vma_lookup(mm, msg->va);
		rc = -EINVAL;

		if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data)
			goto out_up;
		vma->vm_private_data = PRIV_VMA_LOCKED;
	}

	state.va = vma->vm_start;
	state.vma = vma;
	state.domain = mmapcmd.dom;

	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
			    &pagelist,
			    mmap_gfn_range, &state);

out_up:
	mmap_write_unlock(mm);

out:
	free_page_list(&pagelist);

	return rc;
}

struct mmap_batch_state {
	domid_t domain;
	unsigned long va;
	struct vm_area_struct *vma;
	int index;
	/* A tristate:
	 *      0 for no errors
	 *      1 if at least one error has happened (and no
	 *          -ENOENT errors have happened)
	 *      -ENOENT if at least 1 -ENOENT has happened.
	 */
	int global_error;
	int version;

	/* User-space gfn array to store errors in the second pass for V1. */
	xen_pfn_t __user *user_gfn;
	/* User-space int array to store errors in the second pass for V2. */
	int __user *user_err;
};

/* auto translated dom0 note: if domU being created is PV, then gfn is
 * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP).
 */
static int mmap_batch_fn(void *data, int nr, void *state)
{
	xen_pfn_t *gfnp = data;
	struct mmap_batch_state *st = state;
	struct vm_area_struct *vma = st->vma;
	struct page **pages = vma->vm_private_data;
	struct page **cur_pages = NULL;
	int ret;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		cur_pages = &pages[st->index];

	BUG_ON(nr < 0);
	ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr,
					 (int *)gfnp, st->vma->vm_page_prot,
					 st->domain, cur_pages);

	/* Adjust the global_error? */
	if (ret != nr) {
		if (ret == -ENOENT)
			st->global_error = -ENOENT;
		else {
			/* Record that at least one error has happened. */
			if (st->global_error == 0)
				st->global_error = 1;
		}
	}
	st->va += XEN_PAGE_SIZE * nr;
	st->index += nr / XEN_PFN_PER_PAGE;

	return 0;
}

static int mmap_return_error(int err, struct mmap_batch_state *st)
{
	int ret;

	if (st->version == 1) {
		if (err) {
			xen_pfn_t gfn;

			ret = get_user(gfn, st->user_gfn);
			if (ret < 0)
				return ret;
			/*
			 * V1 encodes the error codes in the 32bit top
			 * nibble of the gfn (with its known
			 * limitations vis-a-vis 64 bit callers).
			 */
			gfn |= (err == -ENOENT) ?
				PRIVCMD_MMAPBATCH_PAGED_ERROR :
				PRIVCMD_MMAPBATCH_MFN_ERROR;
			return __put_user(gfn, st->user_gfn++);
		} else
			st->user_gfn++;
	} else { /* st->version == 2 */
		if (err)
			return __put_user(err, st->user_err++);
		else
			st->user_err++;
	}

	return 0;
}

static int mmap_return_errors(void *data, int nr, void *state)
{
	struct mmap_batch_state *st = state;
	int *errs = data;
	int i;
	int ret;

	for (i = 0; i < nr; i++) {
		ret = mmap_return_error(errs[i], st);
		if (ret < 0)
			return ret;
	}
	return 0;
}

/* Allocate pfns that are then mapped with gfns from foreign domid. Update
 * the vma with the page info to use later.
 * Returns: 0 if success, otherwise -errno
 */
static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
{
	int rc;
	struct page **pages;

	pages = kvcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL);
	if (pages == NULL)
		return -ENOMEM;

	rc = xen_alloc_unpopulated_pages(numpgs, pages);
	if (rc != 0) {
		pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__,
			numpgs, rc);
		kvfree(pages);
		return -ENOMEM;
	}
	BUG_ON(vma->vm_private_data != NULL);
	vma->vm_private_data = pages;

	return 0;
}

static const struct vm_operations_struct privcmd_vm_ops;

static long privcmd_ioctl_mmap_batch(
	struct file *file, void __user *udata, int version)
{
	struct privcmd_data *data = file->private_data;
	int ret;
	struct privcmd_mmapbatch_v2 m;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long nr_pages;
	LIST_HEAD(pagelist);
	struct mmap_batch_state state;

	switch (version) {
	case 1:
		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
			return -EFAULT;
		/* Returns per-frame error in m.arr. */
		m.err = NULL;
		if (!access_ok(m.arr, m.num * sizeof(*m.arr)))
			return -EFAULT;
		break;
	case 2:
		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2)))
			return -EFAULT;
		/* Returns per-frame error code in m.err. */
		if (!access_ok(m.err, m.num * (sizeof(*m.err))))
			return -EFAULT;
		break;
	default:
		return -EINVAL;
	}

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != m.dom)
		return -EPERM;

	nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE);
	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
		return -EINVAL;

	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr);

	if (ret)
		goto out;
	if (list_empty(&pagelist)) {
		ret = -EINVAL;
		goto out;
	}

	if (version == 2) {
		/* Zero error array now to only copy back actual errors. */
		if (clear_user(m.err, sizeof(int) * m.num)) {
			ret = -EFAULT;
			goto out;
		}
	}

	mmap_write_lock(mm);

	vma = find_vma(mm, m.addr);
	if (!vma ||
	    vma->vm_ops != &privcmd_vm_ops) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/*
	 * Caller must either:
	 *
	 * Map the whole VMA range, which will also allocate all the
	 * pages required for the auto_translated_physmap case.
	 *
	 * Or
	 *
	 * Map unmapped holes left from a previous map attempt (e.g.,
	 * because those foreign frames were previously paged out).
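	 *
	 * The two cases are told apart below via vm_private_data, which is
	 * NULL until the first mapping attempt claims the VMA.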
	 */
	if (vma->vm_private_data == NULL) {
		if (m.addr != vma->vm_start ||
		    m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (xen_feature(XENFEAT_auto_translated_physmap)) {
			ret = alloc_empty_pages(vma, nr_pages);
			if (ret < 0)
				goto out_unlock;
		} else
			vma->vm_private_data = PRIV_VMA_LOCKED;
	} else {
		if (m.addr < vma->vm_start ||
		    m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) {
			ret = -EINVAL;
			goto out_unlock;
		}
	}

	state.domain = m.dom;
	state.vma = vma;
	state.va = m.addr;
	state.index = 0;
	state.global_error = 0;
	state.version = version;

	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
	/* mmap_batch_fn guarantees ret == 0 */
	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
				    &pagelist, mmap_batch_fn, &state));

	mmap_write_unlock(mm);

	if (state.global_error) {
		/* Write back errors in second pass. */
		state.user_gfn = (xen_pfn_t *)m.arr;
		state.user_err = m.err;
		ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
					   &pagelist, mmap_return_errors, &state);
	} else
		ret = 0;

	/* If we have not had any EFAULT-like global errors then set the global
	 * error to -ENOENT if necessary. */
	if ((ret == 0) && (state.global_error == -ENOENT))
		ret = -ENOENT;

out:
	free_page_list(&pagelist);
	return ret;

out_unlock:
	mmap_write_unlock(mm);
	goto out;
}

static int lock_pages(
	struct privcmd_dm_op_buf kbufs[], unsigned int num,
	struct page *pages[], unsigned int nr_pages, unsigned int *pinned)
{
	unsigned int i, off = 0;

	for (i = 0; i < num; ) {
		unsigned int requested;
		int page_count;

		requested = DIV_ROUND_UP(
			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
			PAGE_SIZE) - off;
		if (requested > nr_pages)
			return -ENOSPC;

		page_count = pin_user_pages_fast(
			(unsigned long)kbufs[i].uptr + off * PAGE_SIZE,
			requested, FOLL_WRITE, pages);
		if (page_count <= 0)
			return page_count ? : -EFAULT;

		*pinned += page_count;
		nr_pages -= page_count;
		pages += page_count;

		off = (requested == page_count) ? 0 : off + page_count;
		i += !off;
	}

	return 0;
}

static void unlock_pages(struct page *pages[], unsigned int nr_pages)
{
	unpin_user_pages_dirty_lock(pages, nr_pages, true);
}

static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_dm_op kdata;
	struct privcmd_dm_op_buf *kbufs;
	unsigned int nr_pages = 0;
	struct page **pages = NULL;
	struct xen_dm_op_buf *xbufs = NULL;
	unsigned int i;
	long rc;
	unsigned int pinned = 0;

	if (copy_from_user(&kdata, udata, sizeof(kdata)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
		return -EPERM;

	if (kdata.num == 0)
		return 0;

	if (kdata.num > privcmd_dm_op_max_num)
		return -E2BIG;

	kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL);
	if (!kbufs)
		return -ENOMEM;

	if (copy_from_user(kbufs, kdata.ubufs,
			   sizeof(*kbufs) * kdata.num)) {
		rc = -EFAULT;
		goto out;
	}

	for (i = 0; i < kdata.num; i++) {
		if (kbufs[i].size > privcmd_dm_op_buf_max_size) {
			rc = -E2BIG;
			goto out;
		}

		if (!access_ok(kbufs[i].uptr,
			       kbufs[i].size)) {
			rc = -EFAULT;
			goto out;
		}

		nr_pages += DIV_ROUND_UP(
			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
			PAGE_SIZE);
	}

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages) {
		rc = -ENOMEM;
		goto out;
	}

	xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL);
	if (!xbufs) {
		rc = -ENOMEM;
		goto out;
	}

	rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
	if (rc < 0)
		goto out;

	for (i = 0; i < kdata.num; i++) {
		set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
		xbufs[i].size = kbufs[i].size;
	}

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs);
	xen_preemptible_hcall_end();

out:
	unlock_pages(pages, pinned);
	kfree(xbufs);
	kfree(pages);
	kfree(kbufs);

	return rc;
}

static long privcmd_ioctl_restrict(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	domid_t dom;

	if (copy_from_user(&dom, udata, sizeof(dom)))
		return -EFAULT;

	/* Set restriction to the specified domain, or check it matches */
	if (data->domid == DOMID_INVALID)
		data->domid = dom;
	else if (data->domid != dom)
		return -EINVAL;

	return 0;
}

static long privcmd_ioctl_mmap_resource(struct file *file,
				struct privcmd_mmap_resource __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct privcmd_mmap_resource kdata;
	xen_pfn_t *pfns = NULL;
	struct xen_mem_acquire_resource xdata = { };
	int rc;

	if (copy_from_user(&kdata, udata, sizeof(kdata)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
		return -EPERM;

	/* Both fields must be set or unset */
	if (!!kdata.addr != !!kdata.num)
		return -EINVAL;

	xdata.domid = kdata.dom;
	xdata.type = kdata.type;
	xdata.id = kdata.id;

	if (!kdata.addr && !kdata.num) {
		/*
		 * Query the size of the resource.
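		 * Leaving nr_frames at zero makes Xen report the number of
		 * frames backing the resource in xdata.nr_frames.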
		 */
		rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
		if (rc)
			return rc;
		return __put_user(xdata.nr_frames, &udata->num);
	}

	mmap_write_lock(mm);

	vma = find_vma(mm, kdata.addr);
	if (!vma || vma->vm_ops != &privcmd_vm_ops) {
		rc = -EINVAL;
		goto out;
	}

	pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL | __GFP_NOWARN);
	if (!pfns) {
		rc = -ENOMEM;
		goto out;
	}

	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
	    xen_feature(XENFEAT_auto_translated_physmap)) {
		unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE);
		struct page **pages;
		unsigned int i;

		rc = alloc_empty_pages(vma, nr);
		if (rc < 0)
			goto out;

		pages = vma->vm_private_data;

		for (i = 0; i < kdata.num; i++) {
			xen_pfn_t pfn =
				page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);

			pfns[i] = pfn + (i % XEN_PFN_PER_PAGE);
		}
	} else
		vma->vm_private_data = PRIV_VMA_LOCKED;

	xdata.frame = kdata.idx;
	xdata.nr_frames = kdata.num;
	set_xen_guest_handle(xdata.frame_list, pfns);

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
	xen_preemptible_hcall_end();

	if (rc)
		goto out;

	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
	    xen_feature(XENFEAT_auto_translated_physmap)) {
		rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT);
	} else {
		unsigned int domid =
			(xdata.flags & XENMEM_rsrc_acq_caller_owned) ?
			DOMID_SELF : kdata.dom;
		int num, *errs = (int *)pfns;

		BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns));
		num = xen_remap_domain_mfn_array(vma,
						 kdata.addr & PAGE_MASK,
						 pfns, kdata.num, errs,
						 vma->vm_page_prot,
						 domid);
		if (num < 0)
			rc = num;
		else if (num != kdata.num) {
			unsigned int i;

			for (i = 0; i < num; i++) {
				rc = errs[i];
				if (rc < 0)
					break;
			}
		} else
			rc = 0;
	}

out:
	mmap_write_unlock(mm);
	kfree(pfns);

	return rc;
}

#ifdef CONFIG_XEN_PRIVCMD_EVENTFD
/* Irqfd support */
static struct workqueue_struct *irqfd_cleanup_wq;
static DEFINE_SPINLOCK(irqfds_lock);
DEFINE_STATIC_SRCU(irqfds_srcu);
static LIST_HEAD(irqfds_list);

struct privcmd_kernel_irqfd {
	struct xen_dm_op_buf xbufs;
	domid_t dom;
	bool error;
	struct eventfd_ctx *eventfd;
	struct work_struct shutdown;
	wait_queue_entry_t wait;
	struct list_head list;
	poll_table pt;
};

static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
{
	lockdep_assert_held(&irqfds_lock);

	list_del_init(&kirqfd->list);
	queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
}

static void irqfd_shutdown(struct work_struct *work)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(work, struct privcmd_kernel_irqfd, shutdown);
	u64 cnt;

	/* Make sure irqfd has been initialized in assign path */
	synchronize_srcu(&irqfds_srcu);

	eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
	eventfd_ctx_put(kirqfd->eventfd);
	kfree(kirqfd);
}

static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
{
	u64 cnt;
	long rc;

	eventfd_ctx_do_read(kirqfd->eventfd, &cnt);

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs);
	xen_preemptible_hcall_end();

	/* Don't repeat the error message for consecutive failures */
	if (rc && !kirqfd->error) {
		pr_err("Failed to configure irq for guest domain: %d\n",
		       kirqfd->dom);
	}

	kirqfd->error = rc;
}

static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(wait, struct privcmd_kernel_irqfd, wait);
	__poll_t flags = key_to_poll(key);

	if (flags & EPOLLIN)
		irqfd_inject(kirqfd);

	if (flags & EPOLLHUP) {
		unsigned long flags;

		spin_lock_irqsave(&irqfds_lock, flags);
		irqfd_deactivate(kirqfd);
		spin_unlock_irqrestore(&irqfds_lock, flags);
	}

	return 0;
}

static void
irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(pt, struct privcmd_kernel_irqfd, pt);

	add_wait_queue_priority(wqh, &kirqfd->wait);
}

static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
{
	struct privcmd_kernel_irqfd *kirqfd, *tmp;
	unsigned long flags;
	__poll_t events;
	struct fd f;
	void *dm_op;
	int ret, idx;

	kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL);
	if (!kirqfd)
		return -ENOMEM;
	dm_op = kirqfd + 1;

	if (copy_from_user(dm_op, u64_to_user_ptr(irqfd->dm_op), irqfd->size)) {
		ret = -EFAULT;
		goto error_kfree;
	}

	kirqfd->xbufs.size = irqfd->size;
	set_xen_guest_handle(kirqfd->xbufs.h, dm_op);
	kirqfd->dom = irqfd->dom;
	INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);

	f = fdget(irqfd->fd);
	if (!f.file) {
		ret = -EBADF;
		goto error_kfree;
	}

	kirqfd->eventfd = eventfd_ctx_fileget(f.file);
	if (IS_ERR(kirqfd->eventfd)) {
		ret = PTR_ERR(kirqfd->eventfd);
		goto error_fd_put;
	}

	/*
	 * Install our own custom wake-up handling so we are notified via a
	 * callback whenever someone signals the underlying eventfd.
	 */
	init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
	init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry(tmp, &irqfds_list, list) {
		if (kirqfd->eventfd == tmp->eventfd) {
			ret = -EBUSY;
			spin_unlock_irqrestore(&irqfds_lock, flags);
			goto error_eventfd;
		}
	}

	idx = srcu_read_lock(&irqfds_srcu);
	list_add_tail(&kirqfd->list, &irqfds_list);
	spin_unlock_irqrestore(&irqfds_lock, flags);

	/*
	 * Check if there was an event already pending on the eventfd before we
	 * registered, and trigger it as if we didn't miss it.
	 */
	events = vfs_poll(f.file, &kirqfd->pt);
	if (events & EPOLLIN)
		irqfd_inject(kirqfd);

	srcu_read_unlock(&irqfds_srcu, idx);

	/*
	 * Do not drop the file until the kirqfd is fully initialized, otherwise
	 * we might race against the EPOLLHUP.
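	 * (i.e. the eventfd file being released and signalling EPOLLHUP on
	 * our wait queue entry while kirqfd is still being set up).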
	 */
	fdput(f);
	return 0;

error_eventfd:
	eventfd_ctx_put(kirqfd->eventfd);

error_fd_put:
	fdput(f);

error_kfree:
	kfree(kirqfd);
	return ret;
}

static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
{
	struct privcmd_kernel_irqfd *kirqfd;
	struct eventfd_ctx *eventfd;
	unsigned long flags;

	eventfd = eventfd_ctx_fdget(irqfd->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry(kirqfd, &irqfds_list, list) {
		if (kirqfd->eventfd == eventfd) {
			irqfd_deactivate(kirqfd);
			break;
		}
	}

	spin_unlock_irqrestore(&irqfds_lock, flags);

	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed so
	 * that we guarantee there will not be any more interrupts once this
	 * deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_irqfd irqfd;

	if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
		return -EFAULT;

	/* No other flags should be set */
	if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
		return -EINVAL;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
		return -EPERM;

	if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
		return privcmd_irqfd_deassign(&irqfd);

	return privcmd_irqfd_assign(&irqfd);
}

static int privcmd_irqfd_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

static void privcmd_irqfd_exit(void)
{
	struct privcmd_kernel_irqfd *kirqfd, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
		irqfd_deactivate(kirqfd);

	spin_unlock_irqrestore(&irqfds_lock, flags);

	destroy_workqueue(irqfd_cleanup_wq);
}

/* Ioeventfd Support */
#define QUEUE_NOTIFY_VQ_MASK 0xFFFF

static DEFINE_MUTEX(ioreq_lock);
static LIST_HEAD(ioreq_list);

/* per-eventfd structure */
struct privcmd_kernel_ioeventfd {
	struct eventfd_ctx *eventfd;
	struct list_head list;
	u64 addr;
	unsigned int addr_len;
	unsigned int vq;
};

/* per-guest CPU / port structure */
struct ioreq_port {
	int vcpu;
	unsigned int port;
	struct privcmd_kernel_ioreq *kioreq;
};

/* per-guest structure */
struct privcmd_kernel_ioreq {
	domid_t dom;
	unsigned int vcpus;
	u64 uioreq;
	struct ioreq *ioreq;
	spinlock_t lock; /* Protects ioeventfds list */
	struct list_head ioeventfds;
	struct list_head list;
	struct ioreq_port ports[] __counted_by(vcpus);
};

static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id)
{
	struct ioreq_port *port = dev_id;
	struct privcmd_kernel_ioreq *kioreq = port->kioreq;
	struct ioreq *ioreq = &kioreq->ioreq[port->vcpu];
	struct privcmd_kernel_ioeventfd *kioeventfd;
	unsigned int state = STATE_IOREQ_READY;

	if (ioreq->state != STATE_IOREQ_READY ||
	    ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE)
		return IRQ_NONE;

	/*
	 * We need a barrier, smp_mb(), here to ensure reads are finished before
	 * `state` is updated. Since the lock implementation ensures that
	 * appropriate barrier will be added anyway, we can avoid adding
	 * explicit barrier here.
	 *
	 * Ideally we don't need to update `state` within the locks, but we do
	 * that here to avoid adding explicit barrier.
	 */

	spin_lock(&kioreq->lock);
	ioreq->state = STATE_IOREQ_INPROCESS;

	list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
		if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY &&
		    ioreq->size == kioeventfd->addr_len &&
		    (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) {
			eventfd_signal(kioeventfd->eventfd);
			state = STATE_IORESP_READY;
			break;
		}
	}
	spin_unlock(&kioreq->lock);

	/*
	 * We need a barrier, smp_mb(), here to ensure writes are finished
	 * before `state` is updated. Since the lock implementation ensures that
	 * appropriate barrier will be added anyway, we can avoid adding
	 * explicit barrier here.
	 */

	ioreq->state = state;

	if (state == STATE_IORESP_READY) {
		notify_remote_via_evtchn(port->port);
		return IRQ_HANDLED;
	}

	return IRQ_NONE;
}

static void ioreq_free(struct privcmd_kernel_ioreq *kioreq)
{
	struct ioreq_port *ports = kioreq->ports;
	int i;

	lockdep_assert_held(&ioreq_lock);

	list_del(&kioreq->list);

	for (i = kioreq->vcpus - 1; i >= 0; i--)
		unbind_from_irqhandler(irq_from_evtchn(ports[i].port), &ports[i]);

	kfree(kioreq);
}

static
struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd)
{
	struct privcmd_kernel_ioreq *kioreq;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct page **pages;
	unsigned int *ports;
	int ret, size, i;

	lockdep_assert_held(&ioreq_lock);

	size = struct_size(kioreq, ports, ioeventfd->vcpus);
	kioreq = kzalloc(size, GFP_KERNEL);
	if (!kioreq)
		return ERR_PTR(-ENOMEM);

	kioreq->dom = ioeventfd->dom;
	kioreq->vcpus = ioeventfd->vcpus;
	kioreq->uioreq = ioeventfd->ioreq;
	spin_lock_init(&kioreq->lock);
	INIT_LIST_HEAD(&kioreq->ioeventfds);

	/* The memory for ioreq server must have been mapped earlier */
	mmap_write_lock(mm);
	vma = find_vma(mm, (unsigned long)ioeventfd->ioreq);
	if (!vma) {
		pr_err("Failed to find vma for ioreq page!\n");
		mmap_write_unlock(mm);
		ret = -EFAULT;
		goto error_kfree;
	}

	pages = vma->vm_private_data;
	kioreq->ioreq = (struct ioreq *)(page_to_virt(pages[0]));
	mmap_write_unlock(mm);

	ports = memdup_array_user(u64_to_user_ptr(ioeventfd->ports),
				  kioreq->vcpus, sizeof(*ports));
	if (IS_ERR(ports)) {
		ret = PTR_ERR(ports);
		goto error_kfree;
	}

	for (i = 0; i < kioreq->vcpus; i++) {
		kioreq->ports[i].vcpu = i;
		kioreq->ports[i].port = ports[i];
		kioreq->ports[i].kioreq = kioreq;

		ret = bind_evtchn_to_irqhandler_lateeoi(ports[i],
				ioeventfd_interrupt, IRQF_SHARED, "ioeventfd",
				&kioreq->ports[i]);
		if (ret < 0)
			goto error_unbind;
	}

	kfree(ports);

	list_add_tail(&kioreq->list, &ioreq_list);

	return kioreq;

error_unbind:
	while (--i >= 0)
		unbind_from_irqhandler(irq_from_evtchn(ports[i]), &kioreq->ports[i]);

	kfree(ports);
error_kfree:
	kfree(kioreq);
	return ERR_PTR(ret);
}

static struct privcmd_kernel_ioreq *
get_ioreq(struct privcmd_ioeventfd *ioeventfd, struct eventfd_ctx *eventfd)
{
	struct privcmd_kernel_ioreq *kioreq;
	unsigned long flags;

	list_for_each_entry(kioreq, &ioreq_list, list) {
		struct privcmd_kernel_ioeventfd *kioeventfd;

		/*
		 * kioreq fields can be accessed here without a lock as they are
		 * never updated after being added to the ioreq_list.
		 */
		if (kioreq->uioreq != ioeventfd->ioreq) {
			continue;
		} else if (kioreq->dom != ioeventfd->dom ||
			   kioreq->vcpus != ioeventfd->vcpus) {
			pr_err("Invalid ioeventfd configuration mismatch, dom (%u vs %u), vcpus (%u vs %u)\n",
			       kioreq->dom, ioeventfd->dom, kioreq->vcpus,
			       ioeventfd->vcpus);
			return ERR_PTR(-EINVAL);
		}

		/* Look for a duplicate eventfd for the same guest */
		spin_lock_irqsave(&kioreq->lock, flags);
		list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
			if (eventfd == kioeventfd->eventfd) {
				spin_unlock_irqrestore(&kioreq->lock, flags);
				return ERR_PTR(-EBUSY);
			}
		}
		spin_unlock_irqrestore(&kioreq->lock, flags);

		return kioreq;
	}

	/* Matching kioreq isn't found, allocate a new one */
	return alloc_ioreq(ioeventfd);
}

static void ioeventfd_free(struct privcmd_kernel_ioeventfd *kioeventfd)
{
	list_del(&kioeventfd->list);
	eventfd_ctx_put(kioeventfd->eventfd);
	kfree(kioeventfd);
}

static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd)
{
	struct privcmd_kernel_ioeventfd *kioeventfd;
	struct privcmd_kernel_ioreq *kioreq;
	unsigned long flags;
	struct fd f;
	int ret;

	/* Check for range overflow */
	if (ioeventfd->addr + ioeventfd->addr_len < ioeventfd->addr)
		return -EINVAL;

	/* Vhost requires us to support length 1, 2, 4, and 8 */
	if (!(ioeventfd->addr_len == 1 || ioeventfd->addr_len == 2 ||
	      ioeventfd->addr_len == 4 || ioeventfd->addr_len == 8))
		return -EINVAL;

	/*
	 * Is a limit of 4096 vCPUs enough?
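	 * The value is only a sanity cap on the vCPU count supplied by
	 * userspace, not a limit derived from the hypervisor.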
	 */
	if (!ioeventfd->vcpus || ioeventfd->vcpus > 4096)
		return -EINVAL;

	kioeventfd = kzalloc(sizeof(*kioeventfd), GFP_KERNEL);
	if (!kioeventfd)
		return -ENOMEM;

	f = fdget(ioeventfd->event_fd);
	if (!f.file) {
		ret = -EBADF;
		goto error_kfree;
	}

	kioeventfd->eventfd = eventfd_ctx_fileget(f.file);
	fdput(f);

	if (IS_ERR(kioeventfd->eventfd)) {
		ret = PTR_ERR(kioeventfd->eventfd);
		goto error_kfree;
	}

	kioeventfd->addr = ioeventfd->addr;
	kioeventfd->addr_len = ioeventfd->addr_len;
	kioeventfd->vq = ioeventfd->vq;

	mutex_lock(&ioreq_lock);
	kioreq = get_ioreq(ioeventfd, kioeventfd->eventfd);
	if (IS_ERR(kioreq)) {
		mutex_unlock(&ioreq_lock);
		ret = PTR_ERR(kioreq);
		goto error_eventfd;
	}

	spin_lock_irqsave(&kioreq->lock, flags);
	list_add_tail(&kioeventfd->list, &kioreq->ioeventfds);
	spin_unlock_irqrestore(&kioreq->lock, flags);

	mutex_unlock(&ioreq_lock);

	return 0;

error_eventfd:
	eventfd_ctx_put(kioeventfd->eventfd);

error_kfree:
	kfree(kioeventfd);
	return ret;
}

static int privcmd_ioeventfd_deassign(struct privcmd_ioeventfd *ioeventfd)
{
	struct privcmd_kernel_ioreq *kioreq, *tkioreq;
	struct eventfd_ctx *eventfd;
	unsigned long flags;
	int ret = 0;

	eventfd = eventfd_ctx_fdget(ioeventfd->event_fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	mutex_lock(&ioreq_lock);
	list_for_each_entry_safe(kioreq, tkioreq, &ioreq_list, list) {
		struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;
		/*
		 * kioreq fields can be accessed here without a lock as they are
		 * never updated after being added to the ioreq_list.
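		 * (The ioeventfds list itself is still protected by
		 * kioreq->lock, taken below.)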
		 */
		if (kioreq->dom != ioeventfd->dom ||
		    kioreq->uioreq != ioeventfd->ioreq ||
		    kioreq->vcpus != ioeventfd->vcpus)
			continue;

		spin_lock_irqsave(&kioreq->lock, flags);
		list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) {
			if (eventfd == kioeventfd->eventfd) {
				ioeventfd_free(kioeventfd);
				spin_unlock_irqrestore(&kioreq->lock, flags);

				if (list_empty(&kioreq->ioeventfds))
					ioreq_free(kioreq);
				goto unlock;
			}
		}
		spin_unlock_irqrestore(&kioreq->lock, flags);
		break;
	}

	pr_err("Ioeventfd isn't already assigned, dom: %u, addr: %llu\n",
	       ioeventfd->dom, ioeventfd->addr);
	ret = -ENODEV;

unlock:
	mutex_unlock(&ioreq_lock);
	eventfd_ctx_put(eventfd);

	return ret;
}

static long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_ioeventfd ioeventfd;

	if (copy_from_user(&ioeventfd, udata, sizeof(ioeventfd)))
		return -EFAULT;

	/* No other flags should be set */
	if (ioeventfd.flags & ~PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
		return -EINVAL;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != ioeventfd.dom)
		return -EPERM;

	if (ioeventfd.flags & PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
		return privcmd_ioeventfd_deassign(&ioeventfd);

	return privcmd_ioeventfd_assign(&ioeventfd);
}

static void privcmd_ioeventfd_exit(void)
{
	struct privcmd_kernel_ioreq *kioreq, *tmp;
	unsigned long flags;

	mutex_lock(&ioreq_lock);
	list_for_each_entry_safe(kioreq, tmp, &ioreq_list, list) {
		struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;

		spin_lock_irqsave(&kioreq->lock, flags);
		list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list)
			ioeventfd_free(kioeventfd);
		spin_unlock_irqrestore(&kioreq->lock, flags);

		ioreq_free(kioreq);
	}
	mutex_unlock(&ioreq_lock);
}
#else
static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
{
	return -EOPNOTSUPP;
}

static inline int privcmd_irqfd_init(void)
{
	return 0;
}

static inline void privcmd_irqfd_exit(void)
{
}

static inline long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
{
	return -EOPNOTSUPP;
}

static inline void privcmd_ioeventfd_exit(void)
{
}
#endif /* CONFIG_XEN_PRIVCMD_EVENTFD */

static long privcmd_ioctl(struct file *file,
			  unsigned int cmd, unsigned long data)
{
	int ret = -ENOTTY;
	void __user *udata = (void __user *) data;

	switch (cmd) {
	case IOCTL_PRIVCMD_HYPERCALL:
		ret = privcmd_ioctl_hypercall(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAP:
		ret = privcmd_ioctl_mmap(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAPBATCH:
		ret = privcmd_ioctl_mmap_batch(file, udata, 1);
		break;

	case IOCTL_PRIVCMD_MMAPBATCH_V2:
		ret = privcmd_ioctl_mmap_batch(file, udata, 2);
		break;

	case IOCTL_PRIVCMD_DM_OP:
		ret = privcmd_ioctl_dm_op(file, udata);
		break;

	case IOCTL_PRIVCMD_RESTRICT:
		ret = privcmd_ioctl_restrict(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAP_RESOURCE:
		ret = privcmd_ioctl_mmap_resource(file, udata);
		break;

	case IOCTL_PRIVCMD_IRQFD:
		ret = privcmd_ioctl_irqfd(file, udata);
		break;

	case IOCTL_PRIVCMD_IOEVENTFD:
		ret = privcmd_ioctl_ioeventfd(file, udata);
		break;

	default:
		break;
	}

	return ret;
}

static int privcmd_open(struct inode *ino, struct file *file)
{
	struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL);

	if (!data)
		return -ENOMEM;

	/* DOMID_INVALID implies no restriction */
	data->domid = DOMID_INVALID;

	file->private_data = data;
	return 0;
}

static int privcmd_release(struct inode *ino, struct file *file)
{
	struct privcmd_data *data = file->private_data;

	kfree(data);
	return 0;
}

static void privcmd_close(struct vm_area_struct *vma)
{
	struct page **pages = vma->vm_private_data;
	int numpgs = vma_pages(vma);
	int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
	int rc;

	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
		return;

	rc = xen_unmap_domain_gfn_range(vma, numgfns, pages);
	if (rc == 0)
		xen_free_unpopulated_pages(numpgs, pages);
	else
		pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n",
			numpgs, rc);
	kvfree(pages);
}

static vm_fault_t privcmd_fault(struct vm_fault *vmf)
{
	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
	       vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end,
	       vmf->pgoff, (void *)vmf->address);

	return VM_FAULT_SIGBUS;
}

static const struct vm_operations_struct privcmd_vm_ops = {
	.close = privcmd_close,
	.fault = privcmd_fault
};

static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
	 * how to recreate these mappings */
	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY |
			  VM_DONTEXPAND | VM_DONTDUMP);
	vma->vm_ops = &privcmd_vm_ops;
	vma->vm_private_data = NULL;

	return 0;
}

/*
 * For MMAPBATCH*. This allows asserting the singleshot mapping
 * on a per pfn/pte basis. Mapping calls that fail with ENOENT
 * can be then retried until success.
 */
static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data)
{
	return pte_none(ptep_get(pte)) ? 0 : -EBUSY;
}

static int privcmd_vma_range_is_mapped(
	       struct vm_area_struct *vma,
	       unsigned long addr,
	       unsigned long nr_pages)
{
	return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT,
				   is_mapped_fn, NULL) != 0;
}

const struct file_operations xen_privcmd_fops = {
	.owner = THIS_MODULE,
	.unlocked_ioctl = privcmd_ioctl,
	.open = privcmd_open,
	.release = privcmd_release,
	.mmap = privcmd_mmap,
};
EXPORT_SYMBOL_GPL(xen_privcmd_fops);

static struct miscdevice privcmd_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "xen/privcmd",
	.fops = &xen_privcmd_fops,
};

static int __init privcmd_init(void)
{
	int err;

	if (!xen_domain())
		return -ENODEV;

	err = misc_register(&privcmd_dev);
	if (err != 0) {
		pr_err("Could not register Xen privcmd device\n");
		return err;
	}

	err = misc_register(&xen_privcmdbuf_dev);
	if (err != 0) {
		pr_err("Could not register Xen hypercall-buf device\n");
		goto err_privcmdbuf;
	}

	err = privcmd_irqfd_init();
	if (err != 0) {
		pr_err("irqfd init failed\n");
		goto err_irqfd;
	}

	return 0;

err_irqfd:
	misc_deregister(&xen_privcmdbuf_dev);
err_privcmdbuf:
	misc_deregister(&privcmd_dev);
	return err;
}

static void __exit privcmd_exit(void)
{
	privcmd_ioeventfd_exit();
	privcmd_irqfd_exit();
	misc_deregister(&privcmd_dev);
	misc_deregister(&xen_privcmdbuf_dev);
}

module_init(privcmd_init);
module_exit(privcmd_exit);