1 // SPDX-License-Identifier: GPL-2.0-only 2 /****************************************************************************** 3 * privcmd.c 4 * 5 * Interface to privileged domain-0 commands. 6 * 7 * Copyright (c) 2002-2004, K A Fraser, B Dragovic 8 */ 9 10 #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt 11 12 #include <linux/eventfd.h> 13 #include <linux/file.h> 14 #include <linux/kernel.h> 15 #include <linux/kstrtox.h> 16 #include <linux/module.h> 17 #include <linux/mutex.h> 18 #include <linux/poll.h> 19 #include <linux/sched.h> 20 #include <linux/slab.h> 21 #include <linux/srcu.h> 22 #include <linux/string.h> 23 #include <linux/workqueue.h> 24 #include <linux/errno.h> 25 #include <linux/mm.h> 26 #include <linux/mman.h> 27 #include <linux/uaccess.h> 28 #include <linux/swap.h> 29 #include <linux/highmem.h> 30 #include <linux/pagemap.h> 31 #include <linux/seq_file.h> 32 #include <linux/miscdevice.h> 33 #include <linux/moduleparam.h> 34 #include <linux/notifier.h> 35 #include <linux/security.h> 36 #include <linux/virtio_mmio.h> 37 #include <linux/wait.h> 38 39 #include <asm/xen/hypervisor.h> 40 #include <asm/xen/hypercall.h> 41 42 #include <xen/xen.h> 43 #include <xen/events.h> 44 #include <xen/privcmd.h> 45 #include <xen/interface/xen.h> 46 #include <xen/interface/memory.h> 47 #include <xen/interface/hvm/dm_op.h> 48 #include <xen/interface/hvm/ioreq.h> 49 #include <xen/features.h> 50 #include <xen/page.h> 51 #include <xen/xen-ops.h> 52 #include <xen/balloon.h> 53 #include <xen/xenbus.h> 54 #ifdef CONFIG_XEN_ACPI 55 #include <xen/acpi.h> 56 #endif 57 58 #include "privcmd.h" 59 60 MODULE_DESCRIPTION("Xen hypercall passthrough driver"); 61 MODULE_LICENSE("GPL"); 62 63 #define PRIV_VMA_LOCKED ((void *)1) 64 65 static unsigned int privcmd_dm_op_max_num = 16; 66 module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644); 67 MODULE_PARM_DESC(dm_op_max_nr_bufs, 68 "Maximum number of buffers per dm_op hypercall"); 69 70 static unsigned int privcmd_dm_op_buf_max_size = 4096; 71 module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint, 72 0644); 73 MODULE_PARM_DESC(dm_op_buf_max_size, 74 "Maximum size of a dm_op hypercall buffer"); 75 76 static bool unrestricted; 77 module_param(unrestricted, bool, 0); 78 MODULE_PARM_DESC(unrestricted, 79 "Don't restrict hypercalls to target domain if running in a domU"); 80 81 struct privcmd_data { 82 domid_t domid; 83 }; 84 85 /* DOMID_INVALID implies no restriction */ 86 static domid_t target_domain = DOMID_INVALID; 87 static bool restrict_wait; 88 static DECLARE_WAIT_QUEUE_HEAD(restrict_wait_wq); 89 90 static int privcmd_vma_range_is_mapped( 91 struct vm_area_struct *vma, 92 unsigned long addr, 93 unsigned long nr_pages); 94 95 static long privcmd_ioctl_hypercall(struct file *file, void __user *udata) 96 { 97 struct privcmd_data *data = file->private_data; 98 struct privcmd_hypercall hypercall; 99 long ret; 100 101 /* Disallow arbitrary hypercalls if restricted */ 102 if (data->domid != DOMID_INVALID) 103 return -EPERM; 104 105 if (copy_from_user(&hypercall, udata, sizeof(hypercall))) 106 return -EFAULT; 107 108 xen_preemptible_hcall_begin(); 109 ret = privcmd_call(hypercall.op, 110 hypercall.arg[0], hypercall.arg[1], 111 hypercall.arg[2], hypercall.arg[3], 112 hypercall.arg[4]); 113 xen_preemptible_hcall_end(); 114 115 return ret; 116 } 117 118 static void free_page_list(struct list_head *pages) 119 { 120 struct page *p, *n; 121 122 list_for_each_entry_safe(p, n, pages, lru) 123 __free_page(p); 124 125 INIT_LIST_HEAD(pages); 126 } 127 128 /* 129 * Given an array of items in userspace, return a list of pages 130 * containing the data. If copying fails, either because of memory 131 * allocation failure or a problem reading user memory, return an 132 * error code; its up to the caller to dispose of any partial list. 133 */ 134 static int gather_array(struct list_head *pagelist, 135 unsigned nelem, size_t size, 136 const void __user *data) 137 { 138 unsigned pageidx; 139 void *pagedata; 140 int ret; 141 142 if (size > PAGE_SIZE) 143 return 0; 144 145 pageidx = PAGE_SIZE; 146 pagedata = NULL; /* quiet, gcc */ 147 while (nelem--) { 148 if (pageidx > PAGE_SIZE-size) { 149 struct page *page = alloc_page(GFP_KERNEL); 150 151 ret = -ENOMEM; 152 if (page == NULL) 153 goto fail; 154 155 pagedata = page_address(page); 156 157 list_add_tail(&page->lru, pagelist); 158 pageidx = 0; 159 } 160 161 ret = -EFAULT; 162 if (copy_from_user(pagedata + pageidx, data, size)) 163 goto fail; 164 165 data += size; 166 pageidx += size; 167 } 168 169 ret = 0; 170 171 fail: 172 return ret; 173 } 174 175 /* 176 * Call function "fn" on each element of the array fragmented 177 * over a list of pages. 178 */ 179 static int traverse_pages(unsigned nelem, size_t size, 180 struct list_head *pos, 181 int (*fn)(void *data, void *state), 182 void *state) 183 { 184 void *pagedata; 185 unsigned pageidx; 186 int ret = 0; 187 188 BUG_ON(size > PAGE_SIZE); 189 190 pageidx = PAGE_SIZE; 191 pagedata = NULL; /* hush, gcc */ 192 193 while (nelem--) { 194 if (pageidx > PAGE_SIZE-size) { 195 struct page *page; 196 pos = pos->next; 197 page = list_entry(pos, struct page, lru); 198 pagedata = page_address(page); 199 pageidx = 0; 200 } 201 202 ret = (*fn)(pagedata + pageidx, state); 203 if (ret) 204 break; 205 pageidx += size; 206 } 207 208 return ret; 209 } 210 211 /* 212 * Similar to traverse_pages, but use each page as a "block" of 213 * data to be processed as one unit. 214 */ 215 static int traverse_pages_block(unsigned nelem, size_t size, 216 struct list_head *pos, 217 int (*fn)(void *data, int nr, void *state), 218 void *state) 219 { 220 void *pagedata; 221 int ret = 0; 222 223 BUG_ON(size > PAGE_SIZE); 224 225 while (nelem) { 226 int nr = (PAGE_SIZE/size); 227 struct page *page; 228 if (nr > nelem) 229 nr = nelem; 230 pos = pos->next; 231 page = list_entry(pos, struct page, lru); 232 pagedata = page_address(page); 233 ret = (*fn)(pagedata, nr, state); 234 if (ret) 235 break; 236 nelem -= nr; 237 } 238 239 return ret; 240 } 241 242 struct mmap_gfn_state { 243 unsigned long va; 244 struct vm_area_struct *vma; 245 domid_t domain; 246 }; 247 248 static int mmap_gfn_range(void *data, void *state) 249 { 250 struct privcmd_mmap_entry *msg = data; 251 struct mmap_gfn_state *st = state; 252 struct vm_area_struct *vma = st->vma; 253 int rc; 254 255 /* Do not allow range to wrap the address space. */ 256 if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || 257 ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) 258 return -EINVAL; 259 260 /* Range chunks must be contiguous in va space. */ 261 if ((msg->va != st->va) || 262 ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) 263 return -EINVAL; 264 265 rc = xen_remap_domain_gfn_range(vma, 266 msg->va & PAGE_MASK, 267 msg->mfn, msg->npages, 268 vma->vm_page_prot, 269 st->domain, NULL); 270 if (rc < 0) 271 return rc; 272 273 st->va += msg->npages << PAGE_SHIFT; 274 275 return 0; 276 } 277 278 static long privcmd_ioctl_mmap(struct file *file, void __user *udata) 279 { 280 struct privcmd_data *data = file->private_data; 281 struct privcmd_mmap mmapcmd; 282 struct mm_struct *mm = current->mm; 283 struct vm_area_struct *vma; 284 int rc; 285 LIST_HEAD(pagelist); 286 struct mmap_gfn_state state; 287 288 /* We only support privcmd_ioctl_mmap_batch for non-auto-translated. */ 289 if (!xen_pv_domain()) 290 return -ENOSYS; 291 292 if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) 293 return -EFAULT; 294 295 /* If restriction is in place, check the domid matches */ 296 if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom) 297 return -EPERM; 298 299 rc = gather_array(&pagelist, 300 mmapcmd.num, sizeof(struct privcmd_mmap_entry), 301 mmapcmd.entry); 302 303 if (rc || list_empty(&pagelist)) 304 goto out; 305 306 mmap_write_lock(mm); 307 308 { 309 struct page *page = list_first_entry(&pagelist, 310 struct page, lru); 311 struct privcmd_mmap_entry *msg = page_address(page); 312 313 vma = vma_lookup(mm, msg->va); 314 rc = -EINVAL; 315 316 if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) 317 goto out_up; 318 vma->vm_private_data = PRIV_VMA_LOCKED; 319 } 320 321 state.va = vma->vm_start; 322 state.vma = vma; 323 state.domain = mmapcmd.dom; 324 325 rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), 326 &pagelist, 327 mmap_gfn_range, &state); 328 329 330 out_up: 331 mmap_write_unlock(mm); 332 333 out: 334 free_page_list(&pagelist); 335 336 return rc; 337 } 338 339 struct mmap_batch_state { 340 domid_t domain; 341 unsigned long va; 342 struct vm_area_struct *vma; 343 int index; 344 /* A tristate: 345 * 0 for no errors 346 * 1 if at least one error has happened (and no 347 * -ENOENT errors have happened) 348 * -ENOENT if at least 1 -ENOENT has happened. 349 */ 350 int global_error; 351 int version; 352 353 /* User-space gfn array to store errors in the second pass for V1. */ 354 xen_pfn_t __user *user_gfn; 355 /* User-space int array to store errors in the second pass for V2. */ 356 int __user *user_err; 357 }; 358 359 /* auto translated dom0 note: if domU being created is PV, then gfn is 360 * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP). 361 */ 362 static int mmap_batch_fn(void *data, int nr, void *state) 363 { 364 xen_pfn_t *gfnp = data; 365 struct mmap_batch_state *st = state; 366 struct vm_area_struct *vma = st->vma; 367 struct page **pages = vma->vm_private_data; 368 struct page **cur_pages = NULL; 369 int ret; 370 371 if (!xen_pv_domain()) 372 cur_pages = &pages[st->index]; 373 374 BUG_ON(nr < 0); 375 ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr, 376 (int *)gfnp, st->vma->vm_page_prot, 377 st->domain, cur_pages); 378 379 /* Adjust the global_error? */ 380 if (ret != nr) { 381 if (ret == -ENOENT) 382 st->global_error = -ENOENT; 383 else { 384 /* Record that at least one error has happened. */ 385 if (st->global_error == 0) 386 st->global_error = 1; 387 } 388 } 389 st->va += XEN_PAGE_SIZE * nr; 390 st->index += nr / XEN_PFN_PER_PAGE; 391 392 return 0; 393 } 394 395 static int mmap_return_error(int err, struct mmap_batch_state *st) 396 { 397 int ret; 398 399 if (st->version == 1) { 400 if (err) { 401 xen_pfn_t gfn; 402 403 ret = get_user(gfn, st->user_gfn); 404 if (ret < 0) 405 return ret; 406 /* 407 * V1 encodes the error codes in the 32bit top 408 * nibble of the gfn (with its known 409 * limitations vis-a-vis 64 bit callers). 410 */ 411 gfn |= (err == -ENOENT) ? 412 PRIVCMD_MMAPBATCH_PAGED_ERROR : 413 PRIVCMD_MMAPBATCH_MFN_ERROR; 414 return __put_user(gfn, st->user_gfn++); 415 } else 416 st->user_gfn++; 417 } else { /* st->version == 2 */ 418 if (err) 419 return __put_user(err, st->user_err++); 420 else 421 st->user_err++; 422 } 423 424 return 0; 425 } 426 427 static int mmap_return_errors(void *data, int nr, void *state) 428 { 429 struct mmap_batch_state *st = state; 430 int *errs = data; 431 int i; 432 int ret; 433 434 for (i = 0; i < nr; i++) { 435 ret = mmap_return_error(errs[i], st); 436 if (ret < 0) 437 return ret; 438 } 439 return 0; 440 } 441 442 /* Allocate pfns that are then mapped with gfns from foreign domid. Update 443 * the vma with the page info to use later. 444 * Returns: 0 if success, otherwise -errno 445 */ 446 static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) 447 { 448 int rc; 449 struct page **pages; 450 451 pages = kvzalloc_objs(pages[0], numpgs); 452 if (pages == NULL) 453 return -ENOMEM; 454 455 rc = xen_alloc_unpopulated_pages(numpgs, pages); 456 if (rc != 0) { 457 pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, 458 numpgs, rc); 459 kvfree(pages); 460 return -ENOMEM; 461 } 462 BUG_ON(vma->vm_private_data != NULL); 463 vma->vm_private_data = pages; 464 465 return 0; 466 } 467 468 static const struct vm_operations_struct privcmd_vm_ops; 469 470 static long privcmd_ioctl_mmap_batch( 471 struct file *file, void __user *udata, int version) 472 { 473 struct privcmd_data *data = file->private_data; 474 int ret; 475 struct privcmd_mmapbatch_v2 m; 476 struct mm_struct *mm = current->mm; 477 struct vm_area_struct *vma; 478 unsigned long nr_pages; 479 LIST_HEAD(pagelist); 480 struct mmap_batch_state state; 481 482 switch (version) { 483 case 1: 484 if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) 485 return -EFAULT; 486 /* Returns per-frame error in m.arr. */ 487 m.err = NULL; 488 if (!access_ok(m.arr, m.num * sizeof(*m.arr))) 489 return -EFAULT; 490 break; 491 case 2: 492 if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) 493 return -EFAULT; 494 /* Returns per-frame error code in m.err. */ 495 if (!access_ok(m.err, m.num * (sizeof(*m.err)))) 496 return -EFAULT; 497 break; 498 default: 499 return -EINVAL; 500 } 501 502 /* If restriction is in place, check the domid matches */ 503 if (data->domid != DOMID_INVALID && data->domid != m.dom) 504 return -EPERM; 505 506 nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE); 507 if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) 508 return -EINVAL; 509 510 ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); 511 512 if (ret) 513 goto out; 514 if (list_empty(&pagelist)) { 515 ret = -EINVAL; 516 goto out; 517 } 518 519 if (version == 2) { 520 /* Zero error array now to only copy back actual errors. */ 521 if (clear_user(m.err, sizeof(int) * m.num)) { 522 ret = -EFAULT; 523 goto out; 524 } 525 } 526 527 mmap_write_lock(mm); 528 529 vma = find_vma(mm, m.addr); 530 if (!vma || 531 vma->vm_ops != &privcmd_vm_ops) { 532 ret = -EINVAL; 533 goto out_unlock; 534 } 535 536 /* 537 * Caller must either: 538 * 539 * Map the whole VMA range, which will also allocate all the 540 * pages required for the auto_translated_physmap case. 541 * 542 * Or 543 * 544 * Map unmapped holes left from a previous map attempt (e.g., 545 * because those foreign frames were previously paged out). 546 */ 547 if (vma->vm_private_data == NULL) { 548 if (m.addr != vma->vm_start || 549 m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) { 550 ret = -EINVAL; 551 goto out_unlock; 552 } 553 if (!xen_pv_domain()) { 554 ret = alloc_empty_pages(vma, nr_pages); 555 if (ret < 0) 556 goto out_unlock; 557 } else 558 vma->vm_private_data = PRIV_VMA_LOCKED; 559 } else { 560 if (m.addr < vma->vm_start || 561 m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) { 562 ret = -EINVAL; 563 goto out_unlock; 564 } 565 if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) { 566 ret = -EINVAL; 567 goto out_unlock; 568 } 569 } 570 571 state.domain = m.dom; 572 state.vma = vma; 573 state.va = m.addr; 574 state.index = 0; 575 state.global_error = 0; 576 state.version = version; 577 578 BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0); 579 /* mmap_batch_fn guarantees ret == 0 */ 580 BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t), 581 &pagelist, mmap_batch_fn, &state)); 582 583 mmap_write_unlock(mm); 584 585 if (state.global_error) { 586 /* Write back errors in second pass. */ 587 state.user_gfn = (xen_pfn_t *)m.arr; 588 state.user_err = m.err; 589 ret = traverse_pages_block(m.num, sizeof(xen_pfn_t), 590 &pagelist, mmap_return_errors, &state); 591 } else 592 ret = 0; 593 594 /* If we have not had any EFAULT-like global errors then set the global 595 * error to -ENOENT if necessary. */ 596 if ((ret == 0) && (state.global_error == -ENOENT)) 597 ret = -ENOENT; 598 599 out: 600 free_page_list(&pagelist); 601 return ret; 602 603 out_unlock: 604 mmap_write_unlock(mm); 605 goto out; 606 } 607 608 static int lock_pages( 609 struct privcmd_dm_op_buf kbufs[], unsigned int num, 610 struct page *pages[], unsigned int nr_pages, unsigned int *pinned) 611 { 612 unsigned int i, off = 0; 613 614 for (i = 0; i < num; ) { 615 unsigned int requested; 616 int page_count; 617 618 requested = DIV_ROUND_UP( 619 offset_in_page(kbufs[i].uptr) + kbufs[i].size, 620 PAGE_SIZE) - off; 621 if (requested > nr_pages) 622 return -ENOSPC; 623 624 page_count = pin_user_pages_fast( 625 (unsigned long)kbufs[i].uptr + off * PAGE_SIZE, 626 requested, FOLL_WRITE, pages); 627 if (page_count <= 0) 628 return page_count ? : -EFAULT; 629 630 *pinned += page_count; 631 nr_pages -= page_count; 632 pages += page_count; 633 634 off = (requested == page_count) ? 0 : off + page_count; 635 i += !off; 636 } 637 638 return 0; 639 } 640 641 static void unlock_pages(struct page *pages[], unsigned int nr_pages) 642 { 643 unpin_user_pages_dirty_lock(pages, nr_pages, true); 644 } 645 646 static long privcmd_ioctl_dm_op(struct file *file, void __user *udata) 647 { 648 struct privcmd_data *data = file->private_data; 649 struct privcmd_dm_op kdata; 650 struct privcmd_dm_op_buf *kbufs; 651 unsigned int nr_pages = 0; 652 struct page **pages = NULL; 653 struct xen_dm_op_buf *xbufs = NULL; 654 unsigned int i; 655 long rc; 656 unsigned int pinned = 0; 657 658 if (copy_from_user(&kdata, udata, sizeof(kdata))) 659 return -EFAULT; 660 661 /* If restriction is in place, check the domid matches */ 662 if (data->domid != DOMID_INVALID && data->domid != kdata.dom) 663 return -EPERM; 664 665 if (kdata.num == 0) 666 return 0; 667 668 if (kdata.num > privcmd_dm_op_max_num) 669 return -E2BIG; 670 671 kbufs = kzalloc_objs(*kbufs, kdata.num); 672 if (!kbufs) 673 return -ENOMEM; 674 675 if (copy_from_user(kbufs, kdata.ubufs, 676 sizeof(*kbufs) * kdata.num)) { 677 rc = -EFAULT; 678 goto out; 679 } 680 681 for (i = 0; i < kdata.num; i++) { 682 if (kbufs[i].size > privcmd_dm_op_buf_max_size) { 683 rc = -E2BIG; 684 goto out; 685 } 686 687 if (!access_ok(kbufs[i].uptr, 688 kbufs[i].size)) { 689 rc = -EFAULT; 690 goto out; 691 } 692 693 nr_pages += DIV_ROUND_UP( 694 offset_in_page(kbufs[i].uptr) + kbufs[i].size, 695 PAGE_SIZE); 696 } 697 698 pages = kzalloc_objs(*pages, nr_pages); 699 if (!pages) { 700 rc = -ENOMEM; 701 goto out; 702 } 703 704 xbufs = kzalloc_objs(*xbufs, kdata.num); 705 if (!xbufs) { 706 rc = -ENOMEM; 707 goto out; 708 } 709 710 rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned); 711 if (rc < 0) 712 goto out; 713 714 for (i = 0; i < kdata.num; i++) { 715 set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr); 716 xbufs[i].size = kbufs[i].size; 717 } 718 719 xen_preemptible_hcall_begin(); 720 rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs); 721 xen_preemptible_hcall_end(); 722 723 out: 724 unlock_pages(pages, pinned); 725 kfree(xbufs); 726 kfree(pages); 727 kfree(kbufs); 728 729 return rc; 730 } 731 732 static long privcmd_ioctl_restrict(struct file *file, void __user *udata) 733 { 734 struct privcmd_data *data = file->private_data; 735 domid_t dom; 736 737 if (copy_from_user(&dom, udata, sizeof(dom))) 738 return -EFAULT; 739 740 /* Set restriction to the specified domain, or check it matches */ 741 if (data->domid == DOMID_INVALID) 742 data->domid = dom; 743 else if (data->domid != dom) 744 return -EINVAL; 745 746 return 0; 747 } 748 749 static long privcmd_ioctl_mmap_resource(struct file *file, 750 struct privcmd_mmap_resource __user *udata) 751 { 752 struct privcmd_data *data = file->private_data; 753 struct mm_struct *mm = current->mm; 754 struct vm_area_struct *vma; 755 struct privcmd_mmap_resource kdata; 756 xen_pfn_t *pfns = NULL; 757 struct xen_mem_acquire_resource xdata = { }; 758 int rc; 759 760 if (copy_from_user(&kdata, udata, sizeof(kdata))) 761 return -EFAULT; 762 763 /* If restriction is in place, check the domid matches */ 764 if (data->domid != DOMID_INVALID && data->domid != kdata.dom) 765 return -EPERM; 766 767 /* Both fields must be set or unset */ 768 if (!!kdata.addr != !!kdata.num) 769 return -EINVAL; 770 771 xdata.domid = kdata.dom; 772 xdata.type = kdata.type; 773 xdata.id = kdata.id; 774 775 if (!kdata.addr && !kdata.num) { 776 /* Query the size of the resource. */ 777 rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata); 778 if (rc) 779 return rc; 780 return __put_user(xdata.nr_frames, &udata->num); 781 } 782 783 mmap_write_lock(mm); 784 785 vma = find_vma(mm, kdata.addr); 786 if (!vma || vma->vm_ops != &privcmd_vm_ops) { 787 rc = -EINVAL; 788 goto out; 789 } 790 791 pfns = kzalloc_objs(*pfns, kdata.num, GFP_KERNEL | __GFP_NOWARN); 792 if (!pfns) { 793 rc = -ENOMEM; 794 goto out; 795 } 796 797 if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) { 798 unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE); 799 struct page **pages; 800 unsigned int i; 801 802 rc = alloc_empty_pages(vma, nr); 803 if (rc < 0) 804 goto out; 805 806 pages = vma->vm_private_data; 807 808 for (i = 0; i < kdata.num; i++) { 809 xen_pfn_t pfn = 810 page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]); 811 812 pfns[i] = pfn + (i % XEN_PFN_PER_PAGE); 813 } 814 } else 815 vma->vm_private_data = PRIV_VMA_LOCKED; 816 817 xdata.frame = kdata.idx; 818 xdata.nr_frames = kdata.num; 819 set_xen_guest_handle(xdata.frame_list, pfns); 820 821 xen_preemptible_hcall_begin(); 822 rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata); 823 xen_preemptible_hcall_end(); 824 825 if (rc) 826 goto out; 827 828 if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) { 829 rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT); 830 } else { 831 unsigned int domid = 832 (xdata.flags & XENMEM_rsrc_acq_caller_owned) ? 833 DOMID_SELF : kdata.dom; 834 int num, *errs = (int *)pfns; 835 836 BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns)); 837 num = xen_remap_domain_mfn_array(vma, 838 kdata.addr & PAGE_MASK, 839 pfns, kdata.num, errs, 840 vma->vm_page_prot, 841 domid); 842 if (num < 0) 843 rc = num; 844 else if (num != kdata.num) { 845 unsigned int i; 846 847 for (i = 0; i < num; i++) { 848 rc = errs[i]; 849 if (rc < 0) 850 break; 851 } 852 } else 853 rc = 0; 854 } 855 856 out: 857 mmap_write_unlock(mm); 858 kfree(pfns); 859 860 return rc; 861 } 862 863 static long privcmd_ioctl_pcidev_get_gsi(struct file *file, void __user *udata) 864 { 865 #if defined(CONFIG_XEN_ACPI) 866 int rc; 867 struct privcmd_pcidev_get_gsi kdata; 868 869 if (copy_from_user(&kdata, udata, sizeof(kdata))) 870 return -EFAULT; 871 872 rc = xen_acpi_get_gsi_from_sbdf(kdata.sbdf); 873 if (rc < 0) 874 return rc; 875 876 kdata.gsi = rc; 877 if (copy_to_user(udata, &kdata, sizeof(kdata))) 878 return -EFAULT; 879 880 return 0; 881 #else 882 return -EINVAL; 883 #endif 884 } 885 886 #ifdef CONFIG_XEN_PRIVCMD_EVENTFD 887 /* Irqfd support */ 888 static struct workqueue_struct *irqfd_cleanup_wq; 889 static DEFINE_SPINLOCK(irqfds_lock); 890 DEFINE_STATIC_SRCU(irqfds_srcu); 891 static LIST_HEAD(irqfds_list); 892 893 struct privcmd_kernel_irqfd { 894 struct xen_dm_op_buf xbufs; 895 domid_t dom; 896 bool error; 897 struct eventfd_ctx *eventfd; 898 struct work_struct shutdown; 899 wait_queue_entry_t wait; 900 struct list_head list; 901 poll_table pt; 902 }; 903 904 static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd) 905 { 906 lockdep_assert_held(&irqfds_lock); 907 908 list_del_init(&kirqfd->list); 909 queue_work(irqfd_cleanup_wq, &kirqfd->shutdown); 910 } 911 912 static void irqfd_shutdown(struct work_struct *work) 913 { 914 struct privcmd_kernel_irqfd *kirqfd = 915 container_of(work, struct privcmd_kernel_irqfd, shutdown); 916 u64 cnt; 917 918 /* Make sure irqfd has been initialized in assign path */ 919 synchronize_srcu(&irqfds_srcu); 920 921 eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt); 922 eventfd_ctx_put(kirqfd->eventfd); 923 kfree(kirqfd); 924 } 925 926 static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd) 927 { 928 u64 cnt; 929 long rc; 930 931 eventfd_ctx_do_read(kirqfd->eventfd, &cnt); 932 933 xen_preemptible_hcall_begin(); 934 rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs); 935 xen_preemptible_hcall_end(); 936 937 /* Don't repeat the error message for consecutive failures */ 938 if (rc && !kirqfd->error) { 939 pr_err("Failed to configure irq for guest domain: %d\n", 940 kirqfd->dom); 941 } 942 943 kirqfd->error = rc; 944 } 945 946 static int 947 irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) 948 { 949 struct privcmd_kernel_irqfd *kirqfd = 950 container_of(wait, struct privcmd_kernel_irqfd, wait); 951 __poll_t flags = key_to_poll(key); 952 953 if (flags & EPOLLIN) 954 irqfd_inject(kirqfd); 955 956 if (flags & EPOLLHUP) { 957 unsigned long flags; 958 959 spin_lock_irqsave(&irqfds_lock, flags); 960 irqfd_deactivate(kirqfd); 961 spin_unlock_irqrestore(&irqfds_lock, flags); 962 } 963 964 return 0; 965 } 966 967 static void 968 irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) 969 { 970 struct privcmd_kernel_irqfd *kirqfd = 971 container_of(pt, struct privcmd_kernel_irqfd, pt); 972 973 add_wait_queue_priority(wqh, &kirqfd->wait); 974 } 975 976 static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd) 977 { 978 struct privcmd_kernel_irqfd *kirqfd, *tmp; 979 unsigned long flags; 980 __poll_t events; 981 void *dm_op; 982 int ret, idx; 983 984 CLASS(fd, f)(irqfd->fd); 985 986 kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL); 987 if (!kirqfd) 988 return -ENOMEM; 989 dm_op = kirqfd + 1; 990 991 if (copy_from_user(dm_op, u64_to_user_ptr(irqfd->dm_op), irqfd->size)) { 992 ret = -EFAULT; 993 goto error_kfree; 994 } 995 996 kirqfd->xbufs.size = irqfd->size; 997 set_xen_guest_handle(kirqfd->xbufs.h, dm_op); 998 kirqfd->dom = irqfd->dom; 999 INIT_WORK(&kirqfd->shutdown, irqfd_shutdown); 1000 1001 if (fd_empty(f)) { 1002 ret = -EBADF; 1003 goto error_kfree; 1004 } 1005 1006 kirqfd->eventfd = eventfd_ctx_fileget(fd_file(f)); 1007 if (IS_ERR(kirqfd->eventfd)) { 1008 ret = PTR_ERR(kirqfd->eventfd); 1009 goto error_kfree; 1010 } 1011 1012 /* 1013 * Install our own custom wake-up handling so we are notified via a 1014 * callback whenever someone signals the underlying eventfd. 1015 */ 1016 init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup); 1017 init_poll_funcptr(&kirqfd->pt, irqfd_poll_func); 1018 1019 spin_lock_irqsave(&irqfds_lock, flags); 1020 1021 list_for_each_entry(tmp, &irqfds_list, list) { 1022 if (kirqfd->eventfd == tmp->eventfd) { 1023 ret = -EBUSY; 1024 spin_unlock_irqrestore(&irqfds_lock, flags); 1025 goto error_eventfd; 1026 } 1027 } 1028 1029 idx = srcu_read_lock(&irqfds_srcu); 1030 list_add_tail(&kirqfd->list, &irqfds_list); 1031 spin_unlock_irqrestore(&irqfds_lock, flags); 1032 1033 /* 1034 * Check if there was an event already pending on the eventfd before we 1035 * registered, and trigger it as if we didn't miss it. 1036 */ 1037 events = vfs_poll(fd_file(f), &kirqfd->pt); 1038 if (events & EPOLLIN) 1039 irqfd_inject(kirqfd); 1040 1041 srcu_read_unlock(&irqfds_srcu, idx); 1042 return 0; 1043 1044 error_eventfd: 1045 eventfd_ctx_put(kirqfd->eventfd); 1046 1047 error_kfree: 1048 kfree(kirqfd); 1049 return ret; 1050 } 1051 1052 static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd) 1053 { 1054 struct privcmd_kernel_irqfd *kirqfd; 1055 struct eventfd_ctx *eventfd; 1056 unsigned long flags; 1057 1058 eventfd = eventfd_ctx_fdget(irqfd->fd); 1059 if (IS_ERR(eventfd)) 1060 return PTR_ERR(eventfd); 1061 1062 spin_lock_irqsave(&irqfds_lock, flags); 1063 1064 list_for_each_entry(kirqfd, &irqfds_list, list) { 1065 if (kirqfd->eventfd == eventfd) { 1066 irqfd_deactivate(kirqfd); 1067 break; 1068 } 1069 } 1070 1071 spin_unlock_irqrestore(&irqfds_lock, flags); 1072 1073 eventfd_ctx_put(eventfd); 1074 1075 /* 1076 * Block until we know all outstanding shutdown jobs have completed so 1077 * that we guarantee there will not be any more interrupts once this 1078 * deassign function returns. 1079 */ 1080 flush_workqueue(irqfd_cleanup_wq); 1081 1082 return 0; 1083 } 1084 1085 static long privcmd_ioctl_irqfd(struct file *file, void __user *udata) 1086 { 1087 struct privcmd_data *data = file->private_data; 1088 struct privcmd_irqfd irqfd; 1089 1090 if (copy_from_user(&irqfd, udata, sizeof(irqfd))) 1091 return -EFAULT; 1092 1093 /* No other flags should be set */ 1094 if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN) 1095 return -EINVAL; 1096 1097 /* If restriction is in place, check the domid matches */ 1098 if (data->domid != DOMID_INVALID && data->domid != irqfd.dom) 1099 return -EPERM; 1100 1101 if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN) 1102 return privcmd_irqfd_deassign(&irqfd); 1103 1104 return privcmd_irqfd_assign(&irqfd); 1105 } 1106 1107 static int privcmd_irqfd_init(void) 1108 { 1109 irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", WQ_PERCPU, 1110 0); 1111 if (!irqfd_cleanup_wq) 1112 return -ENOMEM; 1113 1114 return 0; 1115 } 1116 1117 static void privcmd_irqfd_exit(void) 1118 { 1119 struct privcmd_kernel_irqfd *kirqfd, *tmp; 1120 unsigned long flags; 1121 1122 spin_lock_irqsave(&irqfds_lock, flags); 1123 1124 list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list) 1125 irqfd_deactivate(kirqfd); 1126 1127 spin_unlock_irqrestore(&irqfds_lock, flags); 1128 1129 destroy_workqueue(irqfd_cleanup_wq); 1130 } 1131 1132 /* Ioeventfd Support */ 1133 #define QUEUE_NOTIFY_VQ_MASK 0xFFFF 1134 1135 static DEFINE_MUTEX(ioreq_lock); 1136 static LIST_HEAD(ioreq_list); 1137 1138 /* per-eventfd structure */ 1139 struct privcmd_kernel_ioeventfd { 1140 struct eventfd_ctx *eventfd; 1141 struct list_head list; 1142 u64 addr; 1143 unsigned int addr_len; 1144 unsigned int vq; 1145 }; 1146 1147 /* per-guest CPU / port structure */ 1148 struct ioreq_port { 1149 int vcpu; 1150 unsigned int port; 1151 struct privcmd_kernel_ioreq *kioreq; 1152 }; 1153 1154 /* per-guest structure */ 1155 struct privcmd_kernel_ioreq { 1156 domid_t dom; 1157 unsigned int vcpus; 1158 u64 uioreq; 1159 struct ioreq *ioreq; 1160 spinlock_t lock; /* Protects ioeventfds list */ 1161 struct list_head ioeventfds; 1162 struct list_head list; 1163 struct ioreq_port ports[] __counted_by(vcpus); 1164 }; 1165 1166 static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id) 1167 { 1168 struct ioreq_port *port = dev_id; 1169 struct privcmd_kernel_ioreq *kioreq = port->kioreq; 1170 struct ioreq *ioreq = &kioreq->ioreq[port->vcpu]; 1171 struct privcmd_kernel_ioeventfd *kioeventfd; 1172 unsigned int state = STATE_IOREQ_READY; 1173 1174 if (ioreq->state != STATE_IOREQ_READY || 1175 ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE) 1176 return IRQ_NONE; 1177 1178 /* 1179 * We need a barrier, smp_mb(), here to ensure reads are finished before 1180 * `state` is updated. Since the lock implementation ensures that 1181 * appropriate barrier will be added anyway, we can avoid adding 1182 * explicit barrier here. 1183 * 1184 * Ideally we don't need to update `state` within the locks, but we do 1185 * that here to avoid adding explicit barrier. 1186 */ 1187 1188 spin_lock(&kioreq->lock); 1189 ioreq->state = STATE_IOREQ_INPROCESS; 1190 1191 list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) { 1192 if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY && 1193 ioreq->size == kioeventfd->addr_len && 1194 (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) { 1195 eventfd_signal(kioeventfd->eventfd); 1196 state = STATE_IORESP_READY; 1197 break; 1198 } 1199 } 1200 spin_unlock(&kioreq->lock); 1201 1202 /* 1203 * We need a barrier, smp_mb(), here to ensure writes are finished 1204 * before `state` is updated. Since the lock implementation ensures that 1205 * appropriate barrier will be added anyway, we can avoid adding 1206 * explicit barrier here. 1207 */ 1208 1209 ioreq->state = state; 1210 1211 if (state == STATE_IORESP_READY) { 1212 notify_remote_via_evtchn(port->port); 1213 return IRQ_HANDLED; 1214 } 1215 1216 return IRQ_NONE; 1217 } 1218 1219 static void ioreq_free(struct privcmd_kernel_ioreq *kioreq) 1220 { 1221 struct ioreq_port *ports = kioreq->ports; 1222 int i; 1223 1224 lockdep_assert_held(&ioreq_lock); 1225 1226 list_del(&kioreq->list); 1227 1228 for (i = kioreq->vcpus - 1; i >= 0; i--) 1229 unbind_from_irqhandler(irq_from_evtchn(ports[i].port), &ports[i]); 1230 1231 kfree(kioreq); 1232 } 1233 1234 static 1235 struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd) 1236 { 1237 struct privcmd_kernel_ioreq *kioreq; 1238 struct mm_struct *mm = current->mm; 1239 struct vm_area_struct *vma; 1240 struct page **pages; 1241 unsigned int *ports; 1242 int ret, size, i; 1243 1244 lockdep_assert_held(&ioreq_lock); 1245 1246 size = struct_size(kioreq, ports, ioeventfd->vcpus); 1247 kioreq = kzalloc(size, GFP_KERNEL); 1248 if (!kioreq) 1249 return ERR_PTR(-ENOMEM); 1250 1251 kioreq->dom = ioeventfd->dom; 1252 kioreq->vcpus = ioeventfd->vcpus; 1253 kioreq->uioreq = ioeventfd->ioreq; 1254 spin_lock_init(&kioreq->lock); 1255 INIT_LIST_HEAD(&kioreq->ioeventfds); 1256 1257 /* The memory for ioreq server must have been mapped earlier */ 1258 mmap_write_lock(mm); 1259 vma = find_vma(mm, (unsigned long)ioeventfd->ioreq); 1260 if (!vma) { 1261 pr_err("Failed to find vma for ioreq page!\n"); 1262 mmap_write_unlock(mm); 1263 ret = -EFAULT; 1264 goto error_kfree; 1265 } 1266 1267 pages = vma->vm_private_data; 1268 kioreq->ioreq = (struct ioreq *)(page_to_virt(pages[0])); 1269 mmap_write_unlock(mm); 1270 1271 ports = memdup_array_user(u64_to_user_ptr(ioeventfd->ports), 1272 kioreq->vcpus, sizeof(*ports)); 1273 if (IS_ERR(ports)) { 1274 ret = PTR_ERR(ports); 1275 goto error_kfree; 1276 } 1277 1278 for (i = 0; i < kioreq->vcpus; i++) { 1279 kioreq->ports[i].vcpu = i; 1280 kioreq->ports[i].port = ports[i]; 1281 kioreq->ports[i].kioreq = kioreq; 1282 1283 ret = bind_evtchn_to_irqhandler_lateeoi(ports[i], 1284 ioeventfd_interrupt, IRQF_SHARED, "ioeventfd", 1285 &kioreq->ports[i]); 1286 if (ret < 0) 1287 goto error_unbind; 1288 } 1289 1290 kfree(ports); 1291 1292 list_add_tail(&kioreq->list, &ioreq_list); 1293 1294 return kioreq; 1295 1296 error_unbind: 1297 while (--i >= 0) 1298 unbind_from_irqhandler(irq_from_evtchn(ports[i]), &kioreq->ports[i]); 1299 1300 kfree(ports); 1301 error_kfree: 1302 kfree(kioreq); 1303 return ERR_PTR(ret); 1304 } 1305 1306 static struct privcmd_kernel_ioreq * 1307 get_ioreq(struct privcmd_ioeventfd *ioeventfd, struct eventfd_ctx *eventfd) 1308 { 1309 struct privcmd_kernel_ioreq *kioreq; 1310 unsigned long flags; 1311 1312 list_for_each_entry(kioreq, &ioreq_list, list) { 1313 struct privcmd_kernel_ioeventfd *kioeventfd; 1314 1315 /* 1316 * kioreq fields can be accessed here without a lock as they are 1317 * never updated after being added to the ioreq_list. 1318 */ 1319 if (kioreq->uioreq != ioeventfd->ioreq) { 1320 continue; 1321 } else if (kioreq->dom != ioeventfd->dom || 1322 kioreq->vcpus != ioeventfd->vcpus) { 1323 pr_err("Invalid ioeventfd configuration mismatch, dom (%u vs %u), vcpus (%u vs %u)\n", 1324 kioreq->dom, ioeventfd->dom, kioreq->vcpus, 1325 ioeventfd->vcpus); 1326 return ERR_PTR(-EINVAL); 1327 } 1328 1329 /* Look for a duplicate eventfd for the same guest */ 1330 spin_lock_irqsave(&kioreq->lock, flags); 1331 list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) { 1332 if (eventfd == kioeventfd->eventfd) { 1333 spin_unlock_irqrestore(&kioreq->lock, flags); 1334 return ERR_PTR(-EBUSY); 1335 } 1336 } 1337 spin_unlock_irqrestore(&kioreq->lock, flags); 1338 1339 return kioreq; 1340 } 1341 1342 /* Matching kioreq isn't found, allocate a new one */ 1343 return alloc_ioreq(ioeventfd); 1344 } 1345 1346 static void ioeventfd_free(struct privcmd_kernel_ioeventfd *kioeventfd) 1347 { 1348 list_del(&kioeventfd->list); 1349 eventfd_ctx_put(kioeventfd->eventfd); 1350 kfree(kioeventfd); 1351 } 1352 1353 static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd) 1354 { 1355 struct privcmd_kernel_ioeventfd *kioeventfd; 1356 struct privcmd_kernel_ioreq *kioreq; 1357 unsigned long flags; 1358 int ret; 1359 1360 /* Check for range overflow */ 1361 if (ioeventfd->addr + ioeventfd->addr_len < ioeventfd->addr) 1362 return -EINVAL; 1363 1364 /* Vhost requires us to support length 1, 2, 4, and 8 */ 1365 if (!(ioeventfd->addr_len == 1 || ioeventfd->addr_len == 2 || 1366 ioeventfd->addr_len == 4 || ioeventfd->addr_len == 8)) 1367 return -EINVAL; 1368 1369 /* 4096 vcpus limit enough ? */ 1370 if (!ioeventfd->vcpus || ioeventfd->vcpus > 4096) 1371 return -EINVAL; 1372 1373 kioeventfd = kzalloc_obj(*kioeventfd); 1374 if (!kioeventfd) 1375 return -ENOMEM; 1376 1377 kioeventfd->eventfd = eventfd_ctx_fdget(ioeventfd->event_fd); 1378 if (IS_ERR(kioeventfd->eventfd)) { 1379 ret = PTR_ERR(kioeventfd->eventfd); 1380 goto error_kfree; 1381 } 1382 1383 kioeventfd->addr = ioeventfd->addr; 1384 kioeventfd->addr_len = ioeventfd->addr_len; 1385 kioeventfd->vq = ioeventfd->vq; 1386 1387 mutex_lock(&ioreq_lock); 1388 kioreq = get_ioreq(ioeventfd, kioeventfd->eventfd); 1389 if (IS_ERR(kioreq)) { 1390 mutex_unlock(&ioreq_lock); 1391 ret = PTR_ERR(kioreq); 1392 goto error_eventfd; 1393 } 1394 1395 spin_lock_irqsave(&kioreq->lock, flags); 1396 list_add_tail(&kioeventfd->list, &kioreq->ioeventfds); 1397 spin_unlock_irqrestore(&kioreq->lock, flags); 1398 1399 mutex_unlock(&ioreq_lock); 1400 1401 return 0; 1402 1403 error_eventfd: 1404 eventfd_ctx_put(kioeventfd->eventfd); 1405 1406 error_kfree: 1407 kfree(kioeventfd); 1408 return ret; 1409 } 1410 1411 static int privcmd_ioeventfd_deassign(struct privcmd_ioeventfd *ioeventfd) 1412 { 1413 struct privcmd_kernel_ioreq *kioreq, *tkioreq; 1414 struct eventfd_ctx *eventfd; 1415 unsigned long flags; 1416 int ret = 0; 1417 1418 eventfd = eventfd_ctx_fdget(ioeventfd->event_fd); 1419 if (IS_ERR(eventfd)) 1420 return PTR_ERR(eventfd); 1421 1422 mutex_lock(&ioreq_lock); 1423 list_for_each_entry_safe(kioreq, tkioreq, &ioreq_list, list) { 1424 struct privcmd_kernel_ioeventfd *kioeventfd, *tmp; 1425 /* 1426 * kioreq fields can be accessed here without a lock as they are 1427 * never updated after being added to the ioreq_list. 1428 */ 1429 if (kioreq->dom != ioeventfd->dom || 1430 kioreq->uioreq != ioeventfd->ioreq || 1431 kioreq->vcpus != ioeventfd->vcpus) 1432 continue; 1433 1434 spin_lock_irqsave(&kioreq->lock, flags); 1435 list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) { 1436 if (eventfd == kioeventfd->eventfd) { 1437 ioeventfd_free(kioeventfd); 1438 spin_unlock_irqrestore(&kioreq->lock, flags); 1439 1440 if (list_empty(&kioreq->ioeventfds)) 1441 ioreq_free(kioreq); 1442 goto unlock; 1443 } 1444 } 1445 spin_unlock_irqrestore(&kioreq->lock, flags); 1446 break; 1447 } 1448 1449 pr_err("Ioeventfd isn't already assigned, dom: %u, addr: %llu\n", 1450 ioeventfd->dom, ioeventfd->addr); 1451 ret = -ENODEV; 1452 1453 unlock: 1454 mutex_unlock(&ioreq_lock); 1455 eventfd_ctx_put(eventfd); 1456 1457 return ret; 1458 } 1459 1460 static long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata) 1461 { 1462 struct privcmd_data *data = file->private_data; 1463 struct privcmd_ioeventfd ioeventfd; 1464 1465 if (copy_from_user(&ioeventfd, udata, sizeof(ioeventfd))) 1466 return -EFAULT; 1467 1468 /* No other flags should be set */ 1469 if (ioeventfd.flags & ~PRIVCMD_IOEVENTFD_FLAG_DEASSIGN) 1470 return -EINVAL; 1471 1472 /* If restriction is in place, check the domid matches */ 1473 if (data->domid != DOMID_INVALID && data->domid != ioeventfd.dom) 1474 return -EPERM; 1475 1476 if (ioeventfd.flags & PRIVCMD_IOEVENTFD_FLAG_DEASSIGN) 1477 return privcmd_ioeventfd_deassign(&ioeventfd); 1478 1479 return privcmd_ioeventfd_assign(&ioeventfd); 1480 } 1481 1482 static void privcmd_ioeventfd_exit(void) 1483 { 1484 struct privcmd_kernel_ioreq *kioreq, *tmp; 1485 unsigned long flags; 1486 1487 mutex_lock(&ioreq_lock); 1488 list_for_each_entry_safe(kioreq, tmp, &ioreq_list, list) { 1489 struct privcmd_kernel_ioeventfd *kioeventfd, *tmp; 1490 1491 spin_lock_irqsave(&kioreq->lock, flags); 1492 list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) 1493 ioeventfd_free(kioeventfd); 1494 spin_unlock_irqrestore(&kioreq->lock, flags); 1495 1496 ioreq_free(kioreq); 1497 } 1498 mutex_unlock(&ioreq_lock); 1499 } 1500 #else 1501 static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata) 1502 { 1503 return -EOPNOTSUPP; 1504 } 1505 1506 static inline int privcmd_irqfd_init(void) 1507 { 1508 return 0; 1509 } 1510 1511 static inline void privcmd_irqfd_exit(void) 1512 { 1513 } 1514 1515 static inline long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata) 1516 { 1517 return -EOPNOTSUPP; 1518 } 1519 1520 static inline void privcmd_ioeventfd_exit(void) 1521 { 1522 } 1523 #endif /* CONFIG_XEN_PRIVCMD_EVENTFD */ 1524 1525 static long privcmd_ioctl(struct file *file, 1526 unsigned int cmd, unsigned long data) 1527 { 1528 int ret = -ENOTTY; 1529 void __user *udata = (void __user *) data; 1530 1531 switch (cmd) { 1532 case IOCTL_PRIVCMD_HYPERCALL: 1533 ret = privcmd_ioctl_hypercall(file, udata); 1534 break; 1535 1536 case IOCTL_PRIVCMD_MMAP: 1537 ret = privcmd_ioctl_mmap(file, udata); 1538 break; 1539 1540 case IOCTL_PRIVCMD_MMAPBATCH: 1541 ret = privcmd_ioctl_mmap_batch(file, udata, 1); 1542 break; 1543 1544 case IOCTL_PRIVCMD_MMAPBATCH_V2: 1545 ret = privcmd_ioctl_mmap_batch(file, udata, 2); 1546 break; 1547 1548 case IOCTL_PRIVCMD_DM_OP: 1549 ret = privcmd_ioctl_dm_op(file, udata); 1550 break; 1551 1552 case IOCTL_PRIVCMD_RESTRICT: 1553 ret = privcmd_ioctl_restrict(file, udata); 1554 break; 1555 1556 case IOCTL_PRIVCMD_MMAP_RESOURCE: 1557 ret = privcmd_ioctl_mmap_resource(file, udata); 1558 break; 1559 1560 case IOCTL_PRIVCMD_IRQFD: 1561 ret = privcmd_ioctl_irqfd(file, udata); 1562 break; 1563 1564 case IOCTL_PRIVCMD_IOEVENTFD: 1565 ret = privcmd_ioctl_ioeventfd(file, udata); 1566 break; 1567 1568 case IOCTL_PRIVCMD_PCIDEV_GET_GSI: 1569 ret = privcmd_ioctl_pcidev_get_gsi(file, udata); 1570 break; 1571 1572 default: 1573 break; 1574 } 1575 1576 return ret; 1577 } 1578 1579 static int privcmd_open(struct inode *ino, struct file *file) 1580 { 1581 struct privcmd_data *data; 1582 1583 if (wait_event_interruptible(restrict_wait_wq, !restrict_wait) < 0) 1584 return -EINTR; 1585 1586 data = kzalloc_obj(*data); 1587 if (!data) 1588 return -ENOMEM; 1589 1590 data->domid = target_domain; 1591 1592 file->private_data = data; 1593 return 0; 1594 } 1595 1596 static int privcmd_release(struct inode *ino, struct file *file) 1597 { 1598 struct privcmd_data *data = file->private_data; 1599 1600 kfree(data); 1601 return 0; 1602 } 1603 1604 static void privcmd_close(struct vm_area_struct *vma) 1605 { 1606 struct page **pages = vma->vm_private_data; 1607 int numpgs = vma_pages(vma); 1608 int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT; 1609 int rc; 1610 1611 if (xen_pv_domain() || !numpgs || !pages) 1612 return; 1613 1614 rc = xen_unmap_domain_gfn_range(vma, numgfns, pages); 1615 if (rc == 0) 1616 xen_free_unpopulated_pages(numpgs, pages); 1617 else 1618 pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n", 1619 numpgs, rc); 1620 kvfree(pages); 1621 } 1622 1623 static vm_fault_t privcmd_fault(struct vm_fault *vmf) 1624 { 1625 printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", 1626 vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end, 1627 vmf->pgoff, (void *)vmf->address); 1628 1629 return VM_FAULT_SIGBUS; 1630 } 1631 1632 static const struct vm_operations_struct privcmd_vm_ops = { 1633 .close = privcmd_close, 1634 .fault = privcmd_fault 1635 }; 1636 1637 static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) 1638 { 1639 /* DONTCOPY is essential for Xen because copy_page_range doesn't know 1640 * how to recreate these mappings */ 1641 vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY | 1642 VM_DONTEXPAND | VM_DONTDUMP); 1643 vma->vm_ops = &privcmd_vm_ops; 1644 vma->vm_private_data = NULL; 1645 1646 return 0; 1647 } 1648 1649 /* 1650 * For MMAPBATCH*. This allows asserting the singleshot mapping 1651 * on a per pfn/pte basis. Mapping calls that fail with ENOENT 1652 * can be then retried until success. 1653 */ 1654 static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data) 1655 { 1656 return pte_none(ptep_get(pte)) ? 0 : -EBUSY; 1657 } 1658 1659 static int privcmd_vma_range_is_mapped( 1660 struct vm_area_struct *vma, 1661 unsigned long addr, 1662 unsigned long nr_pages) 1663 { 1664 return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT, 1665 is_mapped_fn, NULL) != 0; 1666 } 1667 1668 const struct file_operations xen_privcmd_fops = { 1669 .owner = THIS_MODULE, 1670 .unlocked_ioctl = privcmd_ioctl, 1671 .open = privcmd_open, 1672 .release = privcmd_release, 1673 .mmap = privcmd_mmap, 1674 }; 1675 EXPORT_SYMBOL_GPL(xen_privcmd_fops); 1676 1677 static struct miscdevice privcmd_dev = { 1678 .minor = MISC_DYNAMIC_MINOR, 1679 .name = "xen/privcmd", 1680 .fops = &xen_privcmd_fops, 1681 }; 1682 1683 static int init_restrict(struct notifier_block *notifier, 1684 unsigned long event, 1685 void *data) 1686 { 1687 char *target; 1688 unsigned int domid; 1689 1690 /* Default to an guaranteed unused domain-id. */ 1691 target_domain = DOMID_IDLE; 1692 1693 target = xenbus_read(XBT_NIL, "target", "", NULL); 1694 if (IS_ERR(target) || kstrtouint(target, 10, &domid)) { 1695 pr_err("No target domain found, blocking all hypercalls\n"); 1696 goto out; 1697 } 1698 1699 target_domain = domid; 1700 1701 out: 1702 if (!IS_ERR(target)) 1703 kfree(target); 1704 1705 restrict_wait = false; 1706 wake_up_all(&restrict_wait_wq); 1707 1708 return NOTIFY_DONE; 1709 } 1710 1711 static struct notifier_block xenstore_notifier = { 1712 .notifier_call = init_restrict, 1713 }; 1714 1715 static void __init restrict_driver(void) 1716 { 1717 if (unrestricted) { 1718 if (security_locked_down(LOCKDOWN_XEN_USER_ACTIONS)) 1719 pr_warn("Kernel is locked down, parameter \"unrestricted\" ignored\n"); 1720 else 1721 return; 1722 } 1723 1724 restrict_wait = true; 1725 1726 register_xenstore_notifier(&xenstore_notifier); 1727 } 1728 1729 static int __init privcmd_init(void) 1730 { 1731 int err; 1732 1733 if (!xen_domain()) 1734 return -ENODEV; 1735 1736 if (!xen_initial_domain()) 1737 restrict_driver(); 1738 1739 err = misc_register(&privcmd_dev); 1740 if (err != 0) { 1741 pr_err("Could not register Xen privcmd device\n"); 1742 return err; 1743 } 1744 1745 err = misc_register(&xen_privcmdbuf_dev); 1746 if (err != 0) { 1747 pr_err("Could not register Xen hypercall-buf device\n"); 1748 goto err_privcmdbuf; 1749 } 1750 1751 err = privcmd_irqfd_init(); 1752 if (err != 0) { 1753 pr_err("irqfd init failed\n"); 1754 goto err_irqfd; 1755 } 1756 1757 return 0; 1758 1759 err_irqfd: 1760 misc_deregister(&xen_privcmdbuf_dev); 1761 err_privcmdbuf: 1762 misc_deregister(&privcmd_dev); 1763 return err; 1764 } 1765 1766 static void __exit privcmd_exit(void) 1767 { 1768 privcmd_ioeventfd_exit(); 1769 privcmd_irqfd_exit(); 1770 misc_deregister(&privcmd_dev); 1771 misc_deregister(&xen_privcmdbuf_dev); 1772 } 1773 1774 module_init(privcmd_init); 1775 module_exit(privcmd_exit); 1776