// SPDX-License-Identifier: GPL-2.0-only
/******************************************************************************
 * privcmd.c
 *
 * Interface to privileged domain-0 commands.
 *
 * Copyright (c) 2002-2004, K A Fraser, B Dragovic
 */

#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/string.h>
#include <linux/workqueue.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/uaccess.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/miscdevice.h>
#include <linux/moduleparam.h>
#include <linux/virtio_mmio.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/events.h>
#include <xen/privcmd.h>
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>
#include <xen/interface/hvm/dm_op.h>
#include <xen/interface/hvm/ioreq.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
#include <xen/balloon.h>
#ifdef CONFIG_XEN_ACPI
#include <xen/acpi.h>
#endif

#include "privcmd.h"

MODULE_DESCRIPTION("Xen hypercall passthrough driver");
MODULE_LICENSE("GPL");

#define PRIV_VMA_LOCKED ((void *)1)

static unsigned int privcmd_dm_op_max_num = 16;
module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644);
MODULE_PARM_DESC(dm_op_max_nr_bufs,
		 "Maximum number of buffers per dm_op hypercall");

static unsigned int privcmd_dm_op_buf_max_size = 4096;
module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint,
		   0644);
MODULE_PARM_DESC(dm_op_buf_max_size,
		 "Maximum size of a dm_op hypercall buffer");

struct privcmd_data {
	domid_t domid;
};

static int privcmd_vma_range_is_mapped(
	       struct vm_area_struct *vma,
	       unsigned long addr,
	       unsigned long nr_pages);

static long privcmd_ioctl_hypercall(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_hypercall hypercall;
	long ret;

	/* Disallow arbitrary hypercalls if restricted */
	if (data->domid != DOMID_INVALID)
		return -EPERM;

	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
		return -EFAULT;

	xen_preemptible_hcall_begin();
	ret = privcmd_call(hypercall.op,
			   hypercall.arg[0], hypercall.arg[1],
			   hypercall.arg[2], hypercall.arg[3],
			   hypercall.arg[4]);
	xen_preemptible_hcall_end();

	return ret;
}

static void free_page_list(struct list_head *pages)
{
	struct page *p, *n;

	list_for_each_entry_safe(p, n, pages, lru)
		__free_page(p);

	INIT_LIST_HEAD(pages);
}

/*
 * Given an array of items in userspace, return a list of pages
 * containing the data.  If copying fails, either because of memory
 * allocation failure or a problem reading user memory, return an
 * error code; it's up to the caller to dispose of any partial list.
 */
static int gather_array(struct list_head *pagelist,
			unsigned nelem, size_t size,
			const void __user *data)
{
	unsigned pageidx;
	void *pagedata;
	int ret;

	if (size > PAGE_SIZE)
		return 0;

	pageidx = PAGE_SIZE;
	pagedata = NULL;	/* quiet, gcc */
	while (nelem--) {
		if (pageidx > PAGE_SIZE-size) {
			struct page *page = alloc_page(GFP_KERNEL);

			ret = -ENOMEM;
			if (page == NULL)
				goto fail;

			pagedata = page_address(page);

			list_add_tail(&page->lru, pagelist);
			pageidx = 0;
		}

		ret = -EFAULT;
		if (copy_from_user(pagedata + pageidx, data, size))
			goto fail;

		data += size;
		pageidx += size;
	}

	ret = 0;

fail:
	return ret;
}

/*
 * Call function "fn" on each element of the array fragmented
 * over a list of pages.
 */
static int traverse_pages(unsigned nelem, size_t size,
			  struct list_head *pos,
			  int (*fn)(void *data, void *state),
			  void *state)
{
	void *pagedata;
	unsigned pageidx;
	int ret = 0;

	BUG_ON(size > PAGE_SIZE);

	pageidx = PAGE_SIZE;
	pagedata = NULL;	/* hush, gcc */

	while (nelem--) {
		if (pageidx > PAGE_SIZE-size) {
			struct page *page;
			pos = pos->next;
			page = list_entry(pos, struct page, lru);
			pagedata = page_address(page);
			pageidx = 0;
		}

		ret = (*fn)(pagedata + pageidx, state);
		if (ret)
			break;
		pageidx += size;
	}

	return ret;
}

/*
 * Similar to traverse_pages, but use each page as a "block" of
 * data to be processed as one unit.
 */
static int traverse_pages_block(unsigned nelem, size_t size,
				struct list_head *pos,
				int (*fn)(void *data, int nr, void *state),
				void *state)
{
	void *pagedata;
	int ret = 0;

	BUG_ON(size > PAGE_SIZE);

	while (nelem) {
		int nr = (PAGE_SIZE/size);
		struct page *page;
		if (nr > nelem)
			nr = nelem;
		pos = pos->next;
		page = list_entry(pos, struct page, lru);
		pagedata = page_address(page);
		ret = (*fn)(pagedata, nr, state);
		if (ret)
			break;
		nelem -= nr;
	}

	return ret;
}

struct mmap_gfn_state {
	unsigned long va;
	struct vm_area_struct *vma;
	domid_t domain;
};

static int mmap_gfn_range(void *data, void *state)
{
	struct privcmd_mmap_entry *msg = data;
	struct mmap_gfn_state *st = state;
	struct vm_area_struct *vma = st->vma;
	int rc;

	/* Do not allow range to wrap the address space. */
	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
		return -EINVAL;

	/* Range chunks must be contiguous in va space. */
	if ((msg->va != st->va) ||
	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
		return -EINVAL;

	rc = xen_remap_domain_gfn_range(vma,
					msg->va & PAGE_MASK,
					msg->mfn, msg->npages,
					vma->vm_page_prot,
					st->domain, NULL);
	if (rc < 0)
		return rc;

	st->va += msg->npages << PAGE_SHIFT;

	return 0;
}

static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_mmap mmapcmd;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int rc;
	LIST_HEAD(pagelist);
	struct mmap_gfn_state state;

	/*
	 * Only privcmd_ioctl_mmap_batch is supported for auto-translated
	 * domains.
	 */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return -ENOSYS;

	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom)
		return -EPERM;

	rc = gather_array(&pagelist,
			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
			  mmapcmd.entry);

	if (rc || list_empty(&pagelist))
		goto out;

	mmap_write_lock(mm);

	{
		struct page *page = list_first_entry(&pagelist,
						     struct page, lru);
		struct privcmd_mmap_entry *msg = page_address(page);

		vma = vma_lookup(mm, msg->va);
		rc = -EINVAL;

		if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data)
			goto out_up;
		vma->vm_private_data = PRIV_VMA_LOCKED;
	}

	state.va = vma->vm_start;
	state.vma = vma;
	state.domain = mmapcmd.dom;

	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
			    &pagelist,
			    mmap_gfn_range, &state);


out_up:
	mmap_write_unlock(mm);

out:
	free_page_list(&pagelist);

	return rc;
}

struct mmap_batch_state {
	domid_t domain;
	unsigned long va;
	struct vm_area_struct *vma;
	int index;
	/* A tristate:
	 *      0 for no errors
	 *      1 if at least one error has happened (and no
	 *          -ENOENT errors have happened)
	 *      -ENOENT if at least 1 -ENOENT has happened.
	 */
	int global_error;
	int version;

	/* User-space gfn array to store errors in the second pass for V1. */
	xen_pfn_t __user *user_gfn;
	/* User-space int array to store errors in the second pass for V2. */
	int __user *user_err;
};

/* auto translated dom0 note: if domU being created is PV, then gfn is
 * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP).
 */
static int mmap_batch_fn(void *data, int nr, void *state)
{
	xen_pfn_t *gfnp = data;
	struct mmap_batch_state *st = state;
	struct vm_area_struct *vma = st->vma;
	struct page **pages = vma->vm_private_data;
	struct page **cur_pages = NULL;
	int ret;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		cur_pages = &pages[st->index];

	BUG_ON(nr < 0);
	ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr,
					 (int *)gfnp, st->vma->vm_page_prot,
					 st->domain, cur_pages);

	/* Adjust the global_error? */
	if (ret != nr) {
		if (ret == -ENOENT)
			st->global_error = -ENOENT;
		else {
			/* Record that at least one error has happened. */
			if (st->global_error == 0)
				st->global_error = 1;
		}
	}
	st->va += XEN_PAGE_SIZE * nr;
	st->index += nr / XEN_PFN_PER_PAGE;

	return 0;
}

static int mmap_return_error(int err, struct mmap_batch_state *st)
{
	int ret;

	if (st->version == 1) {
		if (err) {
			xen_pfn_t gfn;

			ret = get_user(gfn, st->user_gfn);
			if (ret < 0)
				return ret;
			/*
			 * V1 encodes the error codes in the 32bit top
			 * nibble of the gfn (with its known
			 * limitations vis-a-vis 64 bit callers).
			 */
			gfn |= (err == -ENOENT) ?
				PRIVCMD_MMAPBATCH_PAGED_ERROR :
				PRIVCMD_MMAPBATCH_MFN_ERROR;
			return __put_user(gfn, st->user_gfn++);
		} else
			st->user_gfn++;
	} else { /* st->version == 2 */
		if (err)
			return __put_user(err, st->user_err++);
		else
			st->user_err++;
	}

	return 0;
}

static int mmap_return_errors(void *data, int nr, void *state)
{
	struct mmap_batch_state *st = state;
	int *errs = data;
	int i;
	int ret;

	for (i = 0; i < nr; i++) {
		ret = mmap_return_error(errs[i], st);
		if (ret < 0)
			return ret;
	}
	return 0;
}

/* Allocate pfns that are then mapped with gfns from foreign domid. Update
 * the vma with the page info to use later.
 * Returns: 0 if success, otherwise -errno
 */
static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
{
	int rc;
	struct page **pages;

	pages = kvcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL);
	if (pages == NULL)
		return -ENOMEM;

	rc = xen_alloc_unpopulated_pages(numpgs, pages);
	if (rc != 0) {
		pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__,
			numpgs, rc);
		kvfree(pages);
		return -ENOMEM;
	}
	BUG_ON(vma->vm_private_data != NULL);
	vma->vm_private_data = pages;

	return 0;
}

static const struct vm_operations_struct privcmd_vm_ops;

static long privcmd_ioctl_mmap_batch(
	struct file *file, void __user *udata, int version)
{
	struct privcmd_data *data = file->private_data;
	int ret;
	struct privcmd_mmapbatch_v2 m;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long nr_pages;
	LIST_HEAD(pagelist);
	struct mmap_batch_state state;

	switch (version) {
	case 1:
		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
			return -EFAULT;
		/* Returns per-frame error in m.arr. */
		m.err = NULL;
		if (!access_ok(m.arr, m.num * sizeof(*m.arr)))
			return -EFAULT;
		break;
	case 2:
		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2)))
			return -EFAULT;
		/* Returns per-frame error code in m.err. */
		if (!access_ok(m.err, m.num * (sizeof(*m.err))))
			return -EFAULT;
		break;
	default:
		return -EINVAL;
	}

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != m.dom)
		return -EPERM;

	nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE);
	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
		return -EINVAL;

	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr);

	if (ret)
		goto out;
	if (list_empty(&pagelist)) {
		ret = -EINVAL;
		goto out;
	}

	if (version == 2) {
		/* Zero error array now to only copy back actual errors. */
		if (clear_user(m.err, sizeof(int) * m.num)) {
			ret = -EFAULT;
			goto out;
		}
	}

	mmap_write_lock(mm);

	vma = find_vma(mm, m.addr);
	if (!vma ||
	    vma->vm_ops != &privcmd_vm_ops) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/*
	 * Caller must either:
	 *
	 * Map the whole VMA range, which will also allocate all the
	 * pages required for the auto_translated_physmap case.
	 *
	 * Or
	 *
	 * Map unmapped holes left from a previous map attempt (e.g.,
	 * because those foreign frames were previously paged out).
	 */
	if (vma->vm_private_data == NULL) {
		if (m.addr != vma->vm_start ||
		    m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (xen_feature(XENFEAT_auto_translated_physmap)) {
			ret = alloc_empty_pages(vma, nr_pages);
			if (ret < 0)
				goto out_unlock;
		} else
			vma->vm_private_data = PRIV_VMA_LOCKED;
	} else {
		if (m.addr < vma->vm_start ||
		    m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) {
			ret = -EINVAL;
			goto out_unlock;
		}
	}

	state.domain = m.dom;
	state.vma = vma;
	state.va = m.addr;
	state.index = 0;
	state.global_error = 0;
	state.version = version;

	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
	/* mmap_batch_fn guarantees ret == 0 */
	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
				    &pagelist, mmap_batch_fn, &state));

	mmap_write_unlock(mm);

	if (state.global_error) {
		/* Write back errors in second pass. */
		state.user_gfn = (xen_pfn_t *)m.arr;
		state.user_err = m.err;
		ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
					   &pagelist, mmap_return_errors, &state);
	} else
		ret = 0;

	/* If we have not had any EFAULT-like global errors then set the global
	 * error to -ENOENT if necessary. */
	if ((ret == 0) && (state.global_error == -ENOENT))
		ret = -ENOENT;

out:
	free_page_list(&pagelist);
	return ret;

out_unlock:
	mmap_write_unlock(mm);
	goto out;
}

static int lock_pages(
	struct privcmd_dm_op_buf kbufs[], unsigned int num,
	struct page *pages[], unsigned int nr_pages, unsigned int *pinned)
{
	unsigned int i, off = 0;

	for (i = 0; i < num; ) {
		unsigned int requested;
		int page_count;

		requested = DIV_ROUND_UP(
			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
			PAGE_SIZE) - off;
		if (requested > nr_pages)
			return -ENOSPC;

		page_count = pin_user_pages_fast(
			(unsigned long)kbufs[i].uptr + off * PAGE_SIZE,
			requested, FOLL_WRITE, pages);
		if (page_count <= 0)
			return page_count ? : -EFAULT;

		*pinned += page_count;
		nr_pages -= page_count;
		pages += page_count;

		off = (requested == page_count) ?
			0 : off + page_count;
		i += !off;
	}

	return 0;
}

static void unlock_pages(struct page *pages[], unsigned int nr_pages)
{
	unpin_user_pages_dirty_lock(pages, nr_pages, true);
}

static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_dm_op kdata;
	struct privcmd_dm_op_buf *kbufs;
	unsigned int nr_pages = 0;
	struct page **pages = NULL;
	struct xen_dm_op_buf *xbufs = NULL;
	unsigned int i;
	long rc;
	unsigned int pinned = 0;

	if (copy_from_user(&kdata, udata, sizeof(kdata)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
		return -EPERM;

	if (kdata.num == 0)
		return 0;

	if (kdata.num > privcmd_dm_op_max_num)
		return -E2BIG;

	kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL);
	if (!kbufs)
		return -ENOMEM;

	if (copy_from_user(kbufs, kdata.ubufs,
			   sizeof(*kbufs) * kdata.num)) {
		rc = -EFAULT;
		goto out;
	}

	for (i = 0; i < kdata.num; i++) {
		if (kbufs[i].size > privcmd_dm_op_buf_max_size) {
			rc = -E2BIG;
			goto out;
		}

		if (!access_ok(kbufs[i].uptr,
			       kbufs[i].size)) {
			rc = -EFAULT;
			goto out;
		}

		nr_pages += DIV_ROUND_UP(
			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
			PAGE_SIZE);
	}

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages) {
		rc = -ENOMEM;
		goto out;
	}

	xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL);
	if (!xbufs) {
		rc = -ENOMEM;
		goto out;
	}

	rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
	if (rc < 0)
		goto out;

	for (i = 0; i < kdata.num; i++) {
		set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
		xbufs[i].size = kbufs[i].size;
	}

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs);
	xen_preemptible_hcall_end();

out:
	unlock_pages(pages, pinned);
	kfree(xbufs);
	kfree(pages);
	kfree(kbufs);

	return rc;
}

static long privcmd_ioctl_restrict(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	domid_t dom;

	if (copy_from_user(&dom, udata, sizeof(dom)))
		return -EFAULT;

	/* Set restriction to the specified domain, or check it matches */
	if (data->domid == DOMID_INVALID)
		data->domid = dom;
	else if (data->domid != dom)
		return -EINVAL;

	return 0;
}

static long privcmd_ioctl_mmap_resource(struct file *file,
					struct privcmd_mmap_resource __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct privcmd_mmap_resource kdata;
	xen_pfn_t *pfns = NULL;
	struct xen_mem_acquire_resource xdata = { };
	int rc;

	if (copy_from_user(&kdata, udata, sizeof(kdata)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
		return -EPERM;

	/* Both fields must be set or unset */
	if (!!kdata.addr != !!kdata.num)
		return -EINVAL;

	xdata.domid = kdata.dom;
	xdata.type = kdata.type;
	xdata.id = kdata.id;

	if (!kdata.addr && !kdata.num) {
		/* Query the size of the resource. */
		rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
		if (rc)
			return rc;
		return __put_user(xdata.nr_frames, &udata->num);
	}

	mmap_write_lock(mm);

	vma = find_vma(mm, kdata.addr);
	if (!vma || vma->vm_ops != &privcmd_vm_ops) {
		rc = -EINVAL;
		goto out;
	}

	pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL | __GFP_NOWARN);
	if (!pfns) {
		rc = -ENOMEM;
		goto out;
	}

	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
	    xen_feature(XENFEAT_auto_translated_physmap)) {
		unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE);
		struct page **pages;
		unsigned int i;

		rc = alloc_empty_pages(vma, nr);
		if (rc < 0)
			goto out;

		pages = vma->vm_private_data;

		for (i = 0; i < kdata.num; i++) {
			xen_pfn_t pfn =
				page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);

			pfns[i] = pfn + (i % XEN_PFN_PER_PAGE);
		}
	} else
		vma->vm_private_data = PRIV_VMA_LOCKED;

	xdata.frame = kdata.idx;
	xdata.nr_frames = kdata.num;
	set_xen_guest_handle(xdata.frame_list, pfns);

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
	xen_preemptible_hcall_end();

	if (rc)
		goto out;

	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
	    xen_feature(XENFEAT_auto_translated_physmap)) {
		rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT);
	} else {
		unsigned int domid =
			(xdata.flags & XENMEM_rsrc_acq_caller_owned) ?
			DOMID_SELF : kdata.dom;
		int num, *errs = (int *)pfns;

		BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns));
		num = xen_remap_domain_mfn_array(vma,
						 kdata.addr & PAGE_MASK,
						 pfns, kdata.num, errs,
						 vma->vm_page_prot,
						 domid);
		if (num < 0)
			rc = num;
		else if (num != kdata.num) {
			unsigned int i;

			for (i = 0; i < num; i++) {
				rc = errs[i];
				if (rc < 0)
					break;
			}
		} else
			rc = 0;
	}

out:
	mmap_write_unlock(mm);
	kfree(pfns);

	return rc;
}

static long privcmd_ioctl_pcidev_get_gsi(struct file *file, void __user *udata)
{
#if defined(CONFIG_XEN_ACPI)
	int rc = -EINVAL;
	struct privcmd_pcidev_get_gsi kdata;

	if (copy_from_user(&kdata, udata, sizeof(kdata)))
		return -EFAULT;

	if (IS_REACHABLE(CONFIG_XEN_PCIDEV_BACKEND))
		rc = pcistub_get_gsi_from_sbdf(kdata.sbdf);

	if (rc < 0)
		return rc;

	kdata.gsi = rc;
	if (copy_to_user(udata, &kdata, sizeof(kdata)))
		return -EFAULT;

	return 0;
#else
	return -EINVAL;
#endif
}

#ifdef CONFIG_XEN_PRIVCMD_EVENTFD
/* Irqfd support */
static struct workqueue_struct *irqfd_cleanup_wq;
static DEFINE_SPINLOCK(irqfds_lock);
DEFINE_STATIC_SRCU(irqfds_srcu);
static LIST_HEAD(irqfds_list);

struct privcmd_kernel_irqfd {
	struct xen_dm_op_buf xbufs;
	domid_t dom;
	bool error;
	struct eventfd_ctx *eventfd;
	struct work_struct shutdown;
	wait_queue_entry_t wait;
	struct list_head list;
	poll_table pt;
};

static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
{
	lockdep_assert_held(&irqfds_lock);

	list_del_init(&kirqfd->list);
	queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
}

static void irqfd_shutdown(struct work_struct *work)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(work, struct privcmd_kernel_irqfd, shutdown);
	u64 cnt;

	/* Make sure irqfd has been initialized in assign path */
	synchronize_srcu(&irqfds_srcu);

	eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
	eventfd_ctx_put(kirqfd->eventfd);
	kfree(kirqfd);
}

static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
{
	u64 cnt;
	long rc;

	eventfd_ctx_do_read(kirqfd->eventfd, &cnt);

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs);
	xen_preemptible_hcall_end();

	/* Don't repeat the error message for consecutive failures */
	if (rc && !kirqfd->error) {
		pr_err("Failed to configure irq for guest domain: %d\n",
		       kirqfd->dom);
	}

	kirqfd->error = rc;
}

static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(wait, struct privcmd_kernel_irqfd, wait);
	__poll_t flags = key_to_poll(key);

	if (flags & EPOLLIN)
		irqfd_inject(kirqfd);

	if (flags & EPOLLHUP) {
		unsigned long flags;

		spin_lock_irqsave(&irqfds_lock, flags);
		irqfd_deactivate(kirqfd);
		spin_unlock_irqrestore(&irqfds_lock, flags);
	}

	return 0;
}

static void
irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(pt, struct privcmd_kernel_irqfd, pt);

	add_wait_queue_priority(wqh, &kirqfd->wait);
}

static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
{
	struct privcmd_kernel_irqfd *kirqfd, *tmp;
	unsigned long flags;
	__poll_t events;
	struct fd f;
	void *dm_op;
	int ret, idx;

	kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL);
	if (!kirqfd)
		return -ENOMEM;
	dm_op = kirqfd + 1;

	if (copy_from_user(dm_op, u64_to_user_ptr(irqfd->dm_op), irqfd->size)) {
		ret = -EFAULT;
		goto error_kfree;
	}

	kirqfd->xbufs.size = irqfd->size;
	set_xen_guest_handle(kirqfd->xbufs.h, dm_op);
	kirqfd->dom = irqfd->dom;
	INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);

	f = fdget(irqfd->fd);
	if (!fd_file(f)) {
		ret = -EBADF;
		goto error_kfree;
	}

	kirqfd->eventfd = eventfd_ctx_fileget(fd_file(f));
	if (IS_ERR(kirqfd->eventfd)) {
		ret = PTR_ERR(kirqfd->eventfd);
		goto error_fd_put;
	}

	/*
	 * Install our own custom wake-up handling so we are notified via a
	 * callback whenever someone signals the underlying eventfd.
	 */
	init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
	init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry(tmp, &irqfds_list, list) {
		if (kirqfd->eventfd == tmp->eventfd) {
			ret = -EBUSY;
			spin_unlock_irqrestore(&irqfds_lock, flags);
			goto error_eventfd;
		}
	}

	idx = srcu_read_lock(&irqfds_srcu);
	list_add_tail(&kirqfd->list, &irqfds_list);
	spin_unlock_irqrestore(&irqfds_lock, flags);

	/*
	 * Check if there was an event already pending on the eventfd before we
	 * registered, and trigger it as if we didn't miss it.
	 */
	events = vfs_poll(fd_file(f), &kirqfd->pt);
	if (events & EPOLLIN)
		irqfd_inject(kirqfd);

	srcu_read_unlock(&irqfds_srcu, idx);

	/*
	 * Do not drop the file until the kirqfd is fully initialized, otherwise
	 * we might race against the EPOLLHUP.
	 */
	fdput(f);
	return 0;

error_eventfd:
	eventfd_ctx_put(kirqfd->eventfd);

error_fd_put:
	fdput(f);

error_kfree:
	kfree(kirqfd);
	return ret;
}

static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
{
	struct privcmd_kernel_irqfd *kirqfd;
	struct eventfd_ctx *eventfd;
	unsigned long flags;

	eventfd = eventfd_ctx_fdget(irqfd->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry(kirqfd, &irqfds_list, list) {
		if (kirqfd->eventfd == eventfd) {
			irqfd_deactivate(kirqfd);
			break;
		}
	}

	spin_unlock_irqrestore(&irqfds_lock, flags);

	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed so
	 * that we guarantee there will not be any more interrupts once this
	 * deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_irqfd irqfd;

	if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
		return -EFAULT;

	/* No other flags should be set */
	if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
		return -EINVAL;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
		return -EPERM;

	if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
		return privcmd_irqfd_deassign(&irqfd);

	return privcmd_irqfd_assign(&irqfd);
}

static int privcmd_irqfd_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

static void privcmd_irqfd_exit(void)
{
	struct privcmd_kernel_irqfd *kirqfd, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
		irqfd_deactivate(kirqfd);

	spin_unlock_irqrestore(&irqfds_lock, flags);

	destroy_workqueue(irqfd_cleanup_wq);
}

/* Ioeventfd Support */
#define QUEUE_NOTIFY_VQ_MASK 0xFFFF

static DEFINE_MUTEX(ioreq_lock);
static LIST_HEAD(ioreq_list);

/* per-eventfd structure */
struct privcmd_kernel_ioeventfd {
	struct eventfd_ctx *eventfd;
	struct list_head list;
	u64 addr;
	unsigned int addr_len;
	unsigned int vq;
};

/* per-guest CPU / port structure */
struct ioreq_port {
	int vcpu;
	unsigned int port;
	struct privcmd_kernel_ioreq *kioreq;
};

/* per-guest structure */
struct privcmd_kernel_ioreq {
	domid_t dom;
	unsigned int vcpus;
	u64 uioreq;
	struct ioreq *ioreq;
	spinlock_t lock; /* Protects ioeventfds list */
	struct list_head ioeventfds;
	struct list_head list;
	struct ioreq_port ports[] __counted_by(vcpus);
};

static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id)
{
	struct ioreq_port *port = dev_id;
	struct privcmd_kernel_ioreq *kioreq = port->kioreq;
	struct ioreq *ioreq = &kioreq->ioreq[port->vcpu];
	struct privcmd_kernel_ioeventfd *kioeventfd;
	unsigned int state = STATE_IOREQ_READY;

	if (ioreq->state != STATE_IOREQ_READY ||
	    ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE)
		return IRQ_NONE;

	/*
	 * We need a barrier, smp_mb(), here to ensure reads are finished before
	 * `state` is updated. Since the lock implementation ensures that an
	 * appropriate barrier will be added anyway, we can avoid adding an
	 * explicit barrier here.
	 *
	 * Ideally we don't need to update `state` within the locks, but we do
	 * that here to avoid adding an explicit barrier.
	 */

	spin_lock(&kioreq->lock);
	ioreq->state = STATE_IOREQ_INPROCESS;

	list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
		if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY &&
		    ioreq->size == kioeventfd->addr_len &&
		    (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) {
			eventfd_signal(kioeventfd->eventfd);
			state = STATE_IORESP_READY;
			break;
		}
	}
	spin_unlock(&kioreq->lock);

	/*
	 * We need a barrier, smp_mb(), here to ensure writes are finished
	 * before `state` is updated. Since the lock implementation ensures that
	 * an appropriate barrier will be added anyway, we can avoid adding an
	 * explicit barrier here.
	 */

	ioreq->state = state;

	if (state == STATE_IORESP_READY) {
		notify_remote_via_evtchn(port->port);
		return IRQ_HANDLED;
	}

	return IRQ_NONE;
}

static void ioreq_free(struct privcmd_kernel_ioreq *kioreq)
{
	struct ioreq_port *ports = kioreq->ports;
	int i;

	lockdep_assert_held(&ioreq_lock);

	list_del(&kioreq->list);

	for (i = kioreq->vcpus - 1; i >= 0; i--)
		unbind_from_irqhandler(irq_from_evtchn(ports[i].port), &ports[i]);

	kfree(kioreq);
}

static
struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd)
{
	struct privcmd_kernel_ioreq *kioreq;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct page **pages;
	unsigned int *ports;
	int ret, size, i;

	lockdep_assert_held(&ioreq_lock);

	size = struct_size(kioreq, ports, ioeventfd->vcpus);
	kioreq = kzalloc(size, GFP_KERNEL);
	if (!kioreq)
		return ERR_PTR(-ENOMEM);

	kioreq->dom = ioeventfd->dom;
	kioreq->vcpus = ioeventfd->vcpus;
	kioreq->uioreq = ioeventfd->ioreq;
	spin_lock_init(&kioreq->lock);
	INIT_LIST_HEAD(&kioreq->ioeventfds);

	/* The memory for ioreq server must have been mapped earlier */
	mmap_write_lock(mm);
	vma = find_vma(mm, (unsigned long)ioeventfd->ioreq);
	if (!vma) {
		pr_err("Failed to find vma for ioreq page!\n");
		mmap_write_unlock(mm);
		ret = -EFAULT;
		goto error_kfree;
	}

	pages = vma->vm_private_data;
	kioreq->ioreq = (struct ioreq *)(page_to_virt(pages[0]));
	mmap_write_unlock(mm);

	ports = memdup_array_user(u64_to_user_ptr(ioeventfd->ports),
				  kioreq->vcpus, sizeof(*ports));
	if (IS_ERR(ports)) {
		ret = PTR_ERR(ports);
		goto error_kfree;
	}

	for (i = 0; i < kioreq->vcpus; i++) {
		kioreq->ports[i].vcpu = i;
		kioreq->ports[i].port = ports[i];
		kioreq->ports[i].kioreq = kioreq;

		ret = bind_evtchn_to_irqhandler_lateeoi(ports[i],
				ioeventfd_interrupt, IRQF_SHARED, "ioeventfd",
				&kioreq->ports[i]);
		if (ret < 0)
			goto error_unbind;
	}

	kfree(ports);

	list_add_tail(&kioreq->list, &ioreq_list);

	return kioreq;

error_unbind:
	while (--i >= 0)
		unbind_from_irqhandler(irq_from_evtchn(ports[i]), &kioreq->ports[i]);

	kfree(ports);
error_kfree:
	kfree(kioreq);
	return ERR_PTR(ret);
}

static struct privcmd_kernel_ioreq *
get_ioreq(struct privcmd_ioeventfd *ioeventfd, struct eventfd_ctx *eventfd)
{
	struct privcmd_kernel_ioreq *kioreq;
	unsigned long flags;

	list_for_each_entry(kioreq, &ioreq_list, list) {
		struct privcmd_kernel_ioeventfd *kioeventfd;

		/*
		 * kioreq fields can be accessed here without a lock as they are
		 * never updated after being added to the ioreq_list.
		 */
		if (kioreq->uioreq != ioeventfd->ioreq) {
			continue;
		} else if (kioreq->dom != ioeventfd->dom ||
			   kioreq->vcpus != ioeventfd->vcpus) {
			pr_err("Invalid ioeventfd configuration mismatch, dom (%u vs %u), vcpus (%u vs %u)\n",
			       kioreq->dom, ioeventfd->dom, kioreq->vcpus,
			       ioeventfd->vcpus);
			return ERR_PTR(-EINVAL);
		}

		/* Look for a duplicate eventfd for the same guest */
		spin_lock_irqsave(&kioreq->lock, flags);
		list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
			if (eventfd == kioeventfd->eventfd) {
				spin_unlock_irqrestore(&kioreq->lock, flags);
				return ERR_PTR(-EBUSY);
			}
		}
		spin_unlock_irqrestore(&kioreq->lock, flags);

		return kioreq;
	}

	/* No matching kioreq was found, allocate a new one */
	return alloc_ioreq(ioeventfd);
}

static void ioeventfd_free(struct privcmd_kernel_ioeventfd *kioeventfd)
{
	list_del(&kioeventfd->list);
	eventfd_ctx_put(kioeventfd->eventfd);
	kfree(kioeventfd);
}

static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd)
{
	struct privcmd_kernel_ioeventfd *kioeventfd;
	struct privcmd_kernel_ioreq *kioreq;
	unsigned long flags;
	struct fd f;
	int ret;

	/* Check for range overflow */
	if (ioeventfd->addr + ioeventfd->addr_len < ioeventfd->addr)
		return -EINVAL;

	/* Vhost requires us to support length 1, 2, 4, and 8 */
	if (!(ioeventfd->addr_len == 1 || ioeventfd->addr_len == 2 ||
	      ioeventfd->addr_len == 4 || ioeventfd->addr_len == 8))
		return -EINVAL;

	/* Is a limit of 4096 vcpus enough? */
	if (!ioeventfd->vcpus || ioeventfd->vcpus > 4096)
		return -EINVAL;

	kioeventfd = kzalloc(sizeof(*kioeventfd), GFP_KERNEL);
	if (!kioeventfd)
		return -ENOMEM;

	f = fdget(ioeventfd->event_fd);
	if (!fd_file(f)) {
		ret = -EBADF;
		goto error_kfree;
	}

	kioeventfd->eventfd = eventfd_ctx_fileget(fd_file(f));
	fdput(f);

	if (IS_ERR(kioeventfd->eventfd)) {
		ret = PTR_ERR(kioeventfd->eventfd);
		goto error_kfree;
	}

	kioeventfd->addr = ioeventfd->addr;
	kioeventfd->addr_len = ioeventfd->addr_len;
	kioeventfd->vq = ioeventfd->vq;

	mutex_lock(&ioreq_lock);
	kioreq = get_ioreq(ioeventfd, kioeventfd->eventfd);
	if (IS_ERR(kioreq)) {
		mutex_unlock(&ioreq_lock);
		ret = PTR_ERR(kioreq);
		goto error_eventfd;
	}

	spin_lock_irqsave(&kioreq->lock, flags);
	list_add_tail(&kioeventfd->list, &kioreq->ioeventfds);
	spin_unlock_irqrestore(&kioreq->lock, flags);

	mutex_unlock(&ioreq_lock);

	return 0;

error_eventfd:
	eventfd_ctx_put(kioeventfd->eventfd);

error_kfree:
	kfree(kioeventfd);
	return ret;
}

static int privcmd_ioeventfd_deassign(struct privcmd_ioeventfd *ioeventfd)
{
	struct privcmd_kernel_ioreq *kioreq, *tkioreq;
	struct eventfd_ctx *eventfd;
	unsigned long flags;
	int ret = 0;

	eventfd = eventfd_ctx_fdget(ioeventfd->event_fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	mutex_lock(&ioreq_lock);
	list_for_each_entry_safe(kioreq, tkioreq, &ioreq_list, list) {
		struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;
		/*
		 * kioreq fields can be accessed here without a lock as they are
		 * never updated after being added to the ioreq_list.
		 */
		if (kioreq->dom != ioeventfd->dom ||
		    kioreq->uioreq != ioeventfd->ioreq ||
		    kioreq->vcpus != ioeventfd->vcpus)
			continue;

		spin_lock_irqsave(&kioreq->lock, flags);
		list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) {
			if (eventfd == kioeventfd->eventfd) {
				ioeventfd_free(kioeventfd);
				spin_unlock_irqrestore(&kioreq->lock, flags);

				if (list_empty(&kioreq->ioeventfds))
					ioreq_free(kioreq);
				goto unlock;
			}
		}
		spin_unlock_irqrestore(&kioreq->lock, flags);
		break;
	}

	pr_err("Ioeventfd isn't already assigned, dom: %u, addr: %llu\n",
	       ioeventfd->dom, ioeventfd->addr);
	ret = -ENODEV;

unlock:
	mutex_unlock(&ioreq_lock);
	eventfd_ctx_put(eventfd);

	return ret;
}

static long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_ioeventfd ioeventfd;

	if (copy_from_user(&ioeventfd, udata, sizeof(ioeventfd)))
		return -EFAULT;

	/* No other flags should be set */
	if (ioeventfd.flags & ~PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
		return -EINVAL;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != ioeventfd.dom)
		return -EPERM;

	if (ioeventfd.flags & PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
		return privcmd_ioeventfd_deassign(&ioeventfd);

	return privcmd_ioeventfd_assign(&ioeventfd);
}

static void privcmd_ioeventfd_exit(void)
{
	struct privcmd_kernel_ioreq *kioreq, *tmp;
	unsigned long flags;

	mutex_lock(&ioreq_lock);
	list_for_each_entry_safe(kioreq, tmp, &ioreq_list, list) {
		struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;

		spin_lock_irqsave(&kioreq->lock, flags);
		list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list)
			ioeventfd_free(kioeventfd);
		spin_unlock_irqrestore(&kioreq->lock, flags);

		ioreq_free(kioreq);
	}
	mutex_unlock(&ioreq_lock);
}
#else
static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
{
	return -EOPNOTSUPP;
}

static inline int privcmd_irqfd_init(void)
{
	return 0;
}

static inline void privcmd_irqfd_exit(void)
{
}

static inline long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
{
	return -EOPNOTSUPP;
}

static inline void privcmd_ioeventfd_exit(void)
{
}
#endif /* CONFIG_XEN_PRIVCMD_EVENTFD */

static long privcmd_ioctl(struct file *file,
			  unsigned int cmd, unsigned long data)
{
	int ret = -ENOTTY;
	void __user *udata = (void __user *) data;

	switch (cmd) {
	case IOCTL_PRIVCMD_HYPERCALL:
		ret = privcmd_ioctl_hypercall(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAP:
		ret = privcmd_ioctl_mmap(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAPBATCH:
		ret = privcmd_ioctl_mmap_batch(file, udata, 1);
		break;

	case IOCTL_PRIVCMD_MMAPBATCH_V2:
		ret = privcmd_ioctl_mmap_batch(file, udata, 2);
		break;

	case IOCTL_PRIVCMD_DM_OP:
		ret = privcmd_ioctl_dm_op(file, udata);
		break;

	case IOCTL_PRIVCMD_RESTRICT:
		ret = privcmd_ioctl_restrict(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAP_RESOURCE:
		ret = privcmd_ioctl_mmap_resource(file, udata);
		break;

	case IOCTL_PRIVCMD_IRQFD:
		ret = privcmd_ioctl_irqfd(file, udata);
		break;

	case IOCTL_PRIVCMD_IOEVENTFD:
		ret = privcmd_ioctl_ioeventfd(file, udata);
		break;

	case IOCTL_PRIVCMD_PCIDEV_GET_GSI:
		ret = privcmd_ioctl_pcidev_get_gsi(file, udata);
		break;

	default:
		break;
	}

	return ret;
}

static int privcmd_open(struct inode *ino, struct file *file)
{
	struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL);

	if (!data)
		return -ENOMEM;

	/* DOMID_INVALID implies no restriction */
	data->domid = DOMID_INVALID;

	file->private_data = data;
	return 0;
}

static int privcmd_release(struct inode *ino, struct file *file)
{
	struct privcmd_data *data = file->private_data;

	kfree(data);
	return 0;
}

static void privcmd_close(struct vm_area_struct *vma)
{
	struct page **pages = vma->vm_private_data;
	int numpgs = vma_pages(vma);
	int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
	int rc;

	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
		return;

	rc = xen_unmap_domain_gfn_range(vma, numgfns, pages);
	if (rc == 0)
		xen_free_unpopulated_pages(numpgs, pages);
	else
		pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n",
			numpgs, rc);
	kvfree(pages);
}

static vm_fault_t privcmd_fault(struct vm_fault *vmf)
{
	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
	       vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end,
	       vmf->pgoff, (void *)vmf->address);

	return VM_FAULT_SIGBUS;
}

static const struct vm_operations_struct privcmd_vm_ops = {
	.close = privcmd_close,
	.fault = privcmd_fault
};

static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
	 * how to recreate these mappings */
	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY |
			  VM_DONTEXPAND | VM_DONTDUMP);
	vma->vm_ops = &privcmd_vm_ops;
	vma->vm_private_data = NULL;

	return 0;
}

/*
 * For MMAPBATCH*. This allows asserting the singleshot mapping
 * on a per pfn/pte basis. Mapping calls that fail with ENOENT
 * can then be retried until success.
 */
static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data)
{
	return pte_none(ptep_get(pte)) ? 0 : -EBUSY;
}

static int privcmd_vma_range_is_mapped(
	       struct vm_area_struct *vma,
	       unsigned long addr,
	       unsigned long nr_pages)
{
	return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT,
				   is_mapped_fn, NULL) != 0;
}

const struct file_operations xen_privcmd_fops = {
	.owner = THIS_MODULE,
	.unlocked_ioctl = privcmd_ioctl,
	.open = privcmd_open,
	.release = privcmd_release,
	.mmap = privcmd_mmap,
};
EXPORT_SYMBOL_GPL(xen_privcmd_fops);

static struct miscdevice privcmd_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "xen/privcmd",
	.fops = &xen_privcmd_fops,
};

static int __init privcmd_init(void)
{
	int err;

	if (!xen_domain())
		return -ENODEV;

	err = misc_register(&privcmd_dev);
	if (err != 0) {
		pr_err("Could not register Xen privcmd device\n");
		return err;
	}

	err = misc_register(&xen_privcmdbuf_dev);
	if (err != 0) {
		pr_err("Could not register Xen hypercall-buf device\n");
		goto err_privcmdbuf;
	}

	err = privcmd_irqfd_init();
	if (err != 0) {
		pr_err("irqfd init failed\n");
		goto err_irqfd;
	}

	return 0;

err_irqfd:
	misc_deregister(&xen_privcmdbuf_dev);
err_privcmdbuf:
	misc_deregister(&privcmd_dev);
	return err;
}

static void __exit privcmd_exit(void)
{
	privcmd_ioeventfd_exit();
	privcmd_irqfd_exit();
	misc_deregister(&privcmd_dev);
	misc_deregister(&xen_privcmdbuf_dev);
}

module_init(privcmd_init);
module_exit(privcmd_exit);
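
/*
 * Usage sketch (documentation only, nothing below is built as part of this
 * driver): a hedged example of how a user-space tool might exercise the
 * MMAPBATCH_V2 path above through /dev/xen/privcmd. It assumes the UAPI
 * definitions from xen/privcmd.h (struct privcmd_mmapbatch_v2,
 * IOCTL_PRIVCMD_MMAPBATCH_V2, IOCTL_PRIVCMD_RESTRICT), Xen public headers
 * for domid_t/xen_pfn_t, and a 4 KiB Xen page granularity; the helper name
 * map_foreign_frames() is invented for illustration. Real tooling normally
 * goes through libxenforeignmemory rather than raw ioctls.
 *
 *	#include <fcntl.h>
 *	#include <stdint.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *	#include <xen/privcmd.h>
 *
 *	#define XEN_PAGE_SZ 4096UL	// assumed Xen page granularity
 *
 *	static void *map_foreign_frames(int fd, domid_t dom,
 *					const xen_pfn_t *gfns,
 *					unsigned int num, int *errs)
 *	{
 *		struct privcmd_mmapbatch_v2 batch;
 *		void *addr = mmap(NULL, num * XEN_PAGE_SZ,
 *				  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *		if (addr == MAP_FAILED)
 *			return NULL;
 *
 *		batch.num  = num;			// number of frames to map
 *		batch.dom  = dom;			// foreign domain owning them
 *		batch.addr = (uint64_t)(uintptr_t)addr;	// start of the privcmd VMA
 *		batch.arr  = gfns;			// guest frame numbers
 *		batch.err  = errs;			// per-frame status written back
 *
 *		if (ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH_V2, &batch)) {
 *			munmap(addr, num * XEN_PAGE_SZ);
 *			return NULL;
 *		}
 *		return addr;
 *	}
 *
 * A caller would open("/dev/xen/privcmd", O_RDWR | O_CLOEXEC), optionally
 * issue ioctl(fd, IOCTL_PRIVCMD_RESTRICT, &dom) so the fd can only target
 * that one domain, then call map_foreign_frames(); frames that were paged
 * out are reported as -ENOENT in errs[] and the mapping can be retried.
 */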