1 // SPDX-License-Identifier: GPL-2.0-only 2 /****************************************************************************** 3 * privcmd.c 4 * 5 * Interface to privileged domain-0 commands. 6 * 7 * Copyright (c) 2002-2004, K A Fraser, B Dragovic 8 */ 9 10 #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt 11 12 #include <linux/eventfd.h> 13 #include <linux/file.h> 14 #include <linux/kernel.h> 15 #include <linux/module.h> 16 #include <linux/mutex.h> 17 #include <linux/poll.h> 18 #include <linux/sched.h> 19 #include <linux/slab.h> 20 #include <linux/string.h> 21 #include <linux/workqueue.h> 22 #include <linux/errno.h> 23 #include <linux/mm.h> 24 #include <linux/mman.h> 25 #include <linux/uaccess.h> 26 #include <linux/swap.h> 27 #include <linux/highmem.h> 28 #include <linux/pagemap.h> 29 #include <linux/seq_file.h> 30 #include <linux/miscdevice.h> 31 #include <linux/moduleparam.h> 32 #include <linux/virtio_mmio.h> 33 34 #include <asm/xen/hypervisor.h> 35 #include <asm/xen/hypercall.h> 36 37 #include <xen/xen.h> 38 #include <xen/events.h> 39 #include <xen/privcmd.h> 40 #include <xen/interface/xen.h> 41 #include <xen/interface/memory.h> 42 #include <xen/interface/hvm/dm_op.h> 43 #include <xen/interface/hvm/ioreq.h> 44 #include <xen/features.h> 45 #include <xen/page.h> 46 #include <xen/xen-ops.h> 47 #include <xen/balloon.h> 48 49 #include "privcmd.h" 50 51 MODULE_LICENSE("GPL"); 52 53 #define PRIV_VMA_LOCKED ((void *)1) 54 55 static unsigned int privcmd_dm_op_max_num = 16; 56 module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644); 57 MODULE_PARM_DESC(dm_op_max_nr_bufs, 58 "Maximum number of buffers per dm_op hypercall"); 59 60 static unsigned int privcmd_dm_op_buf_max_size = 4096; 61 module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint, 62 0644); 63 MODULE_PARM_DESC(dm_op_buf_max_size, 64 "Maximum size of a dm_op hypercall buffer"); 65 66 struct privcmd_data { 67 domid_t domid; 68 }; 69 70 static int privcmd_vma_range_is_mapped( 71 struct vm_area_struct *vma, 72 unsigned long addr, 73 unsigned long nr_pages); 74 75 static long privcmd_ioctl_hypercall(struct file *file, void __user *udata) 76 { 77 struct privcmd_data *data = file->private_data; 78 struct privcmd_hypercall hypercall; 79 long ret; 80 81 /* Disallow arbitrary hypercalls if restricted */ 82 if (data->domid != DOMID_INVALID) 83 return -EPERM; 84 85 if (copy_from_user(&hypercall, udata, sizeof(hypercall))) 86 return -EFAULT; 87 88 xen_preemptible_hcall_begin(); 89 ret = privcmd_call(hypercall.op, 90 hypercall.arg[0], hypercall.arg[1], 91 hypercall.arg[2], hypercall.arg[3], 92 hypercall.arg[4]); 93 xen_preemptible_hcall_end(); 94 95 return ret; 96 } 97 98 static void free_page_list(struct list_head *pages) 99 { 100 struct page *p, *n; 101 102 list_for_each_entry_safe(p, n, pages, lru) 103 __free_page(p); 104 105 INIT_LIST_HEAD(pages); 106 } 107 108 /* 109 * Given an array of items in userspace, return a list of pages 110 * containing the data. If copying fails, either because of memory 111 * allocation failure or a problem reading user memory, return an 112 * error code; its up to the caller to dispose of any partial list. 113 */ 114 static int gather_array(struct list_head *pagelist, 115 unsigned nelem, size_t size, 116 const void __user *data) 117 { 118 unsigned pageidx; 119 void *pagedata; 120 int ret; 121 122 if (size > PAGE_SIZE) 123 return 0; 124 125 pageidx = PAGE_SIZE; 126 pagedata = NULL; /* quiet, gcc */ 127 while (nelem--) { 128 if (pageidx > PAGE_SIZE-size) { 129 struct page *page = alloc_page(GFP_KERNEL); 130 131 ret = -ENOMEM; 132 if (page == NULL) 133 goto fail; 134 135 pagedata = page_address(page); 136 137 list_add_tail(&page->lru, pagelist); 138 pageidx = 0; 139 } 140 141 ret = -EFAULT; 142 if (copy_from_user(pagedata + pageidx, data, size)) 143 goto fail; 144 145 data += size; 146 pageidx += size; 147 } 148 149 ret = 0; 150 151 fail: 152 return ret; 153 } 154 155 /* 156 * Call function "fn" on each element of the array fragmented 157 * over a list of pages. 158 */ 159 static int traverse_pages(unsigned nelem, size_t size, 160 struct list_head *pos, 161 int (*fn)(void *data, void *state), 162 void *state) 163 { 164 void *pagedata; 165 unsigned pageidx; 166 int ret = 0; 167 168 BUG_ON(size > PAGE_SIZE); 169 170 pageidx = PAGE_SIZE; 171 pagedata = NULL; /* hush, gcc */ 172 173 while (nelem--) { 174 if (pageidx > PAGE_SIZE-size) { 175 struct page *page; 176 pos = pos->next; 177 page = list_entry(pos, struct page, lru); 178 pagedata = page_address(page); 179 pageidx = 0; 180 } 181 182 ret = (*fn)(pagedata + pageidx, state); 183 if (ret) 184 break; 185 pageidx += size; 186 } 187 188 return ret; 189 } 190 191 /* 192 * Similar to traverse_pages, but use each page as a "block" of 193 * data to be processed as one unit. 194 */ 195 static int traverse_pages_block(unsigned nelem, size_t size, 196 struct list_head *pos, 197 int (*fn)(void *data, int nr, void *state), 198 void *state) 199 { 200 void *pagedata; 201 int ret = 0; 202 203 BUG_ON(size > PAGE_SIZE); 204 205 while (nelem) { 206 int nr = (PAGE_SIZE/size); 207 struct page *page; 208 if (nr > nelem) 209 nr = nelem; 210 pos = pos->next; 211 page = list_entry(pos, struct page, lru); 212 pagedata = page_address(page); 213 ret = (*fn)(pagedata, nr, state); 214 if (ret) 215 break; 216 nelem -= nr; 217 } 218 219 return ret; 220 } 221 222 struct mmap_gfn_state { 223 unsigned long va; 224 struct vm_area_struct *vma; 225 domid_t domain; 226 }; 227 228 static int mmap_gfn_range(void *data, void *state) 229 { 230 struct privcmd_mmap_entry *msg = data; 231 struct mmap_gfn_state *st = state; 232 struct vm_area_struct *vma = st->vma; 233 int rc; 234 235 /* Do not allow range to wrap the address space. */ 236 if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || 237 ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) 238 return -EINVAL; 239 240 /* Range chunks must be contiguous in va space. */ 241 if ((msg->va != st->va) || 242 ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) 243 return -EINVAL; 244 245 rc = xen_remap_domain_gfn_range(vma, 246 msg->va & PAGE_MASK, 247 msg->mfn, msg->npages, 248 vma->vm_page_prot, 249 st->domain, NULL); 250 if (rc < 0) 251 return rc; 252 253 st->va += msg->npages << PAGE_SHIFT; 254 255 return 0; 256 } 257 258 static long privcmd_ioctl_mmap(struct file *file, void __user *udata) 259 { 260 struct privcmd_data *data = file->private_data; 261 struct privcmd_mmap mmapcmd; 262 struct mm_struct *mm = current->mm; 263 struct vm_area_struct *vma; 264 int rc; 265 LIST_HEAD(pagelist); 266 struct mmap_gfn_state state; 267 268 /* We only support privcmd_ioctl_mmap_batch for non-auto-translated. */ 269 if (xen_feature(XENFEAT_auto_translated_physmap)) 270 return -ENOSYS; 271 272 if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) 273 return -EFAULT; 274 275 /* If restriction is in place, check the domid matches */ 276 if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom) 277 return -EPERM; 278 279 rc = gather_array(&pagelist, 280 mmapcmd.num, sizeof(struct privcmd_mmap_entry), 281 mmapcmd.entry); 282 283 if (rc || list_empty(&pagelist)) 284 goto out; 285 286 mmap_write_lock(mm); 287 288 { 289 struct page *page = list_first_entry(&pagelist, 290 struct page, lru); 291 struct privcmd_mmap_entry *msg = page_address(page); 292 293 vma = vma_lookup(mm, msg->va); 294 rc = -EINVAL; 295 296 if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) 297 goto out_up; 298 vma->vm_private_data = PRIV_VMA_LOCKED; 299 } 300 301 state.va = vma->vm_start; 302 state.vma = vma; 303 state.domain = mmapcmd.dom; 304 305 rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), 306 &pagelist, 307 mmap_gfn_range, &state); 308 309 310 out_up: 311 mmap_write_unlock(mm); 312 313 out: 314 free_page_list(&pagelist); 315 316 return rc; 317 } 318 319 struct mmap_batch_state { 320 domid_t domain; 321 unsigned long va; 322 struct vm_area_struct *vma; 323 int index; 324 /* A tristate: 325 * 0 for no errors 326 * 1 if at least one error has happened (and no 327 * -ENOENT errors have happened) 328 * -ENOENT if at least 1 -ENOENT has happened. 329 */ 330 int global_error; 331 int version; 332 333 /* User-space gfn array to store errors in the second pass for V1. */ 334 xen_pfn_t __user *user_gfn; 335 /* User-space int array to store errors in the second pass for V2. */ 336 int __user *user_err; 337 }; 338 339 /* auto translated dom0 note: if domU being created is PV, then gfn is 340 * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP). 341 */ 342 static int mmap_batch_fn(void *data, int nr, void *state) 343 { 344 xen_pfn_t *gfnp = data; 345 struct mmap_batch_state *st = state; 346 struct vm_area_struct *vma = st->vma; 347 struct page **pages = vma->vm_private_data; 348 struct page **cur_pages = NULL; 349 int ret; 350 351 if (xen_feature(XENFEAT_auto_translated_physmap)) 352 cur_pages = &pages[st->index]; 353 354 BUG_ON(nr < 0); 355 ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr, 356 (int *)gfnp, st->vma->vm_page_prot, 357 st->domain, cur_pages); 358 359 /* Adjust the global_error? */ 360 if (ret != nr) { 361 if (ret == -ENOENT) 362 st->global_error = -ENOENT; 363 else { 364 /* Record that at least one error has happened. */ 365 if (st->global_error == 0) 366 st->global_error = 1; 367 } 368 } 369 st->va += XEN_PAGE_SIZE * nr; 370 st->index += nr / XEN_PFN_PER_PAGE; 371 372 return 0; 373 } 374 375 static int mmap_return_error(int err, struct mmap_batch_state *st) 376 { 377 int ret; 378 379 if (st->version == 1) { 380 if (err) { 381 xen_pfn_t gfn; 382 383 ret = get_user(gfn, st->user_gfn); 384 if (ret < 0) 385 return ret; 386 /* 387 * V1 encodes the error codes in the 32bit top 388 * nibble of the gfn (with its known 389 * limitations vis-a-vis 64 bit callers). 390 */ 391 gfn |= (err == -ENOENT) ? 392 PRIVCMD_MMAPBATCH_PAGED_ERROR : 393 PRIVCMD_MMAPBATCH_MFN_ERROR; 394 return __put_user(gfn, st->user_gfn++); 395 } else 396 st->user_gfn++; 397 } else { /* st->version == 2 */ 398 if (err) 399 return __put_user(err, st->user_err++); 400 else 401 st->user_err++; 402 } 403 404 return 0; 405 } 406 407 static int mmap_return_errors(void *data, int nr, void *state) 408 { 409 struct mmap_batch_state *st = state; 410 int *errs = data; 411 int i; 412 int ret; 413 414 for (i = 0; i < nr; i++) { 415 ret = mmap_return_error(errs[i], st); 416 if (ret < 0) 417 return ret; 418 } 419 return 0; 420 } 421 422 /* Allocate pfns that are then mapped with gfns from foreign domid. Update 423 * the vma with the page info to use later. 424 * Returns: 0 if success, otherwise -errno 425 */ 426 static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) 427 { 428 int rc; 429 struct page **pages; 430 431 pages = kvcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); 432 if (pages == NULL) 433 return -ENOMEM; 434 435 rc = xen_alloc_unpopulated_pages(numpgs, pages); 436 if (rc != 0) { 437 pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, 438 numpgs, rc); 439 kvfree(pages); 440 return -ENOMEM; 441 } 442 BUG_ON(vma->vm_private_data != NULL); 443 vma->vm_private_data = pages; 444 445 return 0; 446 } 447 448 static const struct vm_operations_struct privcmd_vm_ops; 449 450 static long privcmd_ioctl_mmap_batch( 451 struct file *file, void __user *udata, int version) 452 { 453 struct privcmd_data *data = file->private_data; 454 int ret; 455 struct privcmd_mmapbatch_v2 m; 456 struct mm_struct *mm = current->mm; 457 struct vm_area_struct *vma; 458 unsigned long nr_pages; 459 LIST_HEAD(pagelist); 460 struct mmap_batch_state state; 461 462 switch (version) { 463 case 1: 464 if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) 465 return -EFAULT; 466 /* Returns per-frame error in m.arr. */ 467 m.err = NULL; 468 if (!access_ok(m.arr, m.num * sizeof(*m.arr))) 469 return -EFAULT; 470 break; 471 case 2: 472 if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) 473 return -EFAULT; 474 /* Returns per-frame error code in m.err. */ 475 if (!access_ok(m.err, m.num * (sizeof(*m.err)))) 476 return -EFAULT; 477 break; 478 default: 479 return -EINVAL; 480 } 481 482 /* If restriction is in place, check the domid matches */ 483 if (data->domid != DOMID_INVALID && data->domid != m.dom) 484 return -EPERM; 485 486 nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE); 487 if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) 488 return -EINVAL; 489 490 ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); 491 492 if (ret) 493 goto out; 494 if (list_empty(&pagelist)) { 495 ret = -EINVAL; 496 goto out; 497 } 498 499 if (version == 2) { 500 /* Zero error array now to only copy back actual errors. */ 501 if (clear_user(m.err, sizeof(int) * m.num)) { 502 ret = -EFAULT; 503 goto out; 504 } 505 } 506 507 mmap_write_lock(mm); 508 509 vma = find_vma(mm, m.addr); 510 if (!vma || 511 vma->vm_ops != &privcmd_vm_ops) { 512 ret = -EINVAL; 513 goto out_unlock; 514 } 515 516 /* 517 * Caller must either: 518 * 519 * Map the whole VMA range, which will also allocate all the 520 * pages required for the auto_translated_physmap case. 521 * 522 * Or 523 * 524 * Map unmapped holes left from a previous map attempt (e.g., 525 * because those foreign frames were previously paged out). 526 */ 527 if (vma->vm_private_data == NULL) { 528 if (m.addr != vma->vm_start || 529 m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) { 530 ret = -EINVAL; 531 goto out_unlock; 532 } 533 if (xen_feature(XENFEAT_auto_translated_physmap)) { 534 ret = alloc_empty_pages(vma, nr_pages); 535 if (ret < 0) 536 goto out_unlock; 537 } else 538 vma->vm_private_data = PRIV_VMA_LOCKED; 539 } else { 540 if (m.addr < vma->vm_start || 541 m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) { 542 ret = -EINVAL; 543 goto out_unlock; 544 } 545 if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) { 546 ret = -EINVAL; 547 goto out_unlock; 548 } 549 } 550 551 state.domain = m.dom; 552 state.vma = vma; 553 state.va = m.addr; 554 state.index = 0; 555 state.global_error = 0; 556 state.version = version; 557 558 BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0); 559 /* mmap_batch_fn guarantees ret == 0 */ 560 BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t), 561 &pagelist, mmap_batch_fn, &state)); 562 563 mmap_write_unlock(mm); 564 565 if (state.global_error) { 566 /* Write back errors in second pass. */ 567 state.user_gfn = (xen_pfn_t *)m.arr; 568 state.user_err = m.err; 569 ret = traverse_pages_block(m.num, sizeof(xen_pfn_t), 570 &pagelist, mmap_return_errors, &state); 571 } else 572 ret = 0; 573 574 /* If we have not had any EFAULT-like global errors then set the global 575 * error to -ENOENT if necessary. */ 576 if ((ret == 0) && (state.global_error == -ENOENT)) 577 ret = -ENOENT; 578 579 out: 580 free_page_list(&pagelist); 581 return ret; 582 583 out_unlock: 584 mmap_write_unlock(mm); 585 goto out; 586 } 587 588 static int lock_pages( 589 struct privcmd_dm_op_buf kbufs[], unsigned int num, 590 struct page *pages[], unsigned int nr_pages, unsigned int *pinned) 591 { 592 unsigned int i, off = 0; 593 594 for (i = 0; i < num; ) { 595 unsigned int requested; 596 int page_count; 597 598 requested = DIV_ROUND_UP( 599 offset_in_page(kbufs[i].uptr) + kbufs[i].size, 600 PAGE_SIZE) - off; 601 if (requested > nr_pages) 602 return -ENOSPC; 603 604 page_count = pin_user_pages_fast( 605 (unsigned long)kbufs[i].uptr + off * PAGE_SIZE, 606 requested, FOLL_WRITE, pages); 607 if (page_count <= 0) 608 return page_count ? : -EFAULT; 609 610 *pinned += page_count; 611 nr_pages -= page_count; 612 pages += page_count; 613 614 off = (requested == page_count) ? 0 : off + page_count; 615 i += !off; 616 } 617 618 return 0; 619 } 620 621 static void unlock_pages(struct page *pages[], unsigned int nr_pages) 622 { 623 unpin_user_pages_dirty_lock(pages, nr_pages, true); 624 } 625 626 static long privcmd_ioctl_dm_op(struct file *file, void __user *udata) 627 { 628 struct privcmd_data *data = file->private_data; 629 struct privcmd_dm_op kdata; 630 struct privcmd_dm_op_buf *kbufs; 631 unsigned int nr_pages = 0; 632 struct page **pages = NULL; 633 struct xen_dm_op_buf *xbufs = NULL; 634 unsigned int i; 635 long rc; 636 unsigned int pinned = 0; 637 638 if (copy_from_user(&kdata, udata, sizeof(kdata))) 639 return -EFAULT; 640 641 /* If restriction is in place, check the domid matches */ 642 if (data->domid != DOMID_INVALID && data->domid != kdata.dom) 643 return -EPERM; 644 645 if (kdata.num == 0) 646 return 0; 647 648 if (kdata.num > privcmd_dm_op_max_num) 649 return -E2BIG; 650 651 kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL); 652 if (!kbufs) 653 return -ENOMEM; 654 655 if (copy_from_user(kbufs, kdata.ubufs, 656 sizeof(*kbufs) * kdata.num)) { 657 rc = -EFAULT; 658 goto out; 659 } 660 661 for (i = 0; i < kdata.num; i++) { 662 if (kbufs[i].size > privcmd_dm_op_buf_max_size) { 663 rc = -E2BIG; 664 goto out; 665 } 666 667 if (!access_ok(kbufs[i].uptr, 668 kbufs[i].size)) { 669 rc = -EFAULT; 670 goto out; 671 } 672 673 nr_pages += DIV_ROUND_UP( 674 offset_in_page(kbufs[i].uptr) + kbufs[i].size, 675 PAGE_SIZE); 676 } 677 678 pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); 679 if (!pages) { 680 rc = -ENOMEM; 681 goto out; 682 } 683 684 xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL); 685 if (!xbufs) { 686 rc = -ENOMEM; 687 goto out; 688 } 689 690 rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned); 691 if (rc < 0) 692 goto out; 693 694 for (i = 0; i < kdata.num; i++) { 695 set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr); 696 xbufs[i].size = kbufs[i].size; 697 } 698 699 xen_preemptible_hcall_begin(); 700 rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs); 701 xen_preemptible_hcall_end(); 702 703 out: 704 unlock_pages(pages, pinned); 705 kfree(xbufs); 706 kfree(pages); 707 kfree(kbufs); 708 709 return rc; 710 } 711 712 static long privcmd_ioctl_restrict(struct file *file, void __user *udata) 713 { 714 struct privcmd_data *data = file->private_data; 715 domid_t dom; 716 717 if (copy_from_user(&dom, udata, sizeof(dom))) 718 return -EFAULT; 719 720 /* Set restriction to the specified domain, or check it matches */ 721 if (data->domid == DOMID_INVALID) 722 data->domid = dom; 723 else if (data->domid != dom) 724 return -EINVAL; 725 726 return 0; 727 } 728 729 static long privcmd_ioctl_mmap_resource(struct file *file, 730 struct privcmd_mmap_resource __user *udata) 731 { 732 struct privcmd_data *data = file->private_data; 733 struct mm_struct *mm = current->mm; 734 struct vm_area_struct *vma; 735 struct privcmd_mmap_resource kdata; 736 xen_pfn_t *pfns = NULL; 737 struct xen_mem_acquire_resource xdata = { }; 738 int rc; 739 740 if (copy_from_user(&kdata, udata, sizeof(kdata))) 741 return -EFAULT; 742 743 /* If restriction is in place, check the domid matches */ 744 if (data->domid != DOMID_INVALID && data->domid != kdata.dom) 745 return -EPERM; 746 747 /* Both fields must be set or unset */ 748 if (!!kdata.addr != !!kdata.num) 749 return -EINVAL; 750 751 xdata.domid = kdata.dom; 752 xdata.type = kdata.type; 753 xdata.id = kdata.id; 754 755 if (!kdata.addr && !kdata.num) { 756 /* Query the size of the resource. */ 757 rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata); 758 if (rc) 759 return rc; 760 return __put_user(xdata.nr_frames, &udata->num); 761 } 762 763 mmap_write_lock(mm); 764 765 vma = find_vma(mm, kdata.addr); 766 if (!vma || vma->vm_ops != &privcmd_vm_ops) { 767 rc = -EINVAL; 768 goto out; 769 } 770 771 pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL | __GFP_NOWARN); 772 if (!pfns) { 773 rc = -ENOMEM; 774 goto out; 775 } 776 777 if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && 778 xen_feature(XENFEAT_auto_translated_physmap)) { 779 unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE); 780 struct page **pages; 781 unsigned int i; 782 783 rc = alloc_empty_pages(vma, nr); 784 if (rc < 0) 785 goto out; 786 787 pages = vma->vm_private_data; 788 789 for (i = 0; i < kdata.num; i++) { 790 xen_pfn_t pfn = 791 page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]); 792 793 pfns[i] = pfn + (i % XEN_PFN_PER_PAGE); 794 } 795 } else 796 vma->vm_private_data = PRIV_VMA_LOCKED; 797 798 xdata.frame = kdata.idx; 799 xdata.nr_frames = kdata.num; 800 set_xen_guest_handle(xdata.frame_list, pfns); 801 802 xen_preemptible_hcall_begin(); 803 rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata); 804 xen_preemptible_hcall_end(); 805 806 if (rc) 807 goto out; 808 809 if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && 810 xen_feature(XENFEAT_auto_translated_physmap)) { 811 rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT); 812 } else { 813 unsigned int domid = 814 (xdata.flags & XENMEM_rsrc_acq_caller_owned) ? 815 DOMID_SELF : kdata.dom; 816 int num, *errs = (int *)pfns; 817 818 BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns)); 819 num = xen_remap_domain_mfn_array(vma, 820 kdata.addr & PAGE_MASK, 821 pfns, kdata.num, errs, 822 vma->vm_page_prot, 823 domid); 824 if (num < 0) 825 rc = num; 826 else if (num != kdata.num) { 827 unsigned int i; 828 829 for (i = 0; i < num; i++) { 830 rc = errs[i]; 831 if (rc < 0) 832 break; 833 } 834 } else 835 rc = 0; 836 } 837 838 out: 839 mmap_write_unlock(mm); 840 kfree(pfns); 841 842 return rc; 843 } 844 845 #ifdef CONFIG_XEN_PRIVCMD_EVENTFD 846 /* Irqfd support */ 847 static struct workqueue_struct *irqfd_cleanup_wq; 848 static DEFINE_MUTEX(irqfds_lock); 849 static LIST_HEAD(irqfds_list); 850 851 struct privcmd_kernel_irqfd { 852 struct xen_dm_op_buf xbufs; 853 domid_t dom; 854 bool error; 855 struct eventfd_ctx *eventfd; 856 struct work_struct shutdown; 857 wait_queue_entry_t wait; 858 struct list_head list; 859 poll_table pt; 860 }; 861 862 static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd) 863 { 864 lockdep_assert_held(&irqfds_lock); 865 866 list_del_init(&kirqfd->list); 867 queue_work(irqfd_cleanup_wq, &kirqfd->shutdown); 868 } 869 870 static void irqfd_shutdown(struct work_struct *work) 871 { 872 struct privcmd_kernel_irqfd *kirqfd = 873 container_of(work, struct privcmd_kernel_irqfd, shutdown); 874 u64 cnt; 875 876 eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt); 877 eventfd_ctx_put(kirqfd->eventfd); 878 kfree(kirqfd); 879 } 880 881 static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd) 882 { 883 u64 cnt; 884 long rc; 885 886 eventfd_ctx_do_read(kirqfd->eventfd, &cnt); 887 888 xen_preemptible_hcall_begin(); 889 rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs); 890 xen_preemptible_hcall_end(); 891 892 /* Don't repeat the error message for consecutive failures */ 893 if (rc && !kirqfd->error) { 894 pr_err("Failed to configure irq for guest domain: %d\n", 895 kirqfd->dom); 896 } 897 898 kirqfd->error = rc; 899 } 900 901 static int 902 irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) 903 { 904 struct privcmd_kernel_irqfd *kirqfd = 905 container_of(wait, struct privcmd_kernel_irqfd, wait); 906 __poll_t flags = key_to_poll(key); 907 908 if (flags & EPOLLIN) 909 irqfd_inject(kirqfd); 910 911 if (flags & EPOLLHUP) { 912 mutex_lock(&irqfds_lock); 913 irqfd_deactivate(kirqfd); 914 mutex_unlock(&irqfds_lock); 915 } 916 917 return 0; 918 } 919 920 static void 921 irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) 922 { 923 struct privcmd_kernel_irqfd *kirqfd = 924 container_of(pt, struct privcmd_kernel_irqfd, pt); 925 926 add_wait_queue_priority(wqh, &kirqfd->wait); 927 } 928 929 static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd) 930 { 931 struct privcmd_kernel_irqfd *kirqfd, *tmp; 932 __poll_t events; 933 struct fd f; 934 void *dm_op; 935 int ret; 936 937 kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL); 938 if (!kirqfd) 939 return -ENOMEM; 940 dm_op = kirqfd + 1; 941 942 if (copy_from_user(dm_op, u64_to_user_ptr(irqfd->dm_op), irqfd->size)) { 943 ret = -EFAULT; 944 goto error_kfree; 945 } 946 947 kirqfd->xbufs.size = irqfd->size; 948 set_xen_guest_handle(kirqfd->xbufs.h, dm_op); 949 kirqfd->dom = irqfd->dom; 950 INIT_WORK(&kirqfd->shutdown, irqfd_shutdown); 951 952 f = fdget(irqfd->fd); 953 if (!f.file) { 954 ret = -EBADF; 955 goto error_kfree; 956 } 957 958 kirqfd->eventfd = eventfd_ctx_fileget(f.file); 959 if (IS_ERR(kirqfd->eventfd)) { 960 ret = PTR_ERR(kirqfd->eventfd); 961 goto error_fd_put; 962 } 963 964 /* 965 * Install our own custom wake-up handling so we are notified via a 966 * callback whenever someone signals the underlying eventfd. 967 */ 968 init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup); 969 init_poll_funcptr(&kirqfd->pt, irqfd_poll_func); 970 971 mutex_lock(&irqfds_lock); 972 973 list_for_each_entry(tmp, &irqfds_list, list) { 974 if (kirqfd->eventfd == tmp->eventfd) { 975 ret = -EBUSY; 976 mutex_unlock(&irqfds_lock); 977 goto error_eventfd; 978 } 979 } 980 981 list_add_tail(&kirqfd->list, &irqfds_list); 982 mutex_unlock(&irqfds_lock); 983 984 /* 985 * Check if there was an event already pending on the eventfd before we 986 * registered, and trigger it as if we didn't miss it. 987 */ 988 events = vfs_poll(f.file, &kirqfd->pt); 989 if (events & EPOLLIN) 990 irqfd_inject(kirqfd); 991 992 /* 993 * Do not drop the file until the kirqfd is fully initialized, otherwise 994 * we might race against the EPOLLHUP. 995 */ 996 fdput(f); 997 return 0; 998 999 error_eventfd: 1000 eventfd_ctx_put(kirqfd->eventfd); 1001 1002 error_fd_put: 1003 fdput(f); 1004 1005 error_kfree: 1006 kfree(kirqfd); 1007 return ret; 1008 } 1009 1010 static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd) 1011 { 1012 struct privcmd_kernel_irqfd *kirqfd; 1013 struct eventfd_ctx *eventfd; 1014 1015 eventfd = eventfd_ctx_fdget(irqfd->fd); 1016 if (IS_ERR(eventfd)) 1017 return PTR_ERR(eventfd); 1018 1019 mutex_lock(&irqfds_lock); 1020 1021 list_for_each_entry(kirqfd, &irqfds_list, list) { 1022 if (kirqfd->eventfd == eventfd) { 1023 irqfd_deactivate(kirqfd); 1024 break; 1025 } 1026 } 1027 1028 mutex_unlock(&irqfds_lock); 1029 1030 eventfd_ctx_put(eventfd); 1031 1032 /* 1033 * Block until we know all outstanding shutdown jobs have completed so 1034 * that we guarantee there will not be any more interrupts once this 1035 * deassign function returns. 1036 */ 1037 flush_workqueue(irqfd_cleanup_wq); 1038 1039 return 0; 1040 } 1041 1042 static long privcmd_ioctl_irqfd(struct file *file, void __user *udata) 1043 { 1044 struct privcmd_data *data = file->private_data; 1045 struct privcmd_irqfd irqfd; 1046 1047 if (copy_from_user(&irqfd, udata, sizeof(irqfd))) 1048 return -EFAULT; 1049 1050 /* No other flags should be set */ 1051 if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN) 1052 return -EINVAL; 1053 1054 /* If restriction is in place, check the domid matches */ 1055 if (data->domid != DOMID_INVALID && data->domid != irqfd.dom) 1056 return -EPERM; 1057 1058 if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN) 1059 return privcmd_irqfd_deassign(&irqfd); 1060 1061 return privcmd_irqfd_assign(&irqfd); 1062 } 1063 1064 static int privcmd_irqfd_init(void) 1065 { 1066 irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0); 1067 if (!irqfd_cleanup_wq) 1068 return -ENOMEM; 1069 1070 return 0; 1071 } 1072 1073 static void privcmd_irqfd_exit(void) 1074 { 1075 struct privcmd_kernel_irqfd *kirqfd, *tmp; 1076 1077 mutex_lock(&irqfds_lock); 1078 1079 list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list) 1080 irqfd_deactivate(kirqfd); 1081 1082 mutex_unlock(&irqfds_lock); 1083 1084 destroy_workqueue(irqfd_cleanup_wq); 1085 } 1086 1087 /* Ioeventfd Support */ 1088 #define QUEUE_NOTIFY_VQ_MASK 0xFFFF 1089 1090 static DEFINE_MUTEX(ioreq_lock); 1091 static LIST_HEAD(ioreq_list); 1092 1093 /* per-eventfd structure */ 1094 struct privcmd_kernel_ioeventfd { 1095 struct eventfd_ctx *eventfd; 1096 struct list_head list; 1097 u64 addr; 1098 unsigned int addr_len; 1099 unsigned int vq; 1100 }; 1101 1102 /* per-guest CPU / port structure */ 1103 struct ioreq_port { 1104 int vcpu; 1105 unsigned int port; 1106 struct privcmd_kernel_ioreq *kioreq; 1107 }; 1108 1109 /* per-guest structure */ 1110 struct privcmd_kernel_ioreq { 1111 domid_t dom; 1112 unsigned int vcpus; 1113 u64 uioreq; 1114 struct ioreq *ioreq; 1115 spinlock_t lock; /* Protects ioeventfds list */ 1116 struct list_head ioeventfds; 1117 struct list_head list; 1118 struct ioreq_port ports[] __counted_by(vcpus); 1119 }; 1120 1121 static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id) 1122 { 1123 struct ioreq_port *port = dev_id; 1124 struct privcmd_kernel_ioreq *kioreq = port->kioreq; 1125 struct ioreq *ioreq = &kioreq->ioreq[port->vcpu]; 1126 struct privcmd_kernel_ioeventfd *kioeventfd; 1127 unsigned int state = STATE_IOREQ_READY; 1128 1129 if (ioreq->state != STATE_IOREQ_READY || 1130 ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE) 1131 return IRQ_NONE; 1132 1133 /* 1134 * We need a barrier, smp_mb(), here to ensure reads are finished before 1135 * `state` is updated. Since the lock implementation ensures that 1136 * appropriate barrier will be added anyway, we can avoid adding 1137 * explicit barrier here. 1138 * 1139 * Ideally we don't need to update `state` within the locks, but we do 1140 * that here to avoid adding explicit barrier. 1141 */ 1142 1143 spin_lock(&kioreq->lock); 1144 ioreq->state = STATE_IOREQ_INPROCESS; 1145 1146 list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) { 1147 if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY && 1148 ioreq->size == kioeventfd->addr_len && 1149 (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) { 1150 eventfd_signal(kioeventfd->eventfd, 1); 1151 state = STATE_IORESP_READY; 1152 break; 1153 } 1154 } 1155 spin_unlock(&kioreq->lock); 1156 1157 /* 1158 * We need a barrier, smp_mb(), here to ensure writes are finished 1159 * before `state` is updated. Since the lock implementation ensures that 1160 * appropriate barrier will be added anyway, we can avoid adding 1161 * explicit barrier here. 1162 */ 1163 1164 ioreq->state = state; 1165 1166 if (state == STATE_IORESP_READY) { 1167 notify_remote_via_evtchn(port->port); 1168 return IRQ_HANDLED; 1169 } 1170 1171 return IRQ_NONE; 1172 } 1173 1174 static void ioreq_free(struct privcmd_kernel_ioreq *kioreq) 1175 { 1176 struct ioreq_port *ports = kioreq->ports; 1177 int i; 1178 1179 lockdep_assert_held(&ioreq_lock); 1180 1181 list_del(&kioreq->list); 1182 1183 for (i = kioreq->vcpus - 1; i >= 0; i--) 1184 unbind_from_irqhandler(irq_from_evtchn(ports[i].port), &ports[i]); 1185 1186 kfree(kioreq); 1187 } 1188 1189 static 1190 struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd) 1191 { 1192 struct privcmd_kernel_ioreq *kioreq; 1193 struct mm_struct *mm = current->mm; 1194 struct vm_area_struct *vma; 1195 struct page **pages; 1196 unsigned int *ports; 1197 int ret, size, i; 1198 1199 lockdep_assert_held(&ioreq_lock); 1200 1201 size = struct_size(kioreq, ports, ioeventfd->vcpus); 1202 kioreq = kzalloc(size, GFP_KERNEL); 1203 if (!kioreq) 1204 return ERR_PTR(-ENOMEM); 1205 1206 kioreq->dom = ioeventfd->dom; 1207 kioreq->vcpus = ioeventfd->vcpus; 1208 kioreq->uioreq = ioeventfd->ioreq; 1209 spin_lock_init(&kioreq->lock); 1210 INIT_LIST_HEAD(&kioreq->ioeventfds); 1211 1212 /* The memory for ioreq server must have been mapped earlier */ 1213 mmap_write_lock(mm); 1214 vma = find_vma(mm, (unsigned long)ioeventfd->ioreq); 1215 if (!vma) { 1216 pr_err("Failed to find vma for ioreq page!\n"); 1217 mmap_write_unlock(mm); 1218 ret = -EFAULT; 1219 goto error_kfree; 1220 } 1221 1222 pages = vma->vm_private_data; 1223 kioreq->ioreq = (struct ioreq *)(page_to_virt(pages[0])); 1224 mmap_write_unlock(mm); 1225 1226 size = sizeof(*ports) * kioreq->vcpus; 1227 ports = kzalloc(size, GFP_KERNEL); 1228 if (!ports) { 1229 ret = -ENOMEM; 1230 goto error_kfree; 1231 } 1232 1233 if (copy_from_user(ports, u64_to_user_ptr(ioeventfd->ports), size)) { 1234 ret = -EFAULT; 1235 goto error_kfree_ports; 1236 } 1237 1238 for (i = 0; i < kioreq->vcpus; i++) { 1239 kioreq->ports[i].vcpu = i; 1240 kioreq->ports[i].port = ports[i]; 1241 kioreq->ports[i].kioreq = kioreq; 1242 1243 ret = bind_evtchn_to_irqhandler_lateeoi(ports[i], 1244 ioeventfd_interrupt, IRQF_SHARED, "ioeventfd", 1245 &kioreq->ports[i]); 1246 if (ret < 0) 1247 goto error_unbind; 1248 } 1249 1250 kfree(ports); 1251 1252 list_add_tail(&kioreq->list, &ioreq_list); 1253 1254 return kioreq; 1255 1256 error_unbind: 1257 while (--i >= 0) 1258 unbind_from_irqhandler(irq_from_evtchn(ports[i]), &kioreq->ports[i]); 1259 error_kfree_ports: 1260 kfree(ports); 1261 error_kfree: 1262 kfree(kioreq); 1263 return ERR_PTR(ret); 1264 } 1265 1266 static struct privcmd_kernel_ioreq * 1267 get_ioreq(struct privcmd_ioeventfd *ioeventfd, struct eventfd_ctx *eventfd) 1268 { 1269 struct privcmd_kernel_ioreq *kioreq; 1270 unsigned long flags; 1271 1272 list_for_each_entry(kioreq, &ioreq_list, list) { 1273 struct privcmd_kernel_ioeventfd *kioeventfd; 1274 1275 /* 1276 * kioreq fields can be accessed here without a lock as they are 1277 * never updated after being added to the ioreq_list. 1278 */ 1279 if (kioreq->uioreq != ioeventfd->ioreq) { 1280 continue; 1281 } else if (kioreq->dom != ioeventfd->dom || 1282 kioreq->vcpus != ioeventfd->vcpus) { 1283 pr_err("Invalid ioeventfd configuration mismatch, dom (%u vs %u), vcpus (%u vs %u)\n", 1284 kioreq->dom, ioeventfd->dom, kioreq->vcpus, 1285 ioeventfd->vcpus); 1286 return ERR_PTR(-EINVAL); 1287 } 1288 1289 /* Look for a duplicate eventfd for the same guest */ 1290 spin_lock_irqsave(&kioreq->lock, flags); 1291 list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) { 1292 if (eventfd == kioeventfd->eventfd) { 1293 spin_unlock_irqrestore(&kioreq->lock, flags); 1294 return ERR_PTR(-EBUSY); 1295 } 1296 } 1297 spin_unlock_irqrestore(&kioreq->lock, flags); 1298 1299 return kioreq; 1300 } 1301 1302 /* Matching kioreq isn't found, allocate a new one */ 1303 return alloc_ioreq(ioeventfd); 1304 } 1305 1306 static void ioeventfd_free(struct privcmd_kernel_ioeventfd *kioeventfd) 1307 { 1308 list_del(&kioeventfd->list); 1309 eventfd_ctx_put(kioeventfd->eventfd); 1310 kfree(kioeventfd); 1311 } 1312 1313 static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd) 1314 { 1315 struct privcmd_kernel_ioeventfd *kioeventfd; 1316 struct privcmd_kernel_ioreq *kioreq; 1317 unsigned long flags; 1318 struct fd f; 1319 int ret; 1320 1321 /* Check for range overflow */ 1322 if (ioeventfd->addr + ioeventfd->addr_len < ioeventfd->addr) 1323 return -EINVAL; 1324 1325 /* Vhost requires us to support length 1, 2, 4, and 8 */ 1326 if (!(ioeventfd->addr_len == 1 || ioeventfd->addr_len == 2 || 1327 ioeventfd->addr_len == 4 || ioeventfd->addr_len == 8)) 1328 return -EINVAL; 1329 1330 /* 4096 vcpus limit enough ? */ 1331 if (!ioeventfd->vcpus || ioeventfd->vcpus > 4096) 1332 return -EINVAL; 1333 1334 kioeventfd = kzalloc(sizeof(*kioeventfd), GFP_KERNEL); 1335 if (!kioeventfd) 1336 return -ENOMEM; 1337 1338 f = fdget(ioeventfd->event_fd); 1339 if (!f.file) { 1340 ret = -EBADF; 1341 goto error_kfree; 1342 } 1343 1344 kioeventfd->eventfd = eventfd_ctx_fileget(f.file); 1345 fdput(f); 1346 1347 if (IS_ERR(kioeventfd->eventfd)) { 1348 ret = PTR_ERR(kioeventfd->eventfd); 1349 goto error_kfree; 1350 } 1351 1352 kioeventfd->addr = ioeventfd->addr; 1353 kioeventfd->addr_len = ioeventfd->addr_len; 1354 kioeventfd->vq = ioeventfd->vq; 1355 1356 mutex_lock(&ioreq_lock); 1357 kioreq = get_ioreq(ioeventfd, kioeventfd->eventfd); 1358 if (IS_ERR(kioreq)) { 1359 mutex_unlock(&ioreq_lock); 1360 ret = PTR_ERR(kioreq); 1361 goto error_eventfd; 1362 } 1363 1364 spin_lock_irqsave(&kioreq->lock, flags); 1365 list_add_tail(&kioeventfd->list, &kioreq->ioeventfds); 1366 spin_unlock_irqrestore(&kioreq->lock, flags); 1367 1368 mutex_unlock(&ioreq_lock); 1369 1370 return 0; 1371 1372 error_eventfd: 1373 eventfd_ctx_put(kioeventfd->eventfd); 1374 1375 error_kfree: 1376 kfree(kioeventfd); 1377 return ret; 1378 } 1379 1380 static int privcmd_ioeventfd_deassign(struct privcmd_ioeventfd *ioeventfd) 1381 { 1382 struct privcmd_kernel_ioreq *kioreq, *tkioreq; 1383 struct eventfd_ctx *eventfd; 1384 unsigned long flags; 1385 int ret = 0; 1386 1387 eventfd = eventfd_ctx_fdget(ioeventfd->event_fd); 1388 if (IS_ERR(eventfd)) 1389 return PTR_ERR(eventfd); 1390 1391 mutex_lock(&ioreq_lock); 1392 list_for_each_entry_safe(kioreq, tkioreq, &ioreq_list, list) { 1393 struct privcmd_kernel_ioeventfd *kioeventfd, *tmp; 1394 /* 1395 * kioreq fields can be accessed here without a lock as they are 1396 * never updated after being added to the ioreq_list. 1397 */ 1398 if (kioreq->dom != ioeventfd->dom || 1399 kioreq->uioreq != ioeventfd->ioreq || 1400 kioreq->vcpus != ioeventfd->vcpus) 1401 continue; 1402 1403 spin_lock_irqsave(&kioreq->lock, flags); 1404 list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) { 1405 if (eventfd == kioeventfd->eventfd) { 1406 ioeventfd_free(kioeventfd); 1407 spin_unlock_irqrestore(&kioreq->lock, flags); 1408 1409 if (list_empty(&kioreq->ioeventfds)) 1410 ioreq_free(kioreq); 1411 goto unlock; 1412 } 1413 } 1414 spin_unlock_irqrestore(&kioreq->lock, flags); 1415 break; 1416 } 1417 1418 pr_err("Ioeventfd isn't already assigned, dom: %u, addr: %llu\n", 1419 ioeventfd->dom, ioeventfd->addr); 1420 ret = -ENODEV; 1421 1422 unlock: 1423 mutex_unlock(&ioreq_lock); 1424 eventfd_ctx_put(eventfd); 1425 1426 return ret; 1427 } 1428 1429 static long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata) 1430 { 1431 struct privcmd_data *data = file->private_data; 1432 struct privcmd_ioeventfd ioeventfd; 1433 1434 if (copy_from_user(&ioeventfd, udata, sizeof(ioeventfd))) 1435 return -EFAULT; 1436 1437 /* No other flags should be set */ 1438 if (ioeventfd.flags & ~PRIVCMD_IOEVENTFD_FLAG_DEASSIGN) 1439 return -EINVAL; 1440 1441 /* If restriction is in place, check the domid matches */ 1442 if (data->domid != DOMID_INVALID && data->domid != ioeventfd.dom) 1443 return -EPERM; 1444 1445 if (ioeventfd.flags & PRIVCMD_IOEVENTFD_FLAG_DEASSIGN) 1446 return privcmd_ioeventfd_deassign(&ioeventfd); 1447 1448 return privcmd_ioeventfd_assign(&ioeventfd); 1449 } 1450 1451 static void privcmd_ioeventfd_exit(void) 1452 { 1453 struct privcmd_kernel_ioreq *kioreq, *tmp; 1454 unsigned long flags; 1455 1456 mutex_lock(&ioreq_lock); 1457 list_for_each_entry_safe(kioreq, tmp, &ioreq_list, list) { 1458 struct privcmd_kernel_ioeventfd *kioeventfd, *tmp; 1459 1460 spin_lock_irqsave(&kioreq->lock, flags); 1461 list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) 1462 ioeventfd_free(kioeventfd); 1463 spin_unlock_irqrestore(&kioreq->lock, flags); 1464 1465 ioreq_free(kioreq); 1466 } 1467 mutex_unlock(&ioreq_lock); 1468 } 1469 #else 1470 static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata) 1471 { 1472 return -EOPNOTSUPP; 1473 } 1474 1475 static inline int privcmd_irqfd_init(void) 1476 { 1477 return 0; 1478 } 1479 1480 static inline void privcmd_irqfd_exit(void) 1481 { 1482 } 1483 1484 static inline long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata) 1485 { 1486 return -EOPNOTSUPP; 1487 } 1488 1489 static inline void privcmd_ioeventfd_exit(void) 1490 { 1491 } 1492 #endif /* CONFIG_XEN_PRIVCMD_EVENTFD */ 1493 1494 static long privcmd_ioctl(struct file *file, 1495 unsigned int cmd, unsigned long data) 1496 { 1497 int ret = -ENOTTY; 1498 void __user *udata = (void __user *) data; 1499 1500 switch (cmd) { 1501 case IOCTL_PRIVCMD_HYPERCALL: 1502 ret = privcmd_ioctl_hypercall(file, udata); 1503 break; 1504 1505 case IOCTL_PRIVCMD_MMAP: 1506 ret = privcmd_ioctl_mmap(file, udata); 1507 break; 1508 1509 case IOCTL_PRIVCMD_MMAPBATCH: 1510 ret = privcmd_ioctl_mmap_batch(file, udata, 1); 1511 break; 1512 1513 case IOCTL_PRIVCMD_MMAPBATCH_V2: 1514 ret = privcmd_ioctl_mmap_batch(file, udata, 2); 1515 break; 1516 1517 case IOCTL_PRIVCMD_DM_OP: 1518 ret = privcmd_ioctl_dm_op(file, udata); 1519 break; 1520 1521 case IOCTL_PRIVCMD_RESTRICT: 1522 ret = privcmd_ioctl_restrict(file, udata); 1523 break; 1524 1525 case IOCTL_PRIVCMD_MMAP_RESOURCE: 1526 ret = privcmd_ioctl_mmap_resource(file, udata); 1527 break; 1528 1529 case IOCTL_PRIVCMD_IRQFD: 1530 ret = privcmd_ioctl_irqfd(file, udata); 1531 break; 1532 1533 case IOCTL_PRIVCMD_IOEVENTFD: 1534 ret = privcmd_ioctl_ioeventfd(file, udata); 1535 break; 1536 1537 default: 1538 break; 1539 } 1540 1541 return ret; 1542 } 1543 1544 static int privcmd_open(struct inode *ino, struct file *file) 1545 { 1546 struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL); 1547 1548 if (!data) 1549 return -ENOMEM; 1550 1551 /* DOMID_INVALID implies no restriction */ 1552 data->domid = DOMID_INVALID; 1553 1554 file->private_data = data; 1555 return 0; 1556 } 1557 1558 static int privcmd_release(struct inode *ino, struct file *file) 1559 { 1560 struct privcmd_data *data = file->private_data; 1561 1562 kfree(data); 1563 return 0; 1564 } 1565 1566 static void privcmd_close(struct vm_area_struct *vma) 1567 { 1568 struct page **pages = vma->vm_private_data; 1569 int numpgs = vma_pages(vma); 1570 int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT; 1571 int rc; 1572 1573 if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) 1574 return; 1575 1576 rc = xen_unmap_domain_gfn_range(vma, numgfns, pages); 1577 if (rc == 0) 1578 xen_free_unpopulated_pages(numpgs, pages); 1579 else 1580 pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n", 1581 numpgs, rc); 1582 kvfree(pages); 1583 } 1584 1585 static vm_fault_t privcmd_fault(struct vm_fault *vmf) 1586 { 1587 printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", 1588 vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end, 1589 vmf->pgoff, (void *)vmf->address); 1590 1591 return VM_FAULT_SIGBUS; 1592 } 1593 1594 static const struct vm_operations_struct privcmd_vm_ops = { 1595 .close = privcmd_close, 1596 .fault = privcmd_fault 1597 }; 1598 1599 static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) 1600 { 1601 /* DONTCOPY is essential for Xen because copy_page_range doesn't know 1602 * how to recreate these mappings */ 1603 vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY | 1604 VM_DONTEXPAND | VM_DONTDUMP); 1605 vma->vm_ops = &privcmd_vm_ops; 1606 vma->vm_private_data = NULL; 1607 1608 return 0; 1609 } 1610 1611 /* 1612 * For MMAPBATCH*. This allows asserting the singleshot mapping 1613 * on a per pfn/pte basis. Mapping calls that fail with ENOENT 1614 * can be then retried until success. 1615 */ 1616 static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data) 1617 { 1618 return pte_none(ptep_get(pte)) ? 0 : -EBUSY; 1619 } 1620 1621 static int privcmd_vma_range_is_mapped( 1622 struct vm_area_struct *vma, 1623 unsigned long addr, 1624 unsigned long nr_pages) 1625 { 1626 return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT, 1627 is_mapped_fn, NULL) != 0; 1628 } 1629 1630 const struct file_operations xen_privcmd_fops = { 1631 .owner = THIS_MODULE, 1632 .unlocked_ioctl = privcmd_ioctl, 1633 .open = privcmd_open, 1634 .release = privcmd_release, 1635 .mmap = privcmd_mmap, 1636 }; 1637 EXPORT_SYMBOL_GPL(xen_privcmd_fops); 1638 1639 static struct miscdevice privcmd_dev = { 1640 .minor = MISC_DYNAMIC_MINOR, 1641 .name = "xen/privcmd", 1642 .fops = &xen_privcmd_fops, 1643 }; 1644 1645 static int __init privcmd_init(void) 1646 { 1647 int err; 1648 1649 if (!xen_domain()) 1650 return -ENODEV; 1651 1652 err = misc_register(&privcmd_dev); 1653 if (err != 0) { 1654 pr_err("Could not register Xen privcmd device\n"); 1655 return err; 1656 } 1657 1658 err = misc_register(&xen_privcmdbuf_dev); 1659 if (err != 0) { 1660 pr_err("Could not register Xen hypercall-buf device\n"); 1661 goto err_privcmdbuf; 1662 } 1663 1664 err = privcmd_irqfd_init(); 1665 if (err != 0) { 1666 pr_err("irqfd init failed\n"); 1667 goto err_irqfd; 1668 } 1669 1670 return 0; 1671 1672 err_irqfd: 1673 misc_deregister(&xen_privcmdbuf_dev); 1674 err_privcmdbuf: 1675 misc_deregister(&privcmd_dev); 1676 return err; 1677 } 1678 1679 static void __exit privcmd_exit(void) 1680 { 1681 privcmd_ioeventfd_exit(); 1682 privcmd_irqfd_exit(); 1683 misc_deregister(&privcmd_dev); 1684 misc_deregister(&xen_privcmdbuf_dev); 1685 } 1686 1687 module_init(privcmd_init); 1688 module_exit(privcmd_exit); 1689