/* Copyright (C) 2009 Red Hat, Inc.
 * Copyright (C) 2006 Rusty Russell IBM Corporation
 *
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * Inspiration, some code, and most witty comments come from
 * Documentation/virtual/lguest/lguest.c, by Rusty Russell
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 *
 * Generic code for virtio server in host kernel.
 */

#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/uio.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sort.h>

#include "vhost.h"

static ushort max_mem_regions = 64;
module_param(max_mem_regions, ushort, 0444);
MODULE_PARM_DESC(max_mem_regions,
	"Maximum number of memory regions in memory map. (default: 64)");

enum {
	VHOST_MEMORY_F_LOG = 0x1,
};

#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])

#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq)
{
	vq->user_be = !virtio_legacy_is_little_endian();
}

static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
{
	struct vhost_vring_state s;

	if (vq->private_data)
		return -EBUSY;

	if (copy_from_user(&s, argp, sizeof(s)))
		return -EFAULT;

	if (s.num != VHOST_VRING_LITTLE_ENDIAN &&
	    s.num != VHOST_VRING_BIG_ENDIAN)
		return -EINVAL;

	vq->user_be = s.num;

	return 0;
}

static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
				   int __user *argp)
{
	struct vhost_vring_state s = {
		.index = idx,
		.num = vq->user_be
	};

	if (copy_to_user(argp, &s, sizeof(s)))
		return -EFAULT;

	return 0;
}

static void vhost_init_is_le(struct vhost_virtqueue *vq)
{
	/* Note for legacy virtio: user_be is initialized at reset time
	 * according to the host endianness. If userspace does not set an
	 * explicit endianness, the default behavior is native endian, as
	 * expected by legacy virtio.
	 */
	vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;
}
#else
static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq)
{
}

static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
{
	return -ENOIOCTLCMD;
}

static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
				   int __user *argp)
{
	return -ENOIOCTLCMD;
}

static void vhost_init_is_le(struct vhost_virtqueue *vq)
{
	if (vhost_has_feature(vq, VIRTIO_F_VERSION_1))
		vq->is_le = true;
}
#endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */

static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
			    poll_table *pt)
{
	struct vhost_poll *poll;

	poll = container_of(pt, struct vhost_poll, table);
	poll->wqh = wqh;
	add_wait_queue(wqh, &poll->wait);
}

static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
			     void *key)
{
	struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

	if (!((unsigned long)key & poll->mask))
		return 0;

	vhost_poll_queue(poll);
	return 0;
}

void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
{
	INIT_LIST_HEAD(&work->node);
	work->fn = fn;
	init_waitqueue_head(&work->done);
	work->flushing = 0;
	work->queue_seq = work->done_seq = 0;
}
EXPORT_SYMBOL_GPL(vhost_work_init);

/* Init poll structure */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
		     unsigned long mask, struct vhost_dev *dev)
{
	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
	init_poll_funcptr(&poll->table, vhost_poll_func);
	poll->mask = mask;
	poll->dev = dev;
	poll->wqh = NULL;

	vhost_work_init(&poll->work, fn);
}
EXPORT_SYMBOL_GPL(vhost_poll_init);

/* Start polling a file. We add ourselves to file's wait queue. The caller must
 * keep a reference to a file until after vhost_poll_stop is called. */
int vhost_poll_start(struct vhost_poll *poll, struct file *file)
{
	unsigned long mask;
	int ret = 0;

	if (poll->wqh)
		return 0;

	mask = file->f_op->poll(file, &poll->table);
	if (mask)
		vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask);
	if (mask & POLLERR) {
		if (poll->wqh)
			remove_wait_queue(poll->wqh, &poll->wait);
		ret = -EINVAL;
	}

	return ret;
}
EXPORT_SYMBOL_GPL(vhost_poll_start);

/* Stop polling a file. After this function returns, it becomes safe to drop the
 * file reference. You must also flush afterwards. */
void vhost_poll_stop(struct vhost_poll *poll)
{
	if (poll->wqh) {
		remove_wait_queue(poll->wqh, &poll->wait);
		poll->wqh = NULL;
	}
}
EXPORT_SYMBOL_GPL(vhost_poll_stop);

static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
				unsigned seq)
{
	int left;

	spin_lock_irq(&dev->work_lock);
	left = seq - work->done_seq;
	spin_unlock_irq(&dev->work_lock);
	return left <= 0;
}

void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
{
	unsigned seq;
	int flushing;

	spin_lock_irq(&dev->work_lock);
	seq = work->queue_seq;
	work->flushing++;
	spin_unlock_irq(&dev->work_lock);
	wait_event(work->done, vhost_work_seq_done(dev, work, seq));
	spin_lock_irq(&dev->work_lock);
	flushing = --work->flushing;
	spin_unlock_irq(&dev->work_lock);
	BUG_ON(flushing < 0);
}
EXPORT_SYMBOL_GPL(vhost_work_flush);

/* Flush any work that has been scheduled. When calling this, don't hold any
 * locks that are also used by the callback. */
void vhost_poll_flush(struct vhost_poll *poll)
{
	vhost_work_flush(poll->dev, &poll->work);
}
EXPORT_SYMBOL_GPL(vhost_poll_flush);

void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
{
	unsigned long flags;

	spin_lock_irqsave(&dev->work_lock, flags);
	if (list_empty(&work->node)) {
		list_add_tail(&work->node, &dev->work_list);
		work->queue_seq++;
		spin_unlock_irqrestore(&dev->work_lock, flags);
		wake_up_process(dev->worker);
	} else {
		spin_unlock_irqrestore(&dev->work_lock, flags);
	}
}
EXPORT_SYMBOL_GPL(vhost_work_queue);

void vhost_poll_queue(struct vhost_poll *poll)
{
	vhost_work_queue(poll->dev, &poll->work);
}
EXPORT_SYMBOL_GPL(vhost_poll_queue);

static void vhost_vq_reset(struct vhost_dev *dev,
			   struct vhost_virtqueue *vq)
{
	vq->num = 1;
	vq->desc = NULL;
	vq->avail = NULL;
	vq->used = NULL;
	vq->last_avail_idx = 0;
	vq->avail_idx = 0;
	vq->last_used_idx = 0;
	vq->signalled_used = 0;
	vq->signalled_used_valid = false;
	vq->used_flags = 0;
	vq->log_used = false;
	vq->log_addr = -1ull;
	vq->private_data = NULL;
	vq->acked_features = 0;
	vq->log_base = NULL;
	vq->error_ctx = NULL;
	vq->error = NULL;
	vq->kick = NULL;
	vq->call_ctx = NULL;
	vq->call = NULL;
	vq->log_ctx = NULL;
	vq->memory = NULL;
	vq->is_le = virtio_legacy_is_little_endian();
	vhost_vq_reset_user_be(vq);
}

static int vhost_worker(void *data)
{
	struct vhost_dev *dev = data;
	struct vhost_work *work = NULL;
	unsigned uninitialized_var(seq);
	mm_segment_t oldfs = get_fs();

	set_fs(USER_DS);
	use_mm(dev->mm);

	for (;;) {
		/* mb paired w/ kthread_stop */
		set_current_state(TASK_INTERRUPTIBLE);

		spin_lock_irq(&dev->work_lock);
		if (work) {
			work->done_seq = seq;
			if (work->flushing)
				wake_up_all(&work->done);
		}

		if (kthread_should_stop()) {
			spin_unlock_irq(&dev->work_lock);
			__set_current_state(TASK_RUNNING);
			break;
		}
		if (!list_empty(&dev->work_list)) {
			work = list_first_entry(&dev->work_list,
						struct vhost_work, node);
			list_del_init(&work->node);
			seq = work->queue_seq;
		} else
			work = NULL;
		spin_unlock_irq(&dev->work_lock);

		if (work) {
			__set_current_state(TASK_RUNNING);
			work->fn(work);
			if (need_resched())
				schedule();
		} else
			schedule();

	}
	unuse_mm(dev->mm);
	set_fs(oldfs);
	return 0;
}

static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
{
	kfree(vq->indirect);
	vq->indirect = NULL;
	kfree(vq->log);
	vq->log = NULL;
	kfree(vq->heads);
	vq->heads = NULL;
}

/* Helper to allocate iovec buffers for all vqs. */
static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
{
	struct vhost_virtqueue *vq;
	int i;

	for (i = 0; i < dev->nvqs; ++i) {
		vq = dev->vqs[i];
		vq->indirect = kmalloc(sizeof *vq->indirect * UIO_MAXIOV,
				       GFP_KERNEL);
		vq->log = kmalloc(sizeof *vq->log * UIO_MAXIOV, GFP_KERNEL);
		vq->heads = kmalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
		if (!vq->indirect || !vq->log || !vq->heads)
			goto err_nomem;
	}
	return 0;

err_nomem:
	for (; i >= 0; --i)
		vhost_vq_free_iovecs(dev->vqs[i]);
	return -ENOMEM;
}

static void vhost_dev_free_iovecs(struct vhost_dev *dev)
{
	int i;

	for (i = 0; i < dev->nvqs; ++i)
		vhost_vq_free_iovecs(dev->vqs[i]);
}

void vhost_dev_init(struct vhost_dev *dev,
		    struct vhost_virtqueue **vqs, int nvqs)
{
	struct vhost_virtqueue *vq;
	int i;

	dev->vqs = vqs;
	dev->nvqs = nvqs;
	mutex_init(&dev->mutex);
	dev->log_ctx = NULL;
	dev->log_file = NULL;
	dev->memory = NULL;
	dev->mm = NULL;
	spin_lock_init(&dev->work_lock);
	INIT_LIST_HEAD(&dev->work_list);
	dev->worker = NULL;

	for (i = 0; i < dev->nvqs; ++i) {
		vq = dev->vqs[i];
		vq->log = NULL;
		vq->indirect = NULL;
		vq->heads = NULL;
		vq->dev = dev;
		mutex_init(&vq->mutex);
		vhost_vq_reset(dev, vq);
		if (vq->handle_kick)
			vhost_poll_init(&vq->poll, vq->handle_kick,
					POLLIN, dev);
	}
}
EXPORT_SYMBOL_GPL(vhost_dev_init);

/* Caller should have device mutex */
long vhost_dev_check_owner(struct vhost_dev *dev)
{
	/* Are you the owner? If not, I don't think you mean to do that */
	return dev->mm == current->mm ? 0 : -EPERM;
}
EXPORT_SYMBOL_GPL(vhost_dev_check_owner);

struct vhost_attach_cgroups_struct {
	struct vhost_work work;
	struct task_struct *owner;
	int ret;
};

static void vhost_attach_cgroups_work(struct vhost_work *work)
{
	struct vhost_attach_cgroups_struct *s;

	s = container_of(work, struct vhost_attach_cgroups_struct, work);
	s->ret = cgroup_attach_task_all(s->owner, current);
}

static int vhost_attach_cgroups(struct vhost_dev *dev)
{
	struct vhost_attach_cgroups_struct attach;

	attach.owner = current;
	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
	vhost_work_queue(dev, &attach.work);
	vhost_work_flush(dev, &attach.work);
	return attach.ret;
}

/* Caller should have device mutex */
bool vhost_dev_has_owner(struct vhost_dev *dev)
{
	return dev->mm;
}
EXPORT_SYMBOL_GPL(vhost_dev_has_owner);

/* Caller should have device mutex */
long vhost_dev_set_owner(struct vhost_dev *dev)
{
	struct task_struct *worker;
	int err;

	/* Is there an owner already? */
	if (vhost_dev_has_owner(dev)) {
		err = -EBUSY;
		goto err_mm;
	}

	/* No owner, become one */
	dev->mm = get_task_mm(current);
	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
	if (IS_ERR(worker)) {
		err = PTR_ERR(worker);
		goto err_worker;
	}

	dev->worker = worker;
	wake_up_process(worker);	/* avoid contributing to loadavg */

	err = vhost_attach_cgroups(dev);
	if (err)
		goto err_cgroup;

	err = vhost_dev_alloc_iovecs(dev);
	if (err)
		goto err_cgroup;

	return 0;
err_cgroup:
	kthread_stop(worker);
	dev->worker = NULL;
err_worker:
	if (dev->mm)
		mmput(dev->mm);
	dev->mm = NULL;
err_mm:
	return err;
}
EXPORT_SYMBOL_GPL(vhost_dev_set_owner);

struct vhost_memory *vhost_dev_reset_owner_prepare(void)
{
	return kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);

/* Caller should have device mutex */
void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_memory *memory)
{
	int i;

	vhost_dev_cleanup(dev, true);

	/* Restore memory to default empty mapping. */
	memory->nregions = 0;
	dev->memory = memory;
	/* We don't need VQ locks below since vhost_dev_cleanup makes sure
	 * VQs aren't running.
	 */
	for (i = 0; i < dev->nvqs; ++i)
		dev->vqs[i]->memory = memory;
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);

void vhost_dev_stop(struct vhost_dev *dev)
{
	int i;

	for (i = 0; i < dev->nvqs; ++i) {
		if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) {
			vhost_poll_stop(&dev->vqs[i]->poll);
			vhost_poll_flush(&dev->vqs[i]->poll);
		}
	}
}
EXPORT_SYMBOL_GPL(vhost_dev_stop);

/* Caller should have device mutex if and only if locked is set */
void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
{
	int i;

	for (i = 0; i < dev->nvqs; ++i) {
		if (dev->vqs[i]->error_ctx)
			eventfd_ctx_put(dev->vqs[i]->error_ctx);
		if (dev->vqs[i]->error)
			fput(dev->vqs[i]->error);
		if (dev->vqs[i]->kick)
			fput(dev->vqs[i]->kick);
		if (dev->vqs[i]->call_ctx)
			eventfd_ctx_put(dev->vqs[i]->call_ctx);
		if (dev->vqs[i]->call)
			fput(dev->vqs[i]->call);
		vhost_vq_reset(dev, dev->vqs[i]);
	}
	vhost_dev_free_iovecs(dev);
	if (dev->log_ctx)
		eventfd_ctx_put(dev->log_ctx);
	dev->log_ctx = NULL;
	if (dev->log_file)
		fput(dev->log_file);
	dev->log_file = NULL;
	/* No one will access memory at this point */
	kvfree(dev->memory);
	dev->memory = NULL;
	WARN_ON(!list_empty(&dev->work_list));
	if (dev->worker) {
		kthread_stop(dev->worker);
		dev->worker = NULL;
	}
	if (dev->mm)
		mmput(dev->mm);
	dev->mm = NULL;
}
EXPORT_SYMBOL_GPL(vhost_dev_cleanup);

static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
{
	u64 a = addr / VHOST_PAGE_SIZE / 8;

	/* Make sure 64 bit math will not overflow. */
	if (a > ULONG_MAX - (unsigned long)log_base ||
	    a + (unsigned long)log_base > ULONG_MAX)
		return 0;

	return access_ok(VERIFY_WRITE, log_base + a,
			 (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
}

/* Caller should have vq mutex and device mutex. */
static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
			       int log_all)
{
	int i;

	if (!mem)
		return 0;

	for (i = 0; i < mem->nregions; ++i) {
		struct vhost_memory_region *m = mem->regions + i;
		unsigned long a = m->userspace_addr;
		if (m->memory_size > ULONG_MAX)
			return 0;
		else if (!access_ok(VERIFY_WRITE, (void __user *)a,
				    m->memory_size))
			return 0;
		else if (log_all && !log_access_ok(log_base,
						   m->guest_phys_addr,
						   m->memory_size))
			return 0;
	}
	return 1;
}

/* Can we switch to this memory table? */
/* Caller should have device mutex but not vq mutex */
static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
			    int log_all)
{
	int i;

	for (i = 0; i < d->nvqs; ++i) {
		int ok;
		bool log;

		mutex_lock(&d->vqs[i]->mutex);
		log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
		/* If ring is inactive, will check when it's enabled. */
		if (d->vqs[i]->private_data)
			ok = vq_memory_access_ok(d->vqs[i]->log_base, mem, log);
		else
			ok = 1;
		mutex_unlock(&d->vqs[i]->mutex);
		if (!ok)
			return 0;
	}
	return 1;
}

static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
			struct vring_desc __user *desc,
			struct vring_avail __user *avail,
			struct vring_used __user *used)
{
	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
	return access_ok(VERIFY_READ, desc, num * sizeof *desc) &&
	       access_ok(VERIFY_READ, avail,
			 sizeof *avail + num * sizeof *avail->ring + s) &&
	       access_ok(VERIFY_WRITE, used,
			 sizeof *used + num * sizeof *used->ring + s);
}

/* Can we log writes? */
/* Caller should have device mutex but not vq mutex */
int vhost_log_access_ok(struct vhost_dev *dev)
{
	return memory_access_ok(dev, dev->memory, 1);
}
EXPORT_SYMBOL_GPL(vhost_log_access_ok);

/* Verify access for write logging. */
/* Caller should have vq mutex and device mutex */
static int vq_log_access_ok(struct vhost_virtqueue *vq,
			    void __user *log_base)
{
	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;

	return vq_memory_access_ok(log_base, vq->memory,
				   vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
		(!vq->log_used || log_access_ok(log_base, vq->log_addr,
					sizeof *vq->used +
					vq->num * sizeof *vq->used->ring + s));
}

/* Can we start vq? */
/* Caller should have vq mutex and device mutex */
int vhost_vq_access_ok(struct vhost_virtqueue *vq)
{
	return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used) &&
		vq_log_access_ok(vq, vq->log_base);
}
EXPORT_SYMBOL_GPL(vhost_vq_access_ok);

static int vhost_memory_reg_sort_cmp(const void *p1, const void *p2)
{
	const struct vhost_memory_region *r1 = p1, *r2 = p2;
	if (r1->guest_phys_addr < r2->guest_phys_addr)
		return 1;
	if (r1->guest_phys_addr > r2->guest_phys_addr)
		return -1;
	return 0;
}

static void *vhost_kvzalloc(unsigned long size)
{
	void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);

	if (!n) {
		n = vzalloc(size);
		if (!n)
			return ERR_PTR(-ENOMEM);
	}
	return n;
}

static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
{
	struct vhost_memory mem, *newmem, *oldmem;
	unsigned long size = offsetof(struct vhost_memory, regions);
	int i;

	if (copy_from_user(&mem, m, size))
		return -EFAULT;
	if (mem.padding)
		return -EOPNOTSUPP;
	if (mem.nregions > max_mem_regions)
		return -E2BIG;
	newmem = vhost_kvzalloc(size + mem.nregions * sizeof(*m->regions));
	if (!newmem)
		return -ENOMEM;

	memcpy(newmem, &mem, size);
	if (copy_from_user(newmem->regions, m->regions,
			   mem.nregions * sizeof *m->regions)) {
		kvfree(newmem);
		return -EFAULT;
	}
	sort(newmem->regions, newmem->nregions, sizeof(*newmem->regions),
		vhost_memory_reg_sort_cmp, NULL);

	if (!memory_access_ok(d, newmem, 0)) {
		kvfree(newmem);
		return -EFAULT;
	}
	oldmem = d->memory;
	d->memory = newmem;

	/* All memory accesses are done under some VQ mutex. */
	for (i = 0; i < d->nvqs; ++i) {
		mutex_lock(&d->vqs[i]->mutex);
		d->vqs[i]->memory = newmem;
		mutex_unlock(&d->vqs[i]->mutex);
	}
	kvfree(oldmem);
	return 0;
}

long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
{
	struct file *eventfp, *filep = NULL;
	bool pollstart = false, pollstop = false;
	struct eventfd_ctx *ctx = NULL;
	u32 __user *idxp = argp;
	struct vhost_virtqueue *vq;
	struct vhost_vring_state s;
	struct vhost_vring_file f;
	struct vhost_vring_addr a;
	u32 idx;
	long r;

	r = get_user(idx, idxp);
	if (r < 0)
		return r;
	if (idx >= d->nvqs)
		return -ENOBUFS;

	vq = d->vqs[idx];

	mutex_lock(&vq->mutex);

	switch (ioctl) {
	case VHOST_SET_VRING_NUM:
		/* Resizing ring with an active backend?
		 * You don't want to do that. */
		if (vq->private_data) {
			r = -EBUSY;
			break;
		}
		if (copy_from_user(&s, argp, sizeof s)) {
			r = -EFAULT;
			break;
		}
		if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) {
			r = -EINVAL;
			break;
		}
		vq->num = s.num;
		break;
	case VHOST_SET_VRING_BASE:
		/* Moving base with an active backend?
		 * You don't want to do that. */
		if (vq->private_data) {
			r = -EBUSY;
			break;
		}
		if (copy_from_user(&s, argp, sizeof s)) {
			r = -EFAULT;
			break;
		}
		if (s.num > 0xffff) {
			r = -EINVAL;
			break;
		}
		vq->last_avail_idx = s.num;
		/* Forget the cached index value. */
		vq->avail_idx = vq->last_avail_idx;
		break;
	case VHOST_GET_VRING_BASE:
		s.index = idx;
		s.num = vq->last_avail_idx;
		if (copy_to_user(argp, &s, sizeof s))
			r = -EFAULT;
		break;
	case VHOST_SET_VRING_ADDR:
		if (copy_from_user(&a, argp, sizeof a)) {
			r = -EFAULT;
			break;
		}
		if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) {
			r = -EOPNOTSUPP;
			break;
		}
		/* For 32bit, verify that the top 32bits of the user
		   data are set to zero. */
		if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
		    (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
		    (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) {
			r = -EFAULT;
			break;
		}

		/* Make sure it's safe to cast pointers to vring types. */
		BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
		BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
		if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
		    (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
		    (a.log_guest_addr & (sizeof(u64) - 1))) {
			r = -EINVAL;
			break;
		}

		/* We only verify access here if backend is configured.
		 * If it is not, we don't as size might not have been setup.
		 * We will verify when backend is configured. */
		if (vq->private_data) {
			if (!vq_access_ok(vq, vq->num,
				(void __user *)(unsigned long)a.desc_user_addr,
				(void __user *)(unsigned long)a.avail_user_addr,
				(void __user *)(unsigned long)a.used_user_addr)) {
				r = -EINVAL;
				break;
			}

			/* Also validate log access for used ring if enabled. */
			if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&
			    !log_access_ok(vq->log_base, a.log_guest_addr,
					   sizeof *vq->used +
					   vq->num * sizeof *vq->used->ring)) {
				r = -EINVAL;
				break;
			}
		}

		vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
		vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
		vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
		vq->log_addr = a.log_guest_addr;
		vq->used = (void __user *)(unsigned long)a.used_user_addr;
		break;
	case VHOST_SET_VRING_KICK:
		if (copy_from_user(&f, argp, sizeof f)) {
			r = -EFAULT;
			break;
		}
		eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
		if (IS_ERR(eventfp)) {
			r = PTR_ERR(eventfp);
			break;
		}
		if (eventfp != vq->kick) {
			pollstop = (filep = vq->kick) != NULL;
			pollstart = (vq->kick = eventfp) != NULL;
		} else
			filep = eventfp;
		break;
	case VHOST_SET_VRING_CALL:
		if (copy_from_user(&f, argp, sizeof f)) {
			r = -EFAULT;
			break;
		}
		eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
		if (IS_ERR(eventfp)) {
			r = PTR_ERR(eventfp);
			break;
		}
		if (eventfp != vq->call) {
			filep = vq->call;
			ctx = vq->call_ctx;
			vq->call = eventfp;
			vq->call_ctx = eventfp ?
				eventfd_ctx_fileget(eventfp) : NULL;
		} else
			filep = eventfp;
		break;
	case VHOST_SET_VRING_ERR:
		if (copy_from_user(&f, argp, sizeof f)) {
			r = -EFAULT;
			break;
		}
		eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
		if (IS_ERR(eventfp)) {
			r = PTR_ERR(eventfp);
			break;
		}
		if (eventfp != vq->error) {
			filep = vq->error;
			vq->error = eventfp;
			ctx = vq->error_ctx;
			vq->error_ctx = eventfp ?
				eventfd_ctx_fileget(eventfp) : NULL;
		} else
			filep = eventfp;
		break;
	case VHOST_SET_VRING_ENDIAN:
		r = vhost_set_vring_endian(vq, argp);
		break;
	case VHOST_GET_VRING_ENDIAN:
		r = vhost_get_vring_endian(vq, idx, argp);
		break;
	default:
		r = -ENOIOCTLCMD;
	}

	if (pollstop && vq->handle_kick)
		vhost_poll_stop(&vq->poll);

	if (ctx)
		eventfd_ctx_put(ctx);
	if (filep)
		fput(filep);

	if (pollstart && vq->handle_kick)
		r = vhost_poll_start(&vq->poll, vq->kick);

	mutex_unlock(&vq->mutex);

	if (pollstop && vq->handle_kick)
		vhost_poll_flush(&vq->poll);
	return r;
}
EXPORT_SYMBOL_GPL(vhost_vring_ioctl);

/* Caller must have device mutex */
long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
	struct file *eventfp, *filep = NULL;
	struct eventfd_ctx *ctx = NULL;
	u64 p;
	long r;
	int i, fd;

	/* If you are not the owner, you can become one */
	if (ioctl == VHOST_SET_OWNER) {
		r = vhost_dev_set_owner(d);
		goto done;
	}

	/* You must be the owner to do anything else */
	r = vhost_dev_check_owner(d);
	if (r)
		goto done;

	switch (ioctl) {
	case VHOST_SET_MEM_TABLE:
		r = vhost_set_memory(d, argp);
		break;
	case VHOST_SET_LOG_BASE:
		if (copy_from_user(&p, argp, sizeof p)) {
			r = -EFAULT;
			break;
		}
		if ((u64)(unsigned long)p != p) {
			r = -EFAULT;
			break;
		}
		for (i = 0; i < d->nvqs; ++i) {
			struct vhost_virtqueue *vq;
			void __user *base = (void __user *)(unsigned long)p;
			vq = d->vqs[i];
			mutex_lock(&vq->mutex);
			/* If ring is inactive, will check when it's enabled. */
			if (vq->private_data && !vq_log_access_ok(vq, base))
				r = -EFAULT;
			else
				vq->log_base = base;
			mutex_unlock(&vq->mutex);
		}
		break;
	case VHOST_SET_LOG_FD:
		r = get_user(fd, (int __user *)argp);
		if (r < 0)
			break;
		eventfp = fd == -1 ? NULL : eventfd_fget(fd);
		if (IS_ERR(eventfp)) {
			r = PTR_ERR(eventfp);
			break;
		}
		if (eventfp != d->log_file) {
			filep = d->log_file;
			d->log_file = eventfp;
			ctx = d->log_ctx;
			d->log_ctx = eventfp ?
				eventfd_ctx_fileget(eventfp) : NULL;
		} else
			filep = eventfp;
		for (i = 0; i < d->nvqs; ++i) {
			mutex_lock(&d->vqs[i]->mutex);
			d->vqs[i]->log_ctx = d->log_ctx;
			mutex_unlock(&d->vqs[i]->mutex);
		}
		if (ctx)
			eventfd_ctx_put(ctx);
		if (filep)
			fput(filep);
		break;
	default:
		r = -ENOIOCTLCMD;
		break;
	}
done:
	return r;
}
EXPORT_SYMBOL_GPL(vhost_dev_ioctl);

static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
						     __u64 addr, __u32 len)
{
	const struct vhost_memory_region *reg;
	int start = 0, end = mem->nregions;

	while (start < end) {
		int slot = start + (end - start) / 2;
		reg = mem->regions + slot;
		if (addr >= reg->guest_phys_addr)
			end = slot;
		else
			start = slot + 1;
	}

	reg = mem->regions + start;
	if (addr >= reg->guest_phys_addr &&
		reg->guest_phys_addr + reg->memory_size > addr)
		return reg;
	return NULL;
}

/* TODO: This is really inefficient.  We need something like get_user()
 * (instruction directly accesses the data, with an exception table entry
 * returning -EFAULT). See Documentation/x86/exception-tables.txt.
 */
static int set_bit_to_user(int nr, void __user *addr)
{
	unsigned long log = (unsigned long)addr;
	struct page *page;
	void *base;
	int bit = nr + (log % PAGE_SIZE) * 8;
	int r;

	r = get_user_pages_fast(log, 1, 1, &page);
	if (r < 0)
		return r;
	BUG_ON(r != 1);
	base = kmap_atomic(page);
	set_bit(bit, base);
	kunmap_atomic(base);
	set_page_dirty_lock(page);
	put_page(page);
	return 0;
}

static int log_write(void __user *log_base,
		     u64 write_address, u64 write_length)
{
	u64 write_page = write_address / VHOST_PAGE_SIZE;
	int r;

	if (!write_length)
		return 0;
	write_length += write_address % VHOST_PAGE_SIZE;
	for (;;) {
		u64 base = (u64)(unsigned long)log_base;
		u64 log = base + write_page / 8;
		int bit = write_page % 8;
		if ((u64)(unsigned long)log != log)
			return -EFAULT;
		r = set_bit_to_user(bit, (void __user *)(unsigned long)log);
		if (r < 0)
			return r;
		if (write_length <= VHOST_PAGE_SIZE)
			break;
		write_length -= VHOST_PAGE_SIZE;
		write_page += 1;
	}
	return r;
}

int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
		    unsigned int log_num, u64 len)
{
	int i, r;

	/* Make sure data written is seen before log. */
	smp_wmb();
	for (i = 0; i < log_num; ++i) {
		u64 l = min(log[i].len, len);
		r = log_write(vq->log_base, log[i].addr, l);
		if (r < 0)
			return r;
		len -= l;
		if (!len) {
			if (vq->log_ctx)
				eventfd_signal(vq->log_ctx, 1);
			return 0;
		}
	}
	/* Length written exceeds what we have stored. This is a bug. */
	BUG();
	return 0;
}
EXPORT_SYMBOL_GPL(vhost_log_write);

static int vhost_update_used_flags(struct vhost_virtqueue *vq)
{
	void __user *used;
	if (__put_user(cpu_to_vhost16(vq, vq->used_flags), &vq->used->flags) < 0)
		return -EFAULT;
	if (unlikely(vq->log_used)) {
		/* Make sure the flag is seen before log. */
		smp_wmb();
		/* Log used flag write. */
		used = &vq->used->flags;
		log_write(vq->log_base, vq->log_addr +
			  (used - (void __user *)vq->used),
			  sizeof vq->used->flags);
		if (vq->log_ctx)
			eventfd_signal(vq->log_ctx, 1);
	}
	return 0;
}

static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
{
	if (__put_user(cpu_to_vhost16(vq, vq->avail_idx), vhost_avail_event(vq)))
		return -EFAULT;
	if (unlikely(vq->log_used)) {
		void __user *used;
		/* Make sure the event is seen before log. */
		smp_wmb();
		/* Log avail event write */
		used = vhost_avail_event(vq);
		log_write(vq->log_base, vq->log_addr +
			  (used - (void __user *)vq->used),
			  sizeof *vhost_avail_event(vq));
		if (vq->log_ctx)
			eventfd_signal(vq->log_ctx, 1);
	}
	return 0;
}

int vhost_init_used(struct vhost_virtqueue *vq)
{
	__virtio16 last_used_idx;
	int r;
	if (!vq->private_data) {
		vq->is_le = virtio_legacy_is_little_endian();
		return 0;
	}

	vhost_init_is_le(vq);

	r = vhost_update_used_flags(vq);
	if (r)
		return r;
	vq->signalled_used_valid = false;
	if (!access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx))
		return -EFAULT;
	r = __get_user(last_used_idx, &vq->used->idx);
	if (r)
		return r;
	vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
	return 0;
}
EXPORT_SYMBOL_GPL(vhost_init_used);

static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
			  struct iovec iov[], int iov_size)
{
	const struct vhost_memory_region *reg;
	struct vhost_memory *mem;
	struct iovec *_iov;
	u64 s = 0;
	int ret = 0;

	mem = vq->memory;
	while ((u64)len > s) {
		u64 size;
		if (unlikely(ret >= iov_size)) {
			ret = -ENOBUFS;
			break;
		}
		reg = find_region(mem, addr, len);
		if (unlikely(!reg)) {
			ret = -EFAULT;
			break;
		}
		_iov = iov + ret;
		size = reg->memory_size - addr + reg->guest_phys_addr;
		_iov->iov_len = min((u64)len - s, size);
		_iov->iov_base = (void __user *)(unsigned long)
			(reg->userspace_addr + addr - reg->guest_phys_addr);
		s += size;
		addr += size;
		++ret;
	}

	return ret;
}

/* Each buffer in the virtqueues is actually a chain of descriptors.  This
 * function returns the next descriptor in the chain,
 * or -1U if we're at the end. */
static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
{
	unsigned int next;

	/* If this descriptor says it doesn't chain, we're done. */
	if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT)))
		return -1U;

	/* Check they're not leading us off end of descriptors. */
	next = vhost16_to_cpu(vq, desc->next);
	/* Make sure compiler knows to grab that: we don't want it changing! */
	/* We will use the result as an index in an array, so most
	 * architectures only need a compiler barrier here. */
	read_barrier_depends();

	return next;
}

static int get_indirect(struct vhost_virtqueue *vq,
			struct iovec iov[], unsigned int iov_size,
			unsigned int *out_num, unsigned int *in_num,
			struct vhost_log *log, unsigned int *log_num,
			struct vring_desc *indirect)
{
	struct vring_desc desc;
	unsigned int i = 0, count, found = 0;
	u32 len = vhost32_to_cpu(vq, indirect->len);
	struct iov_iter from;
	int ret;

	/* Sanity check */
	if (unlikely(len % sizeof desc)) {
		vq_err(vq, "Invalid length in indirect descriptor: "
		       "len 0x%llx not multiple of 0x%zx\n",
		       (unsigned long long)len,
		       sizeof desc);
		return -EINVAL;
	}

	ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
			     UIO_MAXIOV);
	if (unlikely(ret < 0)) {
		vq_err(vq, "Translation failure %d in indirect.\n", ret);
		return ret;
	}
	iov_iter_init(&from, READ, vq->indirect, ret, len);

	/* We will use the result as an address to read from, so most
	 * architectures only need a compiler barrier here. */
	read_barrier_depends();

	count = len / sizeof desc;
	/* Buffers are chained via a 16 bit next field, so
	 * we can have at most 2^16 of these. */
	if (unlikely(count > USHRT_MAX + 1)) {
		vq_err(vq, "Indirect buffer length too big: %d\n",
		       indirect->len);
		return -E2BIG;
	}

	do {
		unsigned iov_count = *in_num + *out_num;
		if (unlikely(++found > count)) {
			vq_err(vq, "Loop detected: last one at %u "
			       "indirect size %u\n",
			       i, count);
			return -EINVAL;
		}
		if (unlikely(copy_from_iter(&desc, sizeof(desc), &from) !=
			     sizeof(desc))) {
			vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
			       i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
			return -EINVAL;
		}
		if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
			vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
			       i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
			return -EINVAL;
		}

		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
				     vhost32_to_cpu(vq, desc.len), iov + iov_count,
				     iov_size - iov_count);
		if (unlikely(ret < 0)) {
			vq_err(vq, "Translation failure %d indirect idx %d\n",
			       ret, i);
			return ret;
		}
		/* If this is an input descriptor, increment that count. */
		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) {
			*in_num += ret;
			if (unlikely(log)) {
				log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
				log[*log_num].len = vhost32_to_cpu(vq, desc.len);
				++*log_num;
			}
		} else {
			/* If it's an output descriptor, they're all supposed
			 * to come before any input descriptors. */
			if (unlikely(*in_num)) {
				vq_err(vq, "Indirect descriptor "
				       "has out after in: idx %d\n", i);
				return -EINVAL;
			}
			*out_num += ret;
		}
	} while ((i = next_desc(vq, &desc)) != -1);
	return 0;
}

/* This looks in the virtqueue for the first available buffer, and converts
 * it to an iovec for convenient access.  Since descriptors consist of some
 * number of output then some number of input descriptors, it's actually two
 * iovecs, but we pack them into one and note how many of each there were.
 *
 * This function returns the descriptor number found, or vq->num (which is
 * never a valid descriptor number) if none was found.  A negative code is
 * returned on error. */
int vhost_get_vq_desc(struct vhost_virtqueue *vq,
		      struct iovec iov[], unsigned int iov_size,
		      unsigned int *out_num, unsigned int *in_num,
		      struct vhost_log *log, unsigned int *log_num)
{
	struct vring_desc desc;
	unsigned int i, head, found = 0;
	u16 last_avail_idx;
	__virtio16 avail_idx;
	__virtio16 ring_head;
	int ret;

	/* Check it isn't doing very strange things with descriptor numbers. */
	last_avail_idx = vq->last_avail_idx;
	if (unlikely(__get_user(avail_idx, &vq->avail->idx))) {
		vq_err(vq, "Failed to access avail idx at %p\n",
		       &vq->avail->idx);
		return -EFAULT;
	}
	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);

	if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
		vq_err(vq, "Guest moved used index from %u to %u",
		       last_avail_idx, vq->avail_idx);
		return -EFAULT;
	}

	/* If there's nothing new since last we looked, return invalid. */
	if (vq->avail_idx == last_avail_idx)
		return vq->num;

	/* Only get avail ring entries after they have been exposed by guest. */
	smp_rmb();

	/* Grab the next descriptor number they're advertising, and increment
	 * the index we've seen. */
	if (unlikely(__get_user(ring_head,
				&vq->avail->ring[last_avail_idx % vq->num]))) {
		vq_err(vq, "Failed to read head: idx %d address %p\n",
		       last_avail_idx,
		       &vq->avail->ring[last_avail_idx % vq->num]);
		return -EFAULT;
	}

	head = vhost16_to_cpu(vq, ring_head);

	/* If their number is silly, that's an error. */
	if (unlikely(head >= vq->num)) {
		vq_err(vq, "Guest says index %u > %u is available",
		       head, vq->num);
		return -EINVAL;
	}

	/* When we start there are none of either input nor output. */
	*out_num = *in_num = 0;
	if (unlikely(log))
		*log_num = 0;

	i = head;
	do {
		unsigned iov_count = *in_num + *out_num;
		if (unlikely(i >= vq->num)) {
			vq_err(vq, "Desc index is %u > %u, head = %u",
			       i, vq->num, head);
			return -EINVAL;
		}
		if (unlikely(++found > vq->num)) {
			vq_err(vq, "Loop detected: last one at %u "
			       "vq size %u head %u\n",
			       i, vq->num, head);
			return -EINVAL;
		}
		ret = __copy_from_user(&desc, vq->desc + i, sizeof desc);
		if (unlikely(ret)) {
			vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
			       i, vq->desc + i);
			return -EFAULT;
		}
		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
			ret = get_indirect(vq, iov, iov_size,
					   out_num, in_num,
					   log, log_num, &desc);
			if (unlikely(ret < 0)) {
				vq_err(vq, "Failure detected "
				       "in indirect descriptor at idx %d\n", i);
				return ret;
			}
			continue;
		}

		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
				     vhost32_to_cpu(vq, desc.len), iov + iov_count,
				     iov_size - iov_count);
		if (unlikely(ret < 0)) {
			vq_err(vq, "Translation failure %d descriptor idx %d\n",
			       ret, i);
			return ret;
		}
		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) {
			/* If this is an input descriptor,
			 * increment that count. */
			*in_num += ret;
			if (unlikely(log)) {
				log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
				log[*log_num].len = vhost32_to_cpu(vq, desc.len);
				++*log_num;
			}
		} else {
			/* If it's an output descriptor, they're all supposed
			 * to come before any input descriptors. */
			if (unlikely(*in_num)) {
				vq_err(vq, "Descriptor has out after in: "
				       "idx %d\n", i);
				return -EINVAL;
			}
			*out_num += ret;
		}
	} while ((i = next_desc(vq, &desc)) != -1);

	/* On success, increment avail index. */
	vq->last_avail_idx++;

	/* Assume notifications from guest are disabled at this point,
	 * if they aren't we would need to update avail_event index. */
	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
	return head;
}
EXPORT_SYMBOL_GPL(vhost_get_vq_desc);

/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
{
	vq->last_avail_idx -= n;
}
EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);

/* After we've used one of their buffers, we tell them about it.  We'll then
 * want to notify the guest, using eventfd. */
int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
{
	struct vring_used_elem heads = {
		cpu_to_vhost32(vq, head),
		cpu_to_vhost32(vq, len)
	};

	return vhost_add_used_n(vq, &heads, 1);
}
EXPORT_SYMBOL_GPL(vhost_add_used);

static int __vhost_add_used_n(struct vhost_virtqueue *vq,
			      struct vring_used_elem *heads,
			      unsigned count)
{
	struct vring_used_elem __user *used;
	u16 old, new;
	int start;

	start = vq->last_used_idx % vq->num;
	used = vq->used->ring + start;
	if (count == 1) {
		if (__put_user(heads[0].id, &used->id)) {
			vq_err(vq, "Failed to write used id");
			return -EFAULT;
		}
		if (__put_user(heads[0].len, &used->len)) {
			vq_err(vq, "Failed to write used len");
			return -EFAULT;
		}
	} else if (__copy_to_user(used, heads, count * sizeof *used)) {
		vq_err(vq, "Failed to write used");
		return -EFAULT;
	}
	if (unlikely(vq->log_used)) {
		/* Make sure data is seen before log. */
		smp_wmb();
		/* Log used ring entry write. */
		log_write(vq->log_base,
			  vq->log_addr +
			   ((void __user *)used - (void __user *)vq->used),
			  count * sizeof *used);
	}
	old = vq->last_used_idx;
	new = (vq->last_used_idx += count);
	/* If the driver never bothers to signal in a very long while,
	 * used index might wrap around. If that happens, invalidate
	 * signalled_used index we stored. TODO: make sure driver
	 * signals at least once in 2^16 and remove this. */
	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
		vq->signalled_used_valid = false;
	return 0;
}

/* After we've used one of their buffers, we tell them about it.  We'll then
 * want to notify the guest, using eventfd. */
int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
		     unsigned count)
{
	int start, n, r;

	start = vq->last_used_idx % vq->num;
	n = vq->num - start;
	if (n < count) {
		r = __vhost_add_used_n(vq, heads, n);
		if (r < 0)
			return r;
		heads += n;
		count -= n;
	}
	r = __vhost_add_used_n(vq, heads, count);

	/* Make sure buffer is written before we update index. */
	smp_wmb();
	if (__put_user(cpu_to_vhost16(vq, vq->last_used_idx), &vq->used->idx)) {
		vq_err(vq, "Failed to increment used idx");
		return -EFAULT;
	}
	if (unlikely(vq->log_used)) {
		/* Log used index update. */
		log_write(vq->log_base,
			  vq->log_addr + offsetof(struct vring_used, idx),
			  sizeof vq->used->idx);
		if (vq->log_ctx)
			eventfd_signal(vq->log_ctx, 1);
	}
	return r;
}
EXPORT_SYMBOL_GPL(vhost_add_used_n);

static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
	__u16 old, new;
	__virtio16 event;
	bool v;
	/* Flush out used index updates. This is paired
	 * with the barrier that the Guest executes when enabling
	 * interrupts. */
	smp_mb();

	if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) &&
	    unlikely(vq->avail_idx == vq->last_avail_idx))
		return true;

	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
		__virtio16 flags;
		if (__get_user(flags, &vq->avail->flags)) {
			vq_err(vq, "Failed to get flags");
			return true;
		}
		return !(flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT));
	}
	old = vq->signalled_used;
	v = vq->signalled_used_valid;
	new = vq->signalled_used = vq->last_used_idx;
	vq->signalled_used_valid = true;

	if (unlikely(!v))
		return true;

	if (__get_user(event, vhost_used_event(vq))) {
		vq_err(vq, "Failed to get used event idx");
		return true;
	}
	return vring_need_event(vhost16_to_cpu(vq, event), new, old);
}

/* This actually signals the guest, using eventfd. */
void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
	/* Signal the Guest, telling them we used something up. */
	if (vq->call_ctx && vhost_notify(dev, vq))
		eventfd_signal(vq->call_ctx, 1);
}
EXPORT_SYMBOL_GPL(vhost_signal);

/* And here's the combo meal deal.  Supersize me! */
void vhost_add_used_and_signal(struct vhost_dev *dev,
			       struct vhost_virtqueue *vq,
			       unsigned int head, int len)
{
	vhost_add_used(vq, head, len);
	vhost_signal(dev, vq);
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);

/* multi-buffer version of vhost_add_used_and_signal */
void vhost_add_used_and_signal_n(struct vhost_dev *dev,
				 struct vhost_virtqueue *vq,
				 struct vring_used_elem *heads, unsigned count)
{
	vhost_add_used_n(vq, heads, count);
	vhost_signal(dev, vq);
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);

/* OK, now we need to know about added descriptors. */
bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
	__virtio16 avail_idx;
	int r;

	if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
		return false;
	vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
		r = vhost_update_used_flags(vq);
		if (r) {
			vq_err(vq, "Failed to enable notification at %p: %d\n",
			       &vq->used->flags, r);
			return false;
		}
	} else {
		r = vhost_update_avail_event(vq, vq->avail_idx);
		if (r) {
			vq_err(vq, "Failed to update avail event index at %p: %d\n",
			       vhost_avail_event(vq), r);
			return false;
		}
	}
	/* They could have slipped one in as we were doing that: make
	 * sure it's written, then check again. */
	smp_mb();
	r = __get_user(avail_idx, &vq->avail->idx);
	if (r) {
		vq_err(vq, "Failed to check avail idx at %p: %d\n",
		       &vq->avail->idx, r);
		return false;
	}

	return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx;
}
EXPORT_SYMBOL_GPL(vhost_enable_notify);

/* We don't need to be notified again. */
void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
	int r;

	if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
		return;
	vq->used_flags |= VRING_USED_F_NO_NOTIFY;
	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
		r = vhost_update_used_flags(vq);
		if (r)
			vq_err(vq, "Failed to disable notification at %p: %d\n",
			       &vq->used->flags, r);
	}
}
EXPORT_SYMBOL_GPL(vhost_disable_notify);

static int __init vhost_init(void)
{
	return 0;
}

static void __exit vhost_exit(void)
{
}

module_init(vhost_init);
module_exit(vhost_exit);

MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio");