/* Copyright (C) 2009 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 *
 * virtio-net server in host kernel.
 */

#include <linux/compat.h>
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include <linux/mmu_context.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/file.h>

#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
#include <linux/if_macvlan.h>

#include <net/sock.h>

#include "vhost.h"

/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000

enum {
	VHOST_NET_VQ_RX = 0,
	VHOST_NET_VQ_TX = 1,
	VHOST_NET_VQ_MAX = 2,
};

enum vhost_net_poll_state {
	VHOST_NET_POLL_DISABLED = 0,
	VHOST_NET_POLL_STARTED = 1,
	VHOST_NET_POLL_STOPPED = 2,
};

struct vhost_net {
	struct vhost_dev dev;
	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	/* Tells us whether we are polling a socket for TX.
	 * We only do this when socket buffer fills up.
	 * Protected by tx vq lock. */
	enum vhost_net_poll_state tx_poll_state;
};

/* Pop first len bytes from iovec. Return number of segments used. */
static int move_iovec_hdr(struct iovec *from, struct iovec *to,
			  size_t len, int iov_count)
{
	int seg = 0;
	size_t size;
	while (len && seg < iov_count) {
		size = min(from->iov_len, len);
		to->iov_base = from->iov_base;
		to->iov_len = size;
		from->iov_len -= size;
		from->iov_base += size;
		len -= size;
		++from;
		++to;
		++seg;
	}
	return seg;
}

/* Caller must have TX VQ lock */
static void tx_poll_stop(struct vhost_net *net)
{
	if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
		return;
	vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
	net->tx_poll_state = VHOST_NET_POLL_STOPPED;
}

/* Caller must have TX VQ lock */
static void tx_poll_start(struct vhost_net *net, struct socket *sock)
{
	if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
		return;
	vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
	net->tx_poll_state = VHOST_NET_POLL_STARTED;
}

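/* TX flow control, roughly as implemented below: when the backend socket's
 * send buffer fills up (sk_wmem_alloc approaches sk_sndbuf), handle_tx()
 * stops pulling descriptors and instead polls the socket for POLLOUT via
 * tx_poll_start(); once the socket drains, the poll fires handle_tx_net()
 * and transmission resumes, with tx_poll_stop() disabling the poll again. */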
/* Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
	unsigned head, out, in, s;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_iov = vq->iov,
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len, total_len = 0;
	int err, wmem;
	size_t hdr_size;
	struct socket *sock = rcu_dereference(vq->private_data);
	if (!sock)
		return;

	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
	if (wmem >= sock->sk->sk_sndbuf) {
		mutex_lock(&vq->mutex);
		tx_poll_start(net, sock);
		mutex_unlock(&vq->mutex);
		return;
	}

	use_mm(net->dev.mm);
	mutex_lock(&vq->mutex);
	vhost_disable_notify(vq);

	if (wmem < sock->sk->sk_sndbuf / 2)
		tx_poll_stop(net);
	hdr_size = vq->hdr_size;

	for (;;) {
		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
					 ARRAY_SIZE(vq->iov),
					 &out, &in,
					 NULL, NULL);
		/* Nothing new? Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			wmem = atomic_read(&sock->sk->sk_wmem_alloc);
			if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
				tx_poll_start(net, sock);
				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
				break;
			}
			if (unlikely(vhost_enable_notify(vq))) {
				vhost_disable_notify(vq);
				continue;
			}
			break;
		}
		if (in) {
			vq_err(vq, "Unexpected descriptor format for TX: "
			       "out %d, in %d\n", out, in);
			break;
		}
		/* Skip header. TODO: support TSO. */
		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
		msg.msg_iovlen = out;
		len = iov_length(vq->iov, out);
		/* Sanity check */
		if (!len) {
			vq_err(vq, "Unexpected header len for TX: "
			       "%zd expected %zd\n",
			       iov_length(vq->hdr, s), hdr_size);
			break;
		}
		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(NULL, sock, &msg, len);
		if (unlikely(err < 0)) {
			vhost_discard_vq_desc(vq);
			tx_poll_start(net, sock);
			break;
		}
		if (err != len)
			pr_err("Truncated TX packet: "
			       " len %d != %zd\n", err, len);
		vhost_add_used_and_signal(&net->dev, vq, head, 0);
		total_len += len;
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}

	mutex_unlock(&vq->mutex);
	unuse_mm(net->dev.mm);
}

/* Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
{
	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
	unsigned head, out, in, log, s;
	struct vhost_log *vq_log;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
		.msg_controllen = 0,
		.msg_iov = vq->iov,
		.msg_flags = MSG_DONTWAIT,
	};

	struct virtio_net_hdr hdr = {
		.flags = 0,
		.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};

	size_t len, total_len = 0;
	int err;
	size_t hdr_size;
	struct socket *sock = rcu_dereference(vq->private_data);
	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
		return;

	use_mm(net->dev.mm);
	mutex_lock(&vq->mutex);
	vhost_disable_notify(vq);
	hdr_size = vq->hdr_size;

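	/* With VHOST_F_LOG_ALL negotiated, writes to guest memory are
	 * recorded in the per-vq log so userspace can track dirty pages
	 * (used e.g. during live migration). */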
	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
		vq->log : NULL;

	for (;;) {
		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
					 ARRAY_SIZE(vq->iov),
					 &out, &in,
					 vq_log, &log);
		/* OK, now we need to know about added descriptors. */
		if (head == vq->num) {
			if (unlikely(vhost_enable_notify(vq))) {
				/* They have slipped one in as we were
				 * doing that: check again. */
				vhost_disable_notify(vq);
				continue;
			}
			/* Nothing new? Wait for eventfd to tell us
			 * they refilled. */
			break;
		}
		/* We don't need to be notified again. */
		if (out) {
			vq_err(vq, "Unexpected descriptor format for RX: "
			       "out %d, in %d\n",
			       out, in);
			break;
		}
		/* Skip header. TODO: support TSO/mergeable rx buffers. */
		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
		msg.msg_iovlen = in;
		len = iov_length(vq->iov, in);
		/* Sanity check */
		if (!len) {
			vq_err(vq, "Unexpected header len for RX: "
			       "%zd expected %zd\n",
			       iov_length(vq->hdr, s), hdr_size);
			break;
		}
		err = sock->ops->recvmsg(NULL, sock, &msg,
					 len, MSG_DONTWAIT | MSG_TRUNC);
		/* TODO: Check specific error and bomb out unless EAGAIN? */
		if (err < 0) {
			vhost_discard_vq_desc(vq);
			break;
		}
		/* TODO: Should check and handle checksum. */
		if (err > len) {
			pr_err("Discarded truncated rx packet: "
			       " len %d > %zd\n", err, len);
			vhost_discard_vq_desc(vq);
			continue;
		}
		len = err;
		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
		if (err) {
			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
			       vq->iov->iov_base, err);
			break;
		}
		len += hdr_size;
		vhost_add_used_and_signal(&net->dev, vq, head, len);
		if (unlikely(vq_log))
			vhost_log_write(vq, vq_log, log, len);
		total_len += len;
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}

	mutex_unlock(&vq->mutex);
	unuse_mm(net->dev.mm);
}

static void handle_tx_kick(struct work_struct *work)
{
	struct vhost_virtqueue *vq;
	struct vhost_net *net;
	vq = container_of(work, struct vhost_virtqueue, poll.work);
	net = container_of(vq->dev, struct vhost_net, dev);
	handle_tx(net);
}

static void handle_rx_kick(struct work_struct *work)
{
	struct vhost_virtqueue *vq;
	struct vhost_net *net;
	vq = container_of(work, struct vhost_virtqueue, poll.work);
	net = container_of(vq->dev, struct vhost_net, dev);
	handle_rx(net);
}

static void handle_tx_net(struct work_struct *work)
{
	struct vhost_net *net;
	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
	handle_tx(net);
}

static void handle_rx_net(struct work_struct *work)
{
	struct vhost_net *net;
	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
	handle_rx(net);
}

static int vhost_net_open(struct inode *inode, struct file *f)
{
	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
	int r;
	if (!n)
		return -ENOMEM;
	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
	if (r < 0) {
		kfree(n);
		return r;
	}

	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
	n->tx_poll_state = VHOST_NET_POLL_DISABLED;

	f->private_data = n;

	return 0;
}

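/* The two helpers below stop/start polling of the backend socket for a
 * given virtqueue. At their call sites they are invoked with vq->mutex
 * held, which is what protects vq->private_data and tx_poll_state. */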
static void vhost_net_disable_vq(struct vhost_net *n,
				 struct vhost_virtqueue *vq)
{
	if (!vq->private_data)
		return;
	if (vq == n->vqs + VHOST_NET_VQ_TX) {
		tx_poll_stop(n);
		n->tx_poll_state = VHOST_NET_POLL_DISABLED;
	} else
		vhost_poll_stop(n->poll + VHOST_NET_VQ_RX);
}

static void vhost_net_enable_vq(struct vhost_net *n,
				struct vhost_virtqueue *vq)
{
	struct socket *sock = vq->private_data;
	if (!sock)
		return;
	if (vq == n->vqs + VHOST_NET_VQ_TX) {
		n->tx_poll_state = VHOST_NET_POLL_STOPPED;
		tx_poll_start(n, sock);
	} else
		vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file);
}

static struct socket *vhost_net_stop_vq(struct vhost_net *n,
					struct vhost_virtqueue *vq)
{
	struct socket *sock;

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	vhost_net_disable_vq(n, vq);
	rcu_assign_pointer(vq->private_data, NULL);
	mutex_unlock(&vq->mutex);
	return sock;
}

static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
			   struct socket **rx_sock)
{
	*tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX);
	*rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX);
}

static void vhost_net_flush_vq(struct vhost_net *n, int index)
{
	vhost_poll_flush(n->poll + index);
	vhost_poll_flush(&n->dev.vqs[index].poll);
}

static void vhost_net_flush(struct vhost_net *n)
{
	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
}

static int vhost_net_release(struct inode *inode, struct file *f)
{
	struct vhost_net *n = f->private_data;
	struct socket *tx_sock;
	struct socket *rx_sock;

	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	vhost_dev_cleanup(&n->dev);
	if (tx_sock)
		fput(tx_sock->file);
	if (rx_sock)
		fput(rx_sock->file);
	/* We do an extra flush before freeing memory,
	 * since jobs can re-queue themselves. */
	vhost_net_flush(n);
	kfree(n);
	return 0;
}

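/* The backend handed to us via VHOST_NET_SET_BACKEND is expected to be
 * either a packet (AF_PACKET/SOCK_RAW) socket or a tun/macvtap fd; the
 * helpers below validate the fd and return the underlying socket. */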
static struct socket *get_raw_socket(int fd)
{
	struct {
		struct sockaddr_ll sa;
		char buf[MAX_ADDR_LEN];
	} uaddr;
	int uaddr_len = sizeof uaddr, r;
	struct socket *sock = sockfd_lookup(fd, &r);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	/* Parameter checking */
	if (sock->sk->sk_type != SOCK_RAW) {
		r = -ESOCKTNOSUPPORT;
		goto err;
	}

	r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa,
			       &uaddr_len, 0);
	if (r)
		goto err;

	if (uaddr.sa.sll_family != AF_PACKET) {
		r = -EPFNOSUPPORT;
		goto err;
	}
	return sock;
err:
	fput(sock->file);
	return ERR_PTR(r);
}

static struct socket *get_tap_socket(int fd)
{
	struct file *file = fget(fd);
	struct socket *sock;
	if (!file)
		return ERR_PTR(-EBADF);
	sock = tun_get_socket(file);
	if (!IS_ERR(sock))
		return sock;
	sock = macvtap_get_socket(file);
	if (IS_ERR(sock))
		fput(file);
	return sock;
}

static struct socket *get_socket(int fd)
{
	struct socket *sock;
	/* special case to disable backend */
	if (fd == -1)
		return NULL;
	sock = get_raw_socket(fd);
	if (!IS_ERR(sock))
		return sock;
	sock = get_tap_socket(fd);
	if (!IS_ERR(sock))
		return sock;
	return ERR_PTR(-ENOTSOCK);
}

static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
	struct socket *sock, *oldsock;
	struct vhost_virtqueue *vq;
	int r;

	mutex_lock(&n->dev.mutex);
	r = vhost_dev_check_owner(&n->dev);
	if (r)
		goto err;

	if (index >= VHOST_NET_VQ_MAX) {
		r = -ENOBUFS;
		goto err;
	}
	vq = n->vqs + index;
	mutex_lock(&vq->mutex);

	/* Verify that ring has been setup correctly. */
	if (!vhost_vq_access_ok(vq)) {
		r = -EFAULT;
		goto err_vq;
	}
	sock = get_socket(fd);
	if (IS_ERR(sock)) {
		r = PTR_ERR(sock);
		goto err_vq;
	}

	/* start polling new socket */
	oldsock = vq->private_data;
	if (sock != oldsock) {
		vhost_net_disable_vq(n, vq);
		rcu_assign_pointer(vq->private_data, sock);
		vhost_net_enable_vq(n, vq);
	}
	mutex_unlock(&vq->mutex);

	if (oldsock) {
		vhost_net_flush_vq(n, index);
		fput(oldsock->file);
	}
	mutex_unlock(&n->dev.mutex);
	return 0;

err_vq:
	mutex_unlock(&vq->mutex);
err:
	mutex_unlock(&n->dev.mutex);
	return r;
}

static long vhost_net_reset_owner(struct vhost_net *n)
{
	struct socket *tx_sock = NULL;
	struct socket *rx_sock = NULL;
	long err;
	mutex_lock(&n->dev.mutex);
	err = vhost_dev_check_owner(&n->dev);
	if (err)
		goto done;
	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	err = vhost_dev_reset_owner(&n->dev);
done:
	mutex_unlock(&n->dev.mutex);
	if (tx_sock)
		fput(tx_sock->file);
	if (rx_sock)
		fput(rx_sock->file);
	return err;
}

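/* Changing the feature bits can change the vnet header size the workers
 * expect, so propagate the new hdr_size to each virtqueue under its own
 * mutex and flush any work in flight before the ioctl returns. */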
static int vhost_net_set_features(struct vhost_net *n, u64 features)
{
	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
		sizeof(struct virtio_net_hdr) : 0;
	int i;
	mutex_lock(&n->dev.mutex);
	if ((features & (1 << VHOST_F_LOG_ALL)) &&
	    !vhost_log_access_ok(&n->dev)) {
		mutex_unlock(&n->dev.mutex);
		return -EFAULT;
	}
	n->dev.acked_features = features;
	smp_wmb();
	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		mutex_lock(&n->vqs[i].mutex);
		n->vqs[i].hdr_size = hdr_size;
		mutex_unlock(&n->vqs[i].mutex);
	}
	vhost_net_flush(n);
	mutex_unlock(&n->dev.mutex);
	return 0;
}

static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
			    unsigned long arg)
{
	struct vhost_net *n = f->private_data;
	void __user *argp = (void __user *)arg;
	u64 __user *featurep = argp;
	struct vhost_vring_file backend;
	u64 features;
	int r;
	switch (ioctl) {
	case VHOST_NET_SET_BACKEND:
		/* copy_from_user returns the number of bytes left to copy;
		 * treat any shortfall as a fault. */
		if (copy_from_user(&backend, argp, sizeof backend))
			return -EFAULT;
		return vhost_net_set_backend(n, backend.index, backend.fd);
	case VHOST_GET_FEATURES:
		features = VHOST_FEATURES;
		return copy_to_user(featurep, &features, sizeof features) ?
			-EFAULT : 0;
	case VHOST_SET_FEATURES:
		if (copy_from_user(&features, featurep, sizeof features))
			return -EFAULT;
		if (features & ~VHOST_FEATURES)
			return -EOPNOTSUPP;
		return vhost_net_set_features(n, features);
	case VHOST_RESET_OWNER:
		return vhost_net_reset_owner(n);
	default:
		mutex_lock(&n->dev.mutex);
		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
		vhost_net_flush(n);
		mutex_unlock(&n->dev.mutex);
		return r;
	}
}

#ifdef CONFIG_COMPAT
static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
				   unsigned long arg)
{
	return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
}
#endif

static const struct file_operations vhost_net_fops = {
	.owner = THIS_MODULE,
	.release = vhost_net_release,
	.unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = vhost_net_compat_ioctl,
#endif
	.open = vhost_net_open,
};

static struct miscdevice vhost_net_misc = {
	VHOST_NET_MINOR,
	"vhost-net",
	&vhost_net_fops,
};

int vhost_net_init(void)
{
	int r = vhost_init();
	if (r)
		goto err_init;
	r = misc_register(&vhost_net_misc);
	if (r)
		goto err_reg;
	return 0;
err_reg:
	vhost_cleanup();
err_init:
	return r;
}
module_init(vhost_net_init);

void vhost_net_exit(void)
{
	misc_deregister(&vhost_net_misc);
	vhost_cleanup();
}
module_exit(vhost_net_exit);

MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio net");