#include <linux/etherdevice.h>
#include <linux/if_macvlan.h>
#include <linux/if_vlan.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/idr.h>
#include <linux/fs.h>

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <linux/virtio_net.h>

/*
 * A macvtap queue is the central object of this driver, it connects
 * an open character device to a macvlan interface. There can be
 * multiple queues on one interface, which map back to queues
 * implemented in hardware on the underlying device.
 *
 * macvtap_proto is used to allocate queues through the sock allocation
 * mechanism.
 */
struct macvtap_queue {
	struct sock sk;
	struct socket sock;
	struct socket_wq wq;
	int vnet_hdr_sz;
	struct macvlan_dev __rcu *vlan;
	struct file *file;
	unsigned int flags;
	u16 queue_index;
	bool enabled;
	struct list_head next;
};

static struct proto macvtap_proto = {
	.name = "macvtap",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct macvtap_queue),
};

/*
 * Variables for dealing with macvtap device numbers.
 */
static dev_t macvtap_major;
#define MACVTAP_NUM_DEVS (1U << MINORBITS)
static DEFINE_MUTEX(minor_lock);
static DEFINE_IDR(minor_idr);

#define GOODCOPY_LEN 128
static struct class *macvtap_class;
static struct cdev macvtap_cdev;

static const struct proto_ops macvtap_socket_ops;

#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
		      NETIF_F_TSO6 | NETIF_F_UFO)
#define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)

/*
 * RCU usage:
 * The macvtap_queue and the macvlan_dev are loosely coupled, the
 * pointers from one to the other can only be read while rcu_read_lock
 * or rtnl is held.
 *
 * Both the file and the macvlan_dev hold a reference on the macvtap_queue
 * through sock_hold(&q->sk). When the macvlan_dev goes away first,
 * q->vlan becomes inaccessible. When the file gets closed,
 * macvtap_get_queue() fails.
 *
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev. The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
 */
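/*
 * Illustrative sketch (not part of the driver): the reader-side
 * pattern the lifetime rules above imply. Any hop from one object to
 * the other takes rcu_read_lock() and revalidates the pointer:
 *
 *	rcu_read_lock();
 *	vlan = rcu_dereference(q->vlan);
 *	if (vlan)
 *		; // safe to use vlan->dev here, but must not sleep
 *	rcu_read_unlock();
 */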
static int macvtap_enable_queue(struct net_device *dev, struct file *file,
				struct macvtap_queue *q)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	int err = -EINVAL;

	ASSERT_RTNL();

	if (q->enabled)
		goto out;

	err = 0;
	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
	q->queue_index = vlan->numvtaps;
	q->enabled = true;

	vlan->numvtaps++;
out:
	return err;
}

static int macvtap_set_queue(struct net_device *dev, struct file *file,
			     struct macvtap_queue *q)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	int err = -EBUSY;

	rtnl_lock();
	if (vlan->numqueues == MAX_MACVTAP_QUEUES)
		goto out;

	err = 0;
	rcu_assign_pointer(q->vlan, vlan);
	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
	sock_hold(&q->sk);

	q->file = file;
	q->queue_index = vlan->numvtaps;
	q->enabled = true;
	file->private_data = q;
	list_add_tail(&q->next, &vlan->queue_list);

	vlan->numvtaps++;
	vlan->numqueues++;

out:
	rtnl_unlock();
	return err;
}

static int macvtap_disable_queue(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;
	struct macvtap_queue *nq;

	ASSERT_RTNL();
	if (!q->enabled)
		return -EINVAL;

	vlan = rtnl_dereference(q->vlan);

	if (vlan) {
		int index = q->queue_index;
		BUG_ON(index >= vlan->numvtaps);
		nq = rtnl_dereference(vlan->taps[vlan->numvtaps - 1]);
		nq->queue_index = index;

		rcu_assign_pointer(vlan->taps[index], nq);
		RCU_INIT_POINTER(vlan->taps[vlan->numvtaps - 1], NULL);
		q->enabled = false;

		vlan->numvtaps--;
	}

	return 0;
}

/*
 * The file owning the queue got closed, give up both
 * the reference that the file holds as well as the
 * one from the macvlan_dev if that still exists.
 *
 * Holding the rtnl lock makes sure that we don't get
 * to the queue again after destroying it.
 */
static void macvtap_put_queue(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;

	rtnl_lock();
	vlan = rtnl_dereference(q->vlan);

	if (vlan) {
		if (q->enabled)
			BUG_ON(macvtap_disable_queue(q));

		vlan->numqueues--;
		RCU_INIT_POINTER(q->vlan, NULL);
		sock_put(&q->sk);
		list_del_init(&q->next);
	}

	rtnl_unlock();

	synchronize_rcu();
	sock_put(&q->sk);
}

/*
 * Select a queue based on the flow hash of the skb. If no hash is
 * available, fall back to the rx queue recorded on the skb, and
 * finally to the first available queue.
 * Cache vlan->numvtaps since it can become zero during the execution
 * of this function.
 */
static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
					       struct sk_buff *skb)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *tap = NULL;
	/* Access to taps array is protected by rcu, but access to numvtaps
	 * isn't. Below we use it to lookup a queue, but treat it as a hint
	 * and validate that the result isn't NULL - in case we are
	 * racing against queue removal.
	 */
	int numvtaps = ACCESS_ONCE(vlan->numvtaps);
	__u32 rxq;

	if (!numvtaps)
		goto out;

	/* Check if we can use flow to select a queue */
	rxq = skb_get_rxhash(skb);
	if (rxq) {
		tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
		goto out;
	}

	if (likely(skb_rx_queue_recorded(skb))) {
		rxq = skb_get_rx_queue(skb);

		while (unlikely(rxq >= numvtaps))
			rxq -= numvtaps;

		tap = rcu_dereference(vlan->taps[rxq]);
		goto out;
	}

	tap = rcu_dereference(vlan->taps[0]);
out:
	return tap;
}
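/*
 * Worked example (illustrative): with numvtaps == 4, a flow hash of
 * 0x12345678 selects taps[0x12345678 % 4] == taps[0]. numvtaps is
 * only a hint; if a queue is removed concurrently the slot may
 * already be NULL, which is why callers must check the result.
 */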
/*
 * The net_device is going away, give up the reference
 * that it holds on all queues and safely set the pointer
 * from the queues to NULL.
 */
static void macvtap_del_queues(struct net_device *dev)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *q, *tmp, *qlist[MAX_MACVTAP_QUEUES];
	int i, j = 0;

	ASSERT_RTNL();
	list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) {
		list_del_init(&q->next);
		qlist[j++] = q;
		RCU_INIT_POINTER(q->vlan, NULL);
		if (q->enabled)
			vlan->numvtaps--;
		vlan->numqueues--;
	}
	for (i = 0; i < vlan->numvtaps; i++)
		RCU_INIT_POINTER(vlan->taps[i], NULL);
	BUG_ON(vlan->numvtaps);
	BUG_ON(vlan->numqueues);
	/* guarantee that any future macvtap_set_queue will fail */
	vlan->numvtaps = MAX_MACVTAP_QUEUES;

	for (--j; j >= 0; j--)
		sock_put(&qlist[j]->sk);
}

/*
 * Forward happens for data that gets sent from one macvlan
 * endpoint to another one in bridge mode. We just take
 * the skb and put it into the receive queue.
 */
static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *q = macvtap_get_queue(dev, skb);
	netdev_features_t features;

	if (!q)
		goto drop;

	if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len)
		goto drop;

	skb->dev = dev;
	/* Apply the forward feature mask so that we perform segmentation
	 * according to the user's wishes.
	 */
	features = netif_skb_features(skb) & vlan->tap_features;
	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs = __skb_gso_segment(skb, features, false);

		if (IS_ERR(segs))
			goto drop;

		if (!segs) {
			skb_queue_tail(&q->sk.sk_receive_queue, skb);
			goto wake_up;
		}

		kfree_skb(skb);
		while (segs) {
			struct sk_buff *nskb = segs->next;

			segs->next = NULL;
			skb_queue_tail(&q->sk.sk_receive_queue, segs);
			segs = nskb;
		}
	} else {
		skb_queue_tail(&q->sk.sk_receive_queue, skb);
	}

wake_up:
	wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
	return NET_RX_SUCCESS;

drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}
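/*
 * Worked example (illustrative): if userspace disabled TSO via
 * TUNSETOFFLOAD, the intersected feature mask above lacks the TSO
 * bit, netif_needs_gso() returns true, and a 64KB GSO skb is
 * segmented in software; with an MSS of 1448 that is
 * 65536 / 1448 ~= 45 separate skbs, each queued on its own.
 */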
/*
 * Receive is for data from the external interface (lowerdev);
 * in the case of macvtap, we can treat it the same way as
 * forward, which macvlan cannot do.
 */
static int macvtap_receive(struct sk_buff *skb)
{
	skb_push(skb, ETH_HLEN);
	return macvtap_forward(skb->dev, skb);
}

static int macvtap_get_minor(struct macvlan_dev *vlan)
{
	int retval = -ENOMEM;

	mutex_lock(&minor_lock);
	retval = idr_alloc(&minor_idr, vlan, 1, MACVTAP_NUM_DEVS, GFP_KERNEL);
	if (retval >= 0) {
		vlan->minor = retval;
	} else if (retval == -ENOSPC) {
		printk(KERN_ERR "too many macvtap devices\n");
		retval = -EINVAL;
	}
	mutex_unlock(&minor_lock);
	return retval < 0 ? retval : 0;
}

static void macvtap_free_minor(struct macvlan_dev *vlan)
{
	mutex_lock(&minor_lock);
	if (vlan->minor) {
		idr_remove(&minor_idr, vlan->minor);
		vlan->minor = 0;
	}
	mutex_unlock(&minor_lock);
}

static struct net_device *dev_get_by_macvtap_minor(int minor)
{
	struct net_device *dev = NULL;
	struct macvlan_dev *vlan;

	mutex_lock(&minor_lock);
	vlan = idr_find(&minor_idr, minor);
	if (vlan) {
		dev = vlan->dev;
		dev_hold(dev);
	}
	mutex_unlock(&minor_lock);
	return dev;
}

static int macvtap_newlink(struct net *src_net,
			   struct net_device *dev,
			   struct nlattr *tb[],
			   struct nlattr *data[])
{
	struct macvlan_dev *vlan = netdev_priv(dev);

	INIT_LIST_HEAD(&vlan->queue_list);

	/* Since macvlan supports all offloads by default, make
	 * the tap support all offloads as well.
	 */
	vlan->tap_features = TUN_OFFLOADS;

	/* Don't put anything that may fail after macvlan_common_newlink
	 * because we can't undo what it does.
	 */
	return macvlan_common_newlink(src_net, dev, tb, data,
				      macvtap_receive, macvtap_forward);
}

static void macvtap_dellink(struct net_device *dev,
			    struct list_head *head)
{
	macvtap_del_queues(dev);
	macvlan_dellink(dev, head);
}

static void macvtap_setup(struct net_device *dev)
{
	macvlan_common_setup(dev);
	dev->tx_queue_len = TUN_READQ_SIZE;
}

static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
	.kind		= "macvtap",
	.setup		= macvtap_setup,
	.newlink	= macvtap_newlink,
	.dellink	= macvtap_dellink,
};

static void macvtap_sock_write_space(struct sock *sk)
{
	wait_queue_head_t *wqueue;

	if (!sock_writeable(sk) ||
	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
		return;

	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
}

static void macvtap_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_receive_queue);
}

static int macvtap_open(struct inode *inode, struct file *file)
{
	struct net *net = current->nsproxy->net_ns;
	struct net_device *dev = dev_get_by_macvtap_minor(iminor(inode));
	struct macvtap_queue *q;
	int err;

	err = -ENODEV;
	if (!dev)
		goto out;

	err = -ENOMEM;
	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
					     &macvtap_proto);
	if (!q)
		goto out;

	RCU_INIT_POINTER(q->sock.wq, &q->wq);
	init_waitqueue_head(&q->wq.wait);
	q->sock.type = SOCK_RAW;
	q->sock.state = SS_CONNECTED;
	q->sock.file = file;
	q->sock.ops = &macvtap_socket_ops;
	sock_init_data(&q->sock, &q->sk);
	q->sk.sk_write_space = macvtap_sock_write_space;
	q->sk.sk_destruct = macvtap_sock_destruct;
	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

	/*
	 * so far only KVM virtio_net uses macvtap, enable zero copy between
	 * guest kernel and host kernel when lower device supports zerocopy
	 *
	 * The macvlan supports zerocopy iff the lower device supports zero
	 * copy so we don't have to look at the lower device directly.
	 */
	if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
		sock_set_flag(&q->sk, SOCK_ZEROCOPY);

	err = macvtap_set_queue(dev, file, q);
	if (err)
		sock_put(&q->sk);

out:
	if (dev)
		dev_put(dev);

	return err;
}
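/*
 * Illustrative userspace sketch (device name hypothetical): a
 * hypervisor typically opens the per-device node created by this
 * driver and then moves one frame per call:
 *
 *	int fd = open("/dev/tap5", O_RDWR);	// "tap%d", %d == ifindex
 *	char buf[65536];
 *	ssize_t n = read(fd, buf, sizeof(buf));	// virtio_net_hdr + frame
 */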
static int macvtap_release(struct inode *inode, struct file *file)
{
	struct macvtap_queue *q = file->private_data;

	macvtap_put_queue(q);
	return 0;
}

static unsigned int macvtap_poll(struct file *file, poll_table *wait)
{
	struct macvtap_queue *q = file->private_data;
	unsigned int mask = POLLERR;

	if (!q)
		goto out;

	mask = 0;
	poll_wait(file, &q->wq.wait, wait);

	if (!skb_queue_empty(&q->sk.sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	if (sock_writeable(&q->sk) ||
	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
	     sock_writeable(&q->sk)))
		mask |= POLLOUT | POLLWRNORM;

out:
	return mask;
}

static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
						size_t len, size_t linear,
						int noblock, int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err);
	if (!skb)
		return NULL;

	skb_reserve(skb, prepad);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}

/* set skb frags from iovec, this can move to core network code for reuse */
static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
				  int offset, size_t count)
{
	int len = iov_length(from, count) - offset;
	int copy = skb_headlen(skb);
	int size, offset1 = 0;
	int i = 0;

	/* Skip over from offset */
	while (count && (offset >= from->iov_len)) {
		offset -= from->iov_len;
		++from;
		--count;
	}

	/* copy up to skb headlen */
	while (count && (copy > 0)) {
		size = min_t(unsigned int, copy, from->iov_len - offset);
		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
				   size))
			return -EFAULT;
		if (copy > size) {
			++from;
			--count;
			offset = 0;
		} else
			offset += size;
		copy -= size;
		offset1 += size;
	}

	if (len == offset1)
		return 0;

	while (count--) {
		struct page *page[MAX_SKB_FRAGS];
		int num_pages;
		unsigned long base;
		unsigned long truesize;

		len = from->iov_len - offset;
		if (!len) {
			offset = 0;
			++from;
			continue;
		}
		base = (unsigned long)from->iov_base + offset;
		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
		if (i + size > MAX_SKB_FRAGS)
			return -EMSGSIZE;
		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
		if (num_pages != size) {
			int j;

			for (j = 0; j < num_pages; j++)
				put_page(page[i + j]);
			return -EFAULT;
		}
		truesize = size * PAGE_SIZE;
		skb->data_len += len;
		skb->len += len;
		skb->truesize += truesize;
		/* account the pinned pages in sk_wmem_alloc */
		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
		while (len) {
			int off = base & ~PAGE_MASK;
			int size = min_t(int, len, PAGE_SIZE - off);
			__skb_fill_page_desc(skb, i, page[i], off, size);
			skb_shinfo(skb)->nr_frags++;
			base += size;
			len -= size;
			i++;
		}
		offset = 0;
		++from;
	}
	return 0;
}
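/*
 * Worked example for the page math above (hypothetical values, 4K
 * pages): an iovec entry of len 6000 whose base sits 512 bytes into
 * a page covers
 *
 *	size = (512 + 6000 + 4095) >> PAGE_SHIFT = 2 pages,
 *
 * so two frags are filled: 3584 bytes from the first page and 2416
 * from the second (3584 + 2416 == 6000).
 */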
/*
 * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
 * be shared with the tun/tap driver.
 */
static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb,
				     struct virtio_net_hdr *vnet_hdr)
{
	unsigned short gso_type = 0;

	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
			gso_type = SKB_GSO_TCPV4;
			break;
		case VIRTIO_NET_HDR_GSO_TCPV6:
			gso_type = SKB_GSO_TCPV6;
			break;
		case VIRTIO_NET_HDR_GSO_UDP:
			gso_type = SKB_GSO_UDP;
			break;
		default:
			return -EINVAL;
		}

		if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
			gso_type |= SKB_GSO_TCP_ECN;

		if (vnet_hdr->gso_size == 0)
			return -EINVAL;
	}

	if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		if (!skb_partial_csum_set(skb, vnet_hdr->csum_start,
					  vnet_hdr->csum_offset))
			return -EINVAL;
	}

	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		skb_shinfo(skb)->gso_size = vnet_hdr->gso_size;
		skb_shinfo(skb)->gso_type = gso_type;

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}
	return 0;
}
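/*
 * Illustrative example (values hypothetical): a guest sending a
 * TSOv4 frame hands us a header like
 *
 *	struct virtio_net_hdr hdr = {
 *		.flags       = VIRTIO_NET_HDR_F_NEEDS_CSUM,
 *		.gso_type    = VIRTIO_NET_HDR_GSO_TCPV4,
 *		.hdr_len     = 54,	// eth(14) + ip(20) + tcp(20)
 *		.gso_size    = 1448,	// MSS
 *		.csum_start  = 34,	// start of the TCP header
 *		.csum_offset = 16,	// checksum field inside it
 *	};
 *
 * which the function above translates into skb_shinfo() GSO state
 * plus a partial checksum setup.
 */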
static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
				   struct virtio_net_hdr *vnet_hdr)
{
	memset(vnet_hdr, 0, sizeof(*vnet_hdr));

	if (skb_is_gso(skb)) {
		struct skb_shared_info *sinfo = skb_shinfo(skb);

		/* This is a hint as to how much should be linear. */
		vnet_hdr->hdr_len = skb_headlen(skb);
		vnet_hdr->gso_size = sinfo->gso_size;
		if (sinfo->gso_type & SKB_GSO_TCPV4)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
		else if (sinfo->gso_type & SKB_GSO_TCPV6)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
		else if (sinfo->gso_type & SKB_GSO_UDP)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
		else
			BUG();
		if (sinfo->gso_type & SKB_GSO_TCP_ECN)
			vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
	} else
		vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
		vnet_hdr->csum_start = skb_checksum_start_offset(skb);
		vnet_hdr->csum_offset = skb->csum_offset;
	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
		vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
	} /* else everything is zero */

	return 0;
}

/* Get packet from user space buffer */
static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
				const struct iovec *iv, unsigned long total_len,
				size_t count, int noblock)
{
	struct sk_buff *skb;
	struct macvlan_dev *vlan;
	unsigned long len = total_len;
	int err;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int vnet_hdr_len = 0;
	int copylen = 0;
	bool zerocopy = false;

	if (q->flags & IFF_VNET_HDR) {
		vnet_hdr_len = q->vnet_hdr_sz;

		err = -EINVAL;
		if (len < vnet_hdr_len)
			goto err;
		len -= vnet_hdr_len;

		err = memcpy_fromiovecend((void *)&vnet_hdr, iv, 0,
					  sizeof(vnet_hdr));
		if (err < 0)
			goto err;
		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
							vnet_hdr.hdr_len)
			vnet_hdr.hdr_len = vnet_hdr.csum_start +
						vnet_hdr.csum_offset + 2;
		err = -EINVAL;
		if (vnet_hdr.hdr_len > len)
			goto err;
	}

	err = -EINVAL;
	if (unlikely(len < ETH_HLEN))
		goto err;

	err = -EMSGSIZE;
	if (unlikely(count > UIO_MAXIOV))
		goto err;

	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY))
		zerocopy = true;

	if (zerocopy) {
		/* Userspace may produce vectors with count greater than
		 * MAX_SKB_FRAGS, so we need to linearize parts of the skb
		 * so that the rest of the data fits in the frags.
		 */
		if (count > MAX_SKB_FRAGS) {
			copylen = iov_length(iv, count - MAX_SKB_FRAGS);
			if (copylen < vnet_hdr_len)
				copylen = 0;
			else
				copylen -= vnet_hdr_len;
		}
		/* The copied part stays linear in the skb, leaving enough
		 * room to expand the head in case that is needed later.
		 * The rest of the buffer is mapped from userspace.
		 */
		if (copylen < vnet_hdr.hdr_len)
			copylen = vnet_hdr.hdr_len;
		if (!copylen)
			copylen = GOODCOPY_LEN;
	} else
		copylen = len;

	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
				vnet_hdr.hdr_len, noblock, &err);
	if (!skb)
		goto err;

	if (zerocopy)
		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
	else
		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
						   len);
	if (err)
		goto err_kfree;

	skb_set_network_header(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	skb->protocol = eth_hdr(skb)->h_proto;

	if (vnet_hdr_len) {
		err = macvtap_skb_from_vnet_hdr(skb, &vnet_hdr);
		if (err)
			goto err_kfree;
	}

	skb_probe_transport_header(skb, ETH_HLEN);

	rcu_read_lock();
	vlan = rcu_dereference(q->vlan);
	/* copy skb_ubuf_info for callback when skb has no error */
	if (zerocopy) {
		skb_shinfo(skb)->destructor_arg = m->msg_control;
		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
	}
	if (vlan)
		macvlan_start_xmit(skb, vlan->dev);
	else
		kfree_skb(skb);
	rcu_read_unlock();

	return total_len;

err_kfree:
	kfree_skb(skb);

err:
	rcu_read_lock();
	vlan = rcu_dereference(q->vlan);
	if (vlan)
		vlan->dev->stats.tx_dropped++;
	rcu_read_unlock();

	return err;
}

static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
				 unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	ssize_t result = -ENOLINK;
	struct macvtap_queue *q = file->private_data;

	result = macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
				  file->f_flags & O_NONBLOCK);
	return result;
}
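/*
 * Illustrative sketch (assumption about the caller, mirroring the
 * zerocopy path above): a consumer such as vhost-net submits frames
 * through the socket interface roughly like
 *
 *	struct msghdr m = {
 *		.msg_iov     = iov,	// vnet hdr followed by frame data
 *		.msg_iovlen  = n,
 *		.msg_control = ubuf,	// struct ubuf_info, enables zerocopy
 *		.msg_flags   = MSG_DONTWAIT,
 *	};
 *	sock_sendmsg(macvtap_get_socket(file), &m, total_len);
 *
 * The ubuf_info destructor fires once the lower device releases the
 * pinned pages.
 */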
764 */ 765 if (copylen < vnet_hdr.hdr_len) 766 copylen = vnet_hdr.hdr_len; 767 if (!copylen) 768 copylen = GOODCOPY_LEN; 769 } else 770 copylen = len; 771 772 skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen, 773 vnet_hdr.hdr_len, noblock, &err); 774 if (!skb) 775 goto err; 776 777 if (zerocopy) 778 err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count); 779 else 780 err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, 781 len); 782 if (err) 783 goto err_kfree; 784 785 skb_set_network_header(skb, ETH_HLEN); 786 skb_reset_mac_header(skb); 787 skb->protocol = eth_hdr(skb)->h_proto; 788 789 if (vnet_hdr_len) { 790 err = macvtap_skb_from_vnet_hdr(skb, &vnet_hdr); 791 if (err) 792 goto err_kfree; 793 } 794 795 skb_probe_transport_header(skb, ETH_HLEN); 796 797 rcu_read_lock(); 798 vlan = rcu_dereference(q->vlan); 799 /* copy skb_ubuf_info for callback when skb has no error */ 800 if (zerocopy) { 801 skb_shinfo(skb)->destructor_arg = m->msg_control; 802 skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; 803 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; 804 } 805 if (vlan) 806 macvlan_start_xmit(skb, vlan->dev); 807 else 808 kfree_skb(skb); 809 rcu_read_unlock(); 810 811 return total_len; 812 813 err_kfree: 814 kfree_skb(skb); 815 816 err: 817 rcu_read_lock(); 818 vlan = rcu_dereference(q->vlan); 819 if (vlan) 820 vlan->dev->stats.tx_dropped++; 821 rcu_read_unlock(); 822 823 return err; 824 } 825 826 static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, 827 unsigned long count, loff_t pos) 828 { 829 struct file *file = iocb->ki_filp; 830 ssize_t result = -ENOLINK; 831 struct macvtap_queue *q = file->private_data; 832 833 result = macvtap_get_user(q, NULL, iv, iov_length(iv, count), count, 834 file->f_flags & O_NONBLOCK); 835 return result; 836 } 837 838 /* Put packet to the user space buffer */ 839 static ssize_t macvtap_put_user(struct macvtap_queue *q, 840 const struct sk_buff *skb, 841 const struct iovec *iv, int len) 842 { 843 struct macvlan_dev *vlan; 844 int ret; 845 int vnet_hdr_len = 0; 846 int vlan_offset = 0; 847 int copied; 848 849 if (q->flags & IFF_VNET_HDR) { 850 struct virtio_net_hdr vnet_hdr; 851 vnet_hdr_len = q->vnet_hdr_sz; 852 if ((len -= vnet_hdr_len) < 0) 853 return -EINVAL; 854 855 ret = macvtap_skb_to_vnet_hdr(skb, &vnet_hdr); 856 if (ret) 857 return ret; 858 859 if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr))) 860 return -EFAULT; 861 } 862 copied = vnet_hdr_len; 863 864 if (!vlan_tx_tag_present(skb)) 865 len = min_t(int, skb->len, len); 866 else { 867 int copy; 868 struct { 869 __be16 h_vlan_proto; 870 __be16 h_vlan_TCI; 871 } veth; 872 veth.h_vlan_proto = htons(ETH_P_8021Q); 873 veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb)); 874 875 vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); 876 len = min_t(int, skb->len + VLAN_HLEN, len); 877 878 copy = min_t(int, vlan_offset, len); 879 ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy); 880 len -= copy; 881 copied += copy; 882 if (ret || !len) 883 goto done; 884 885 copy = min_t(int, sizeof(veth), len); 886 ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy); 887 len -= copy; 888 copied += copy; 889 if (ret || !len) 890 goto done; 891 } 892 893 ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len); 894 copied += len; 895 896 done: 897 rcu_read_lock(); 898 vlan = rcu_dereference(q->vlan); 899 if (vlan) 900 macvlan_count_rx(vlan, copied - vnet_hdr_len, ret == 0, 0); 901 rcu_read_unlock(); 902 903 return ret ? 
static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
			       const struct iovec *iv, unsigned long len,
			       int noblock)
{
	DEFINE_WAIT(wait);
	struct sk_buff *skb;
	ssize_t ret = 0;

	while (len) {
		if (!noblock)
			prepare_to_wait(sk_sleep(&q->sk), &wait,
					TASK_INTERRUPTIBLE);

		/* Read frames from the queue */
		skb = skb_dequeue(&q->sk.sk_receive_queue);
		if (!skb) {
			if (noblock) {
				ret = -EAGAIN;
				break;
			}
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}
			/* Nothing to read, let's sleep */
			schedule();
			continue;
		}
		ret = macvtap_put_user(q, skb, iv, len);
		kfree_skb(skb);
		break;
	}

	if (!noblock)
		finish_wait(sk_sleep(&q->sk), &wait);
	return ret;
}

static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
				unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct macvtap_queue *q = file->private_data;
	ssize_t len, ret = 0;

	len = iov_length(iv, count);
	if (len < 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = macvtap_do_read(q, iocb, iv, len, file->f_flags & O_NONBLOCK);
	ret = min_t(ssize_t, ret, len); /* XXX copied from tun.c. Why? */
out:
	return ret;
}

static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;

	ASSERT_RTNL();
	vlan = rtnl_dereference(q->vlan);
	if (vlan)
		dev_hold(vlan->dev);

	return vlan;
}

static void macvtap_put_vlan(struct macvlan_dev *vlan)
{
	dev_put(vlan->dev);
}

static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
{
	struct macvtap_queue *q = file->private_data;
	struct macvlan_dev *vlan;
	int ret;

	vlan = macvtap_get_vlan(q);
	if (!vlan)
		return -EINVAL;

	if (flags & IFF_ATTACH_QUEUE)
		ret = macvtap_enable_queue(vlan->dev, file, q);
	else if (flags & IFF_DETACH_QUEUE)
		ret = macvtap_disable_queue(q);
	else
		ret = -EINVAL;

	macvtap_put_vlan(vlan);
	return ret;
}

static int set_offload(struct macvtap_queue *q, unsigned long arg)
{
	struct macvlan_dev *vlan;
	netdev_features_t features;
	netdev_features_t feature_mask = 0;

	vlan = rtnl_dereference(q->vlan);
	if (!vlan)
		return -ENOLINK;

	features = vlan->dev->features;

	if (arg & TUN_F_CSUM) {
		feature_mask = NETIF_F_HW_CSUM;

		if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
			if (arg & TUN_F_TSO_ECN)
				feature_mask |= NETIF_F_TSO_ECN;
			if (arg & TUN_F_TSO4)
				feature_mask |= NETIF_F_TSO;
			if (arg & TUN_F_TSO6)
				feature_mask |= NETIF_F_TSO6;
		}

		if (arg & TUN_F_UFO)
			feature_mask |= NETIF_F_UFO;
	}

	/* tun/tap driver inverts the usage for TSO offloads, where
	 * setting the TSO bit means that the userspace wants to
	 * accept TSO frames and turning it off means that user space
	 * does not support TSO.
	 * For macvtap, we have to invert it to mean the same thing.
	 * When user space turns off TSO, we turn off GSO/LRO so that
	 * user-space will not receive TSO frames.
	 */
	if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO))
		features |= RX_OFFLOADS;
	else
		features &= ~RX_OFFLOADS;

	/* tap_features are the same as features on tun/tap and
	 * reflect user expectations.
	 */
	vlan->tap_features = vlan->dev->features &
			     (feature_mask | ~TUN_OFFLOADS);
	vlan->set_features = features;
	netdev_update_features(vlan->dev);

	return 0;
}

/*
 * provide compatibility with generic tun/tap interface
 */
static long macvtap_ioctl(struct file *file, unsigned int cmd,
			  unsigned long arg)
{
	struct macvtap_queue *q = file->private_data;
	struct macvlan_dev *vlan;
	void __user *argp = (void __user *)arg;
	struct ifreq __user *ifr = argp;
	unsigned int __user *up = argp;
	unsigned int u;
	int __user *sp = argp;
	int s;
	int ret;

	switch (cmd) {
	case TUNSETIFF:
		/* ignore the name, just look at flags */
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;

		ret = 0;
		if ((u & ~(IFF_VNET_HDR | IFF_MULTI_QUEUE)) !=
		    (IFF_NO_PI | IFF_TAP))
			ret = -EINVAL;
		else
			q->flags = u;

		return ret;

	case TUNGETIFF:
		rtnl_lock();
		vlan = macvtap_get_vlan(q);
		if (!vlan) {
			rtnl_unlock();
			return -ENOLINK;
		}

		ret = 0;
		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
		    put_user(q->flags, &ifr->ifr_flags))
			ret = -EFAULT;
		macvtap_put_vlan(vlan);
		rtnl_unlock();
		return ret;

	case TUNSETQUEUE:
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;
		rtnl_lock();
		ret = macvtap_ioctl_set_queue(file, u);
		rtnl_unlock();
		return ret;

	case TUNGETFEATURES:
		if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR |
			     IFF_MULTI_QUEUE, up))
			return -EFAULT;
		return 0;

	case TUNSETSNDBUF:
		if (get_user(u, up))
			return -EFAULT;

		q->sk.sk_sndbuf = u;
		return 0;

	case TUNGETVNETHDRSZ:
		s = q->vnet_hdr_sz;
		if (put_user(s, sp))
			return -EFAULT;
		return 0;

	case TUNSETVNETHDRSZ:
		if (get_user(s, sp))
			return -EFAULT;
		if (s < (int)sizeof(struct virtio_net_hdr))
			return -EINVAL;

		q->vnet_hdr_sz = s;
		return 0;

	case TUNSETOFFLOAD:
		/* let the user check for future flags */
		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
			    TUN_F_TSO_ECN | TUN_F_UFO))
			return -EINVAL;

		/* TODO: only accept frames with the features that
		 * got enabled for forwarded frames */
		if (!(q->flags & IFF_VNET_HDR))
			return -EINVAL;
		rtnl_lock();
		ret = set_offload(q, arg);
		rtnl_unlock();
		return ret;

	default:
		return -EINVAL;
	}
}

#ifdef CONFIG_COMPAT
static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#endif

static const struct file_operations macvtap_fops = {
	.owner		= THIS_MODULE,
	.open		= macvtap_open,
	.release	= macvtap_release,
	.aio_read	= macvtap_aio_read,
	.aio_write	= macvtap_aio_write,
	.poll		= macvtap_poll,
	.llseek		= no_llseek,
	.unlocked_ioctl	= macvtap_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= macvtap_compat_ioctl,
#endif
};

static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
			   struct msghdr *m, size_t total_len)
{
	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);

	return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
				m->msg_flags & MSG_DONTWAIT);
}
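/*
 * Illustrative userspace sketch (fd hypothetical): configuring a
 * macvtap fd through the tun-compatible ioctls above:
 *
 *	struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI |
 *					  IFF_VNET_HDR };
 *	ioctl(fd, TUNSETIFF, &ifr);		// name ignored, flags checked
 *	int sz = sizeof(struct virtio_net_hdr);
 *	ioctl(fd, TUNSETVNETHDRSZ, &sz);
 *	ioctl(fd, TUNSETOFFLOAD, TUN_F_CSUM | TUN_F_TSO4);
 *
 * The last call reaches set_offload() with feature_mask ending up as
 * NETIF_F_HW_CSUM | NETIF_F_TSO, so GRO/LRO stay enabled while TSO6
 * and UFO are masked out of tap_features.
 */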
static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
			   struct msghdr *m, size_t total_len,
			   int flags)
{
	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
	int ret;

	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
		return -EINVAL;
	ret = macvtap_do_read(q, iocb, m->msg_iov, total_len,
			      flags & MSG_DONTWAIT);
	if (ret > total_len) {
		m->msg_flags |= MSG_TRUNC;
		ret = flags & MSG_TRUNC ? ret : total_len;
	}
	return ret;
}

/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops macvtap_socket_ops = {
	.sendmsg = macvtap_sendmsg,
	.recvmsg = macvtap_recvmsg,
};

/* Get an underlying socket object from tun file. Returns error unless file is
 * attached to a device. The returned object works like a packet socket, it
 * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for
 * holding a reference to the file for as long as the socket is in use. */
struct socket *macvtap_get_socket(struct file *file)
{
	struct macvtap_queue *q;

	if (file->f_op != &macvtap_fops)
		return ERR_PTR(-EINVAL);
	q = file->private_data;
	if (!q)
		return ERR_PTR(-EBADFD);
	return &q->sock;
}
EXPORT_SYMBOL_GPL(macvtap_get_socket);

static int macvtap_device_event(struct notifier_block *unused,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct macvlan_dev *vlan;
	struct device *classdev;
	dev_t devt;
	int err;

	if (dev->rtnl_link_ops != &macvtap_link_ops)
		return NOTIFY_DONE;

	vlan = netdev_priv(dev);

	switch (event) {
	case NETDEV_REGISTER:
		/* Create the device node here after the network device has
		 * been registered but before register_netdevice has
		 * finished running.
		 */
		err = macvtap_get_minor(vlan);
		if (err)
			return notifier_from_errno(err);

		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
		classdev = device_create(macvtap_class, &dev->dev, devt,
					 dev, "tap%d", dev->ifindex);
		if (IS_ERR(classdev)) {
			macvtap_free_minor(vlan);
			return notifier_from_errno(PTR_ERR(classdev));
		}
		break;
	case NETDEV_UNREGISTER:
		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
		device_destroy(macvtap_class, devt);
		macvtap_free_minor(vlan);
		break;
	}

	return NOTIFY_DONE;
}

static struct notifier_block macvtap_notifier_block __read_mostly = {
	.notifier_call	= macvtap_device_event,
};

static int macvtap_init(void)
{
	int err;

	err = alloc_chrdev_region(&macvtap_major, 0,
				  MACVTAP_NUM_DEVS, "macvtap");
	if (err)
		goto out1;

	cdev_init(&macvtap_cdev, &macvtap_fops);
	err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
	if (err)
		goto out2;

	macvtap_class = class_create(THIS_MODULE, "macvtap");
	if (IS_ERR(macvtap_class)) {
		err = PTR_ERR(macvtap_class);
		goto out3;
	}

	err = register_netdevice_notifier(&macvtap_notifier_block);
	if (err)
		goto out4;

	err = macvlan_link_register(&macvtap_link_ops);
	if (err)
		goto out5;

	return 0;

out5:
	unregister_netdevice_notifier(&macvtap_notifier_block);
out4:
	class_unregister(macvtap_class);
out3:
	cdev_del(&macvtap_cdev);
out2:
	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
out1:
	return err;
}
module_init(macvtap_init);

static void macvtap_exit(void)
{
	rtnl_link_unregister(&macvtap_link_ops);
	unregister_netdevice_notifier(&macvtap_notifier_block);
	class_unregister(macvtap_class);
	cdev_del(&macvtap_cdev);
	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
}
module_exit(macvtap_exit);

MODULE_ALIAS_RTNL_LINK("macvtap");
MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
MODULE_LICENSE("GPL");