#include <linux/etherdevice.h>
#include <linux/if_macvlan.h>
#include <linux/if_vlan.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/idr.h>
#include <linux/fs.h>

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <linux/virtio_net.h>

/*
 * A macvtap queue is the central object of this driver; it connects
 * an open character device to a macvlan interface. There can be
 * multiple queues on one interface, which map back to queues
 * implemented in hardware on the underlying device.
 *
 * macvtap_proto is used to allocate queues through the sock allocation
 * mechanism.
 */
struct macvtap_queue {
	struct sock sk;
	struct socket sock;
	struct socket_wq wq;
	int vnet_hdr_sz;
	struct macvlan_dev __rcu *vlan;
	struct file *file;
	unsigned int flags;
	u16 queue_index;
	bool enabled;
	struct list_head next;
};

static struct proto macvtap_proto = {
	.name = "macvtap",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct macvtap_queue),
};

/*
 * Variables for dealing with macvtap device numbers.
 */
static dev_t macvtap_major;
#define MACVTAP_NUM_DEVS (1U << MINORBITS)
static DEFINE_MUTEX(minor_lock);
static DEFINE_IDR(minor_idr);

#define GOODCOPY_LEN 128
static struct class *macvtap_class;
static struct cdev macvtap_cdev;

static const struct proto_ops macvtap_socket_ops;

#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
		      NETIF_F_TSO6 | NETIF_F_UFO)
#define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
/*
 * RCU usage:
 * The macvtap_queue and the macvlan_dev are loosely coupled, the
 * pointers from one to the other can only be read while rcu_read_lock
 * or rtnl is held.
 *
 * Both the file and the macvlan_dev hold a reference on the macvtap_queue
 * through sock_hold(&q->sk). When the macvlan_dev goes away first,
 * q->vlan becomes inaccessible. When the file gets closed,
 * macvtap_get_queue() fails.
 *
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev. The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
 */
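
/*
 * Editor's note (added commentary, not in the original source): when the
 * file goes away last, the teardown sketched above runs roughly as
 *
 *	close(fd)
 *	  -> macvtap_release()
 *	       -> macvtap_put_queue()
 *	            sock_put(&q->sk)	(drops the macvlan_dev's reference)
 *	            sock_put(&q->sk)	(drops the file's reference)
 *	  -> __sk_free() once any SKBs still holding the sock are gone
 */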

static int macvtap_enable_queue(struct net_device *dev, struct file *file,
				struct macvtap_queue *q)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	int err = -EINVAL;

	ASSERT_RTNL();

	if (q->enabled)
		goto out;

	err = 0;
	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
	q->queue_index = vlan->numvtaps;
	q->enabled = true;

	vlan->numvtaps++;
out:
	return err;
}

static int macvtap_set_queue(struct net_device *dev, struct file *file,
			     struct macvtap_queue *q)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	int err = -EBUSY;

	rtnl_lock();
	if (vlan->numqueues == MAX_MACVTAP_QUEUES)
		goto out;

	err = 0;
	rcu_assign_pointer(q->vlan, vlan);
	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
	sock_hold(&q->sk);

	q->file = file;
	q->queue_index = vlan->numvtaps;
	q->enabled = true;
	file->private_data = q;
	list_add_tail(&q->next, &vlan->queue_list);

	vlan->numvtaps++;
	vlan->numqueues++;

out:
	rtnl_unlock();
	return err;
}

static int macvtap_disable_queue(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;
	struct macvtap_queue *nq;

	ASSERT_RTNL();
	if (!q->enabled)
		return -EINVAL;

	vlan = rtnl_dereference(q->vlan);

	if (vlan) {
		int index = q->queue_index;
		BUG_ON(index >= vlan->numvtaps);
		nq = rtnl_dereference(vlan->taps[vlan->numvtaps - 1]);
		nq->queue_index = index;

		rcu_assign_pointer(vlan->taps[index], nq);
		RCU_INIT_POINTER(vlan->taps[vlan->numvtaps - 1], NULL);
		q->enabled = false;

		vlan->numvtaps--;
	}

	return 0;
}
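
/*
 * Editor's note (added commentary): macvtap_disable_queue() keeps
 * vlan->taps[] dense by moving the last enabled queue into the slot
 * being vacated, so macvtap_get_queue() can index taps[0..numvtaps-1]
 * without scanning for holes.
 */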

/*
 * The file owning the queue got closed, give up both
 * the reference that the file holds as well as the
 * one from the macvlan_dev if that still exists.
 *
 * Holding the RTNL lock makes sure that we don't get
 * to the queue again after destroying it.
 */
static void macvtap_put_queue(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;

	rtnl_lock();
	vlan = rtnl_dereference(q->vlan);

	if (vlan) {
		if (q->enabled)
			BUG_ON(macvtap_disable_queue(q));

		vlan->numqueues--;
		RCU_INIT_POINTER(q->vlan, NULL);
		sock_put(&q->sk);
		list_del_init(&q->next);
	}

	rtnl_unlock();

	synchronize_rcu();
	sock_put(&q->sk);
}

/*
 * Select a queue based on the flow hash of the skb if one is available;
 * otherwise fall back to the rx queue recorded for the device on which
 * this packet arrived. If both fail, use the first available queue.
 * Cache vlan->numvtaps since it can become zero during the execution
 * of this function.
 */
static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
					       struct sk_buff *skb)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *tap = NULL;
	/* Access to taps array is protected by rcu, but access to numvtaps
	 * isn't. Below we use it to look up a queue, but treat it as a hint
	 * and validate that the result isn't NULL - in case we are
	 * racing against queue removal.
	 */
	int numvtaps = ACCESS_ONCE(vlan->numvtaps);
	__u32 rxq;

	if (!numvtaps)
		goto out;

	/* Check if we can use flow to select a queue */
	rxq = skb_get_rxhash(skb);
	if (rxq) {
		tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
		goto out;
	}

	if (likely(skb_rx_queue_recorded(skb))) {
		rxq = skb_get_rx_queue(skb);

		while (unlikely(rxq >= numvtaps))
			rxq -= numvtaps;

		tap = rcu_dereference(vlan->taps[rxq]);
		goto out;
	}

	tap = rcu_dereference(vlan->taps[0]);
out:
	return tap;
}
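
/*
 * Editor's note (added commentary): the flow-hash selection above is a
 * plain modulo mapping; e.g. with numvtaps == 3, a hash of 14 picks
 * taps[14 % 3] == taps[2]. Since numvtaps is only a hint, the chosen
 * slot may have been cleared concurrently, which is why every caller
 * must handle a NULL result.
 */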

/*
 * The net_device is going away, give up the reference
 * that it holds on all queues and safely set the pointer
 * from the queues to NULL.
 */
static void macvtap_del_queues(struct net_device *dev)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *q, *tmp, *qlist[MAX_MACVTAP_QUEUES];
	int i, j = 0;

	ASSERT_RTNL();
	list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) {
		list_del_init(&q->next);
		qlist[j++] = q;
		RCU_INIT_POINTER(q->vlan, NULL);
		if (q->enabled)
			vlan->numvtaps--;
		vlan->numqueues--;
	}
	for (i = 0; i < vlan->numvtaps; i++)
		RCU_INIT_POINTER(vlan->taps[i], NULL);
	BUG_ON(vlan->numvtaps);
	BUG_ON(vlan->numqueues);
	/* guarantee that any future macvtap_set_queue will fail */
	vlan->numvtaps = MAX_MACVTAP_QUEUES;

	for (--j; j >= 0; j--)
		sock_put(&qlist[j]->sk);
}

/*
 * Forward happens for data that gets sent from one macvlan
 * endpoint to another one in bridge mode. We just take
 * the skb and put it into the receive queue.
 */
static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *q = macvtap_get_queue(dev, skb);
	netdev_features_t features;

	if (!q)
		goto drop;

	if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len)
		goto drop;

	skb->dev = dev;
	/* Apply the forward feature mask so that we perform segmentation
	 * according to the user's wishes.
	 */
	features = netif_skb_features(skb) & vlan->tap_features;
	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs = __skb_gso_segment(skb, features, false);

		if (IS_ERR(segs))
			goto drop;

		if (!segs) {
			skb_queue_tail(&q->sk.sk_receive_queue, skb);
			goto wake_up;
		}

		kfree_skb(skb);
		while (segs) {
			struct sk_buff *nskb = segs->next;

			segs->next = NULL;
			skb_queue_tail(&q->sk.sk_receive_queue, segs);
			segs = nskb;
		}
	} else {
		skb_queue_tail(&q->sk.sk_receive_queue, skb);
	}

wake_up:
	wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
	return NET_RX_SUCCESS;

drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}

/*
 * Receive is for data from the external interface (lowerdev);
 * for macvtap we can treat it the same way as forward,
 * which plain macvlan cannot.
 */
static int macvtap_receive(struct sk_buff *skb)
{
	skb_push(skb, ETH_HLEN);
	return macvtap_forward(skb->dev, skb);
}
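
/*
 * Editor's note (added commentary): the skb_push(skb, ETH_HLEN) above
 * undoes the header pull that eth_type_trans() performed on the lower
 * device, so the queued frame starts at the MAC header again and
 * userspace reads a complete ethernet frame.
 */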

static int macvtap_get_minor(struct macvlan_dev *vlan)
{
	int retval;

	mutex_lock(&minor_lock);
	retval = idr_alloc(&minor_idr, vlan, 1, MACVTAP_NUM_DEVS, GFP_KERNEL);
	if (retval >= 0) {
		vlan->minor = retval;
	} else if (retval == -ENOSPC) {
		printk(KERN_ERR "too many macvtap devices\n");
		retval = -EINVAL;
	}
	mutex_unlock(&minor_lock);
	return retval < 0 ? retval : 0;
}

static void macvtap_free_minor(struct macvlan_dev *vlan)
{
	mutex_lock(&minor_lock);
	if (vlan->minor) {
		idr_remove(&minor_idr, vlan->minor);
		vlan->minor = 0;
	}
	mutex_unlock(&minor_lock);
}

static struct net_device *dev_get_by_macvtap_minor(int minor)
{
	struct net_device *dev = NULL;
	struct macvlan_dev *vlan;

	mutex_lock(&minor_lock);
	vlan = idr_find(&minor_idr, minor);
	if (vlan) {
		dev = vlan->dev;
		dev_hold(dev);
	}
	mutex_unlock(&minor_lock);
	return dev;
}

static int macvtap_newlink(struct net *src_net,
			   struct net_device *dev,
			   struct nlattr *tb[],
			   struct nlattr *data[])
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	INIT_LIST_HEAD(&vlan->queue_list);

	/* Since macvlan supports all offloads by default, make
	 * the tap support all offloads as well.
	 */
	vlan->tap_features = TUN_OFFLOADS;

	/* Don't put anything that may fail after macvlan_common_newlink
	 * because we can't undo what it does.
	 */
	return macvlan_common_newlink(src_net, dev, tb, data,
				      macvtap_receive, macvtap_forward);
}

static void macvtap_dellink(struct net_device *dev,
			    struct list_head *head)
{
	macvtap_del_queues(dev);
	macvlan_dellink(dev, head);
}

static void macvtap_setup(struct net_device *dev)
{
	macvlan_common_setup(dev);
	dev->tx_queue_len = TUN_READQ_SIZE;
}

static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
	.kind		= "macvtap",
	.setup		= macvtap_setup,
	.newlink	= macvtap_newlink,
	.dellink	= macvtap_dellink,
};
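
/*
 * Editor's note (added usage sketch; the interface and device names are
 * placeholders): a macvtap instance is typically created over an
 * existing lower device with iproute2, e.g.
 *
 *	ip link add link eth0 name macvtap0 type macvtap mode bridge
 *
 * which ends up in macvtap_newlink() above; the NETDEV_REGISTER
 * notifier further down then creates the /dev/tapN node, with N being
 * the new interface's ifindex.
 */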

static void macvtap_sock_write_space(struct sock *sk)
{
	wait_queue_head_t *wqueue;

	if (!sock_writeable(sk) ||
	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
		return;

	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
}

static void macvtap_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_receive_queue);
}

static int macvtap_open(struct inode *inode, struct file *file)
{
	struct net *net = current->nsproxy->net_ns;
	struct net_device *dev = dev_get_by_macvtap_minor(iminor(inode));
	struct macvtap_queue *q;
	int err;

	err = -ENODEV;
	if (!dev)
		goto out;

	err = -ENOMEM;
	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
					     &macvtap_proto);
	if (!q)
		goto out;

	RCU_INIT_POINTER(q->sock.wq, &q->wq);
	init_waitqueue_head(&q->wq.wait);
	q->sock.type = SOCK_RAW;
	q->sock.state = SS_CONNECTED;
	q->sock.file = file;
	q->sock.ops = &macvtap_socket_ops;
	sock_init_data(&q->sock, &q->sk);
	q->sk.sk_write_space = macvtap_sock_write_space;
	q->sk.sk_destruct = macvtap_sock_destruct;
	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

	/*
	 * So far only KVM's virtio_net uses macvtap; enable zero copy between
	 * guest kernel and host kernel when the lower device supports zerocopy.
	 *
	 * macvlan supports zerocopy iff the lower device supports zero copy,
	 * so we don't have to look at the lower device directly.
	 */
	if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
		sock_set_flag(&q->sk, SOCK_ZEROCOPY);

	err = macvtap_set_queue(dev, file, q);
	if (err)
		sock_put(&q->sk);

out:
	if (dev)
		dev_put(dev);

	return err;
}

static int macvtap_release(struct inode *inode, struct file *file)
{
	struct macvtap_queue *q = file->private_data;
	macvtap_put_queue(q);
	return 0;
}

static unsigned int macvtap_poll(struct file *file, poll_table *wait)
{
	struct macvtap_queue *q = file->private_data;
	unsigned int mask = POLLERR;

	if (!q)
		goto out;

	mask = 0;
	poll_wait(file, &q->wq.wait, wait);

	if (!skb_queue_empty(&q->sk.sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	if (sock_writeable(&q->sk) ||
	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
	     sock_writeable(&q->sk)))
		mask |= POLLOUT | POLLWRNORM;

out:
	return mask;
}
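
/*
 * Editor's note (added commentary for the helper below): the skb is
 * laid out as
 *
 *	| prepad (reserved) | linear (copied data) | paged (len - linear) |
 *
 * Buffers smaller than a page are made fully linear; otherwise only
 * "linear" bytes go into the head and the remainder is left to page
 * frags.
 */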

static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
						size_t len, size_t linear,
						int noblock, int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err);
	if (!skb)
		return NULL;

	skb_reserve(skb, prepad);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}

/* Set skb frags from an iovec; this could move to core network code for reuse. */
static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
				  int offset, size_t count)
{
	int len = iov_length(from, count) - offset;
	int copy = skb_headlen(skb);
	int size, offset1 = 0;
	int i = 0;

	/* Skip over from offset */
	while (count && (offset >= from->iov_len)) {
		offset -= from->iov_len;
		++from;
		--count;
	}

	/* copy up to skb headlen */
	while (count && (copy > 0)) {
		size = min_t(unsigned int, copy, from->iov_len - offset);
		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
				   size))
			return -EFAULT;
		if (copy > size) {
			++from;
			--count;
			offset = 0;
		} else
			offset += size;
		copy -= size;
		offset1 += size;
	}

	if (len == offset1)
		return 0;

	while (count--) {
		struct page *page[MAX_SKB_FRAGS];
		int num_pages;
		unsigned long base;
		unsigned long truesize;

		len = from->iov_len - offset;
		if (!len) {
			offset = 0;
			++from;
			continue;
		}
		base = (unsigned long)from->iov_base + offset;
		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
		if (i + size > MAX_SKB_FRAGS)
			return -EMSGSIZE;
		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
		if (num_pages != size) {
			int j;

			for (j = 0; j < num_pages; j++)
				put_page(page[i + j]);
			return -EFAULT;
		}
		truesize = size * PAGE_SIZE;
		skb->data_len += len;
		skb->len += len;
		skb->truesize += truesize;
		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
		while (len) {
			int off = base & ~PAGE_MASK;
			int size = min_t(int, len, PAGE_SIZE - off);
			__skb_fill_page_desc(skb, i, page[i], off, size);
			skb_shinfo(skb)->nr_frags++;
			base += size;
			len -= size;
			i++;
		}
		offset = 0;
		++from;
	}
	return 0;
}
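
/*
 * Editor's note (added commentary): on a partial get_user_pages_fast()
 * result, the loop above releases the pages it did pin and returns
 * -EFAULT; the caller then frees the skb, so no page references are
 * leaked.
 */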

/*
 * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
 * be shared with the tun/tap driver.
 */
static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb,
				     struct virtio_net_hdr *vnet_hdr)
{
	unsigned short gso_type = 0;

	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
			gso_type = SKB_GSO_TCPV4;
			break;
		case VIRTIO_NET_HDR_GSO_TCPV6:
			gso_type = SKB_GSO_TCPV6;
			break;
		case VIRTIO_NET_HDR_GSO_UDP:
			gso_type = SKB_GSO_UDP;
			break;
		default:
			return -EINVAL;
		}

		if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
			gso_type |= SKB_GSO_TCP_ECN;

		if (vnet_hdr->gso_size == 0)
			return -EINVAL;
	}

	if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		if (!skb_partial_csum_set(skb, vnet_hdr->csum_start,
					  vnet_hdr->csum_offset))
			return -EINVAL;
	}

	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		skb_shinfo(skb)->gso_size = vnet_hdr->gso_size;
		skb_shinfo(skb)->gso_type = gso_type;

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}
	return 0;
}

static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
				   struct virtio_net_hdr *vnet_hdr)
{
	memset(vnet_hdr, 0, sizeof(*vnet_hdr));

	if (skb_is_gso(skb)) {
		struct skb_shared_info *sinfo = skb_shinfo(skb);

		/* This is a hint as to how much should be linear. */
		vnet_hdr->hdr_len = skb_headlen(skb);
		vnet_hdr->gso_size = sinfo->gso_size;
		if (sinfo->gso_type & SKB_GSO_TCPV4)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
		else if (sinfo->gso_type & SKB_GSO_TCPV6)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
		else if (sinfo->gso_type & SKB_GSO_UDP)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
		else
			BUG();
		if (sinfo->gso_type & SKB_GSO_TCP_ECN)
			vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
	} else
		vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
		vnet_hdr->csum_start = skb_checksum_start_offset(skb);
		vnet_hdr->csum_offset = skb->csum_offset;
	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
		vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
	} /* else everything is zero */

	return 0;
}
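
/*
 * Editor's note (added example with illustrative values): a TCPv4 GSO
 * skb with a 1448-byte MSS and 66 bytes of headers in the linear area
 * would be exported by macvtap_skb_to_vnet_hdr() roughly as
 *
 *	.hdr_len     = 66,
 *	.gso_size    = 1448,
 *	.gso_type    = VIRTIO_NET_HDR_GSO_TCPV4,
 *	.flags       = VIRTIO_NET_HDR_F_NEEDS_CSUM,
 *	.csum_start  = 34,	(ethernet + IPv4 header)
 *	.csum_offset = 16	(offset of the TCP checksum field)
 */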

/* Get packet from user space buffer */
static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
				const struct iovec *iv, unsigned long total_len,
				size_t count, int noblock)
{
	struct sk_buff *skb;
	struct macvlan_dev *vlan;
	unsigned long len = total_len;
	int err;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int vnet_hdr_len = 0;
	int copylen = 0;
	bool zerocopy = false;
	size_t linear;

	if (q->flags & IFF_VNET_HDR) {
		vnet_hdr_len = q->vnet_hdr_sz;

		err = -EINVAL;
		if (len < vnet_hdr_len)
			goto err;
		len -= vnet_hdr_len;

		err = memcpy_fromiovecend((void *)&vnet_hdr, iv, 0,
					  sizeof(vnet_hdr));
		if (err < 0)
			goto err;
		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
							vnet_hdr.hdr_len)
			vnet_hdr.hdr_len = vnet_hdr.csum_start +
						vnet_hdr.csum_offset + 2;
		err = -EINVAL;
		if (vnet_hdr.hdr_len > len)
			goto err;
	}

	err = -EINVAL;
	if (unlikely(len < ETH_HLEN))
		goto err;

	err = -EMSGSIZE;
	if (unlikely(count > UIO_MAXIOV))
		goto err;

	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY))
		zerocopy = true;

	if (zerocopy) {
		/* Userspace may produce vectors with count greater than
		 * MAX_SKB_FRAGS, so we need to linearize parts of the skb
		 * to let the rest of the data fit in the frags.
		 */
		if (count > MAX_SKB_FRAGS) {
			copylen = iov_length(iv, count - MAX_SKB_FRAGS);
			if (copylen < vnet_hdr_len)
				copylen = 0;
			else
				copylen -= vnet_hdr_len;
		}
		/* Enough bytes are copied into the skb's linear area that
		 * there is room for expanding the skb head if that is later
		 * needed. The rest of the buffer is mapped from userspace.
		 */
		if (copylen < vnet_hdr.hdr_len)
			copylen = vnet_hdr.hdr_len;
		if (!copylen)
			copylen = GOODCOPY_LEN;
		linear = copylen;
	} else {
		copylen = len;
		linear = vnet_hdr.hdr_len;
	}

	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
				linear, noblock, &err);
	if (!skb)
		goto err;

	if (zerocopy)
		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
	else
		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
						   len);
	if (err)
		goto err_kfree;

	skb_set_network_header(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	skb->protocol = eth_hdr(skb)->h_proto;

	if (vnet_hdr_len) {
		err = macvtap_skb_from_vnet_hdr(skb, &vnet_hdr);
		if (err)
			goto err_kfree;
	}

	skb_probe_transport_header(skb, ETH_HLEN);

	rcu_read_lock();
	vlan = rcu_dereference(q->vlan);
	/* copy skb_ubuf_info for callback when skb has no error */
	if (zerocopy) {
		skb_shinfo(skb)->destructor_arg = m->msg_control;
		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
	}
	if (vlan)
		macvlan_start_xmit(skb, vlan->dev);
	else
		kfree_skb(skb);
	rcu_read_unlock();

	return total_len;

err_kfree:
	kfree_skb(skb);

err:
	rcu_read_lock();
	vlan = rcu_dereference(q->vlan);
	if (vlan)
		vlan->dev->stats.tx_dropped++;
	rcu_read_unlock();

	return err;
}

static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
				 unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct macvtap_queue *q = file->private_data;

	return macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
				file->f_flags & O_NONBLOCK);
}
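
/*
 * Editor's note (added commentary): each write()/sendmsg() on the tap
 * carries exactly one ethernet frame, optionally preceded by a
 * virtio_net_hdr when IFF_VNET_HDR is set; there is no extra packet
 * information preamble, since the flags are fixed to include IFF_NO_PI.
 */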

/* Put packet to the user space buffer */
static ssize_t macvtap_put_user(struct macvtap_queue *q,
				const struct sk_buff *skb,
				const struct iovec *iv, int len)
{
	struct macvlan_dev *vlan;
	int ret;
	int vnet_hdr_len = 0;
	int vlan_offset = 0;
	int copied;

	if (q->flags & IFF_VNET_HDR) {
		struct virtio_net_hdr vnet_hdr;
		vnet_hdr_len = q->vnet_hdr_sz;
		if ((len -= vnet_hdr_len) < 0)
			return -EINVAL;

		ret = macvtap_skb_to_vnet_hdr(skb, &vnet_hdr);
		if (ret)
			return ret;

		if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
			return -EFAULT;
	}
	copied = vnet_hdr_len;

	if (!vlan_tx_tag_present(skb))
		len = min_t(int, skb->len, len);
	else {
		int copy;
		struct {
			__be16 h_vlan_proto;
			__be16 h_vlan_TCI;
		} veth;
		veth.h_vlan_proto = htons(ETH_P_8021Q);
		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));

		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
		len = min_t(int, skb->len + VLAN_HLEN, len);

		copy = min_t(int, vlan_offset, len);
		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
		len -= copy;
		copied += copy;
		if (ret || !len)
			goto done;

		copy = min_t(int, sizeof(veth), len);
		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
		len -= copy;
		copied += copy;
		if (ret || !len)
			goto done;
	}

	ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
	copied += len;

done:
	rcu_read_lock();
	vlan = rcu_dereference(q->vlan);
	if (vlan)
		macvlan_count_rx(vlan, copied - vnet_hdr_len, ret == 0, 0);
	rcu_read_unlock();

	return ret ? ret : copied;
}

static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
			       const struct iovec *iv, unsigned long len,
			       int noblock)
{
	DEFINE_WAIT(wait);
	struct sk_buff *skb;
	ssize_t ret = 0;

	while (len) {
		if (!noblock)
			prepare_to_wait(sk_sleep(&q->sk), &wait,
					TASK_INTERRUPTIBLE);

		/* Read frames from the queue */
		skb = skb_dequeue(&q->sk.sk_receive_queue);
		if (!skb) {
			if (noblock) {
				ret = -EAGAIN;
				break;
			}
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}
			/* Nothing to read, let's sleep */
			schedule();
			continue;
		}
		ret = macvtap_put_user(q, skb, iv, len);
		kfree_skb(skb);
		break;
	}

	if (!noblock)
		finish_wait(sk_sleep(&q->sk), &wait);
	return ret;
}

static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
				unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct macvtap_queue *q = file->private_data;
	ssize_t len, ret = 0;

	len = iov_length(iv, count);
	if (len < 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = macvtap_do_read(q, iocb, iv, len, file->f_flags & O_NONBLOCK);
	ret = min_t(ssize_t, ret, len); /* XXX copied from tun.c. Why? */
out:
	return ret;
}
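
/*
 * Editor's note (added commentary): reads are likewise frame-at-a-time.
 * If the iovec is smaller than the frame (plus any vnet header), the
 * excess bytes are simply not copied, and the frame is consumed either
 * way since macvtap_do_read() frees the skb after the copy.
 */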

static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;

	ASSERT_RTNL();
	vlan = rtnl_dereference(q->vlan);
	if (vlan)
		dev_hold(vlan->dev);

	return vlan;
}

static void macvtap_put_vlan(struct macvlan_dev *vlan)
{
	dev_put(vlan->dev);
}

static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
{
	struct macvtap_queue *q = file->private_data;
	struct macvlan_dev *vlan;
	int ret;

	vlan = macvtap_get_vlan(q);
	if (!vlan)
		return -EINVAL;

	if (flags & IFF_ATTACH_QUEUE)
		ret = macvtap_enable_queue(vlan->dev, file, q);
	else if (flags & IFF_DETACH_QUEUE)
		ret = macvtap_disable_queue(q);
	else
		ret = -EINVAL;

	macvtap_put_vlan(vlan);
	return ret;
}

static int set_offload(struct macvtap_queue *q, unsigned long arg)
{
	struct macvlan_dev *vlan;
	netdev_features_t features;
	netdev_features_t feature_mask = 0;

	vlan = rtnl_dereference(q->vlan);
	if (!vlan)
		return -ENOLINK;

	features = vlan->dev->features;

	if (arg & TUN_F_CSUM) {
		feature_mask = NETIF_F_HW_CSUM;

		if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
			if (arg & TUN_F_TSO_ECN)
				feature_mask |= NETIF_F_TSO_ECN;
			if (arg & TUN_F_TSO4)
				feature_mask |= NETIF_F_TSO;
			if (arg & TUN_F_TSO6)
				feature_mask |= NETIF_F_TSO6;
		}

		if (arg & TUN_F_UFO)
			feature_mask |= NETIF_F_UFO;
	}

	/* The tun/tap driver inverts the usage for TSO offloads, where
	 * setting the TSO bit means that the userspace wants to
	 * accept TSO frames and turning it off means that user space
	 * does not support TSO.
	 * For macvtap, we have to invert it to mean the same thing.
	 * When user space turns off TSO, we turn off GSO/LRO so that
	 * user-space will not receive TSO frames.
	 */
	if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO))
		features |= RX_OFFLOADS;
	else
		features &= ~RX_OFFLOADS;

	/* tap_features are the same as features on tun/tap and
	 * reflect user expectations.
	 */
	vlan->tap_features = vlan->dev->features &
			     (feature_mask | ~TUN_OFFLOADS);
	vlan->set_features = features;
	netdev_update_features(vlan->dev);

	return 0;
}

/*
 * Provide compatibility with the generic tun/tap interface.
 */
static long macvtap_ioctl(struct file *file, unsigned int cmd,
			  unsigned long arg)
{
	struct macvtap_queue *q = file->private_data;
	struct macvlan_dev *vlan;
	void __user *argp = (void __user *)arg;
	struct ifreq __user *ifr = argp;
	unsigned int __user *up = argp;
	unsigned int u;
	int __user *sp = argp;
	int s;
	int ret;

	switch (cmd) {
	case TUNSETIFF:
		/* ignore the name, just look at flags */
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;

		ret = 0;
		if ((u & ~(IFF_VNET_HDR | IFF_MULTI_QUEUE)) !=
		    (IFF_NO_PI | IFF_TAP))
			ret = -EINVAL;
		else
			q->flags = u;

		return ret;

	case TUNGETIFF:
		rtnl_lock();
		vlan = macvtap_get_vlan(q);
		if (!vlan) {
			rtnl_unlock();
			return -ENOLINK;
		}

		ret = 0;
		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
		    put_user(q->flags, &ifr->ifr_flags))
			ret = -EFAULT;
		macvtap_put_vlan(vlan);
		rtnl_unlock();
		return ret;

	case TUNSETQUEUE:
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;
		rtnl_lock();
		ret = macvtap_ioctl_set_queue(file, u);
		rtnl_unlock();
		return ret;

	case TUNGETFEATURES:
		if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR |
			     IFF_MULTI_QUEUE, up))
			return -EFAULT;
		return 0;

	case TUNSETSNDBUF:
		if (get_user(u, up))
			return -EFAULT;

		q->sk.sk_sndbuf = u;
		return 0;

	case TUNGETVNETHDRSZ:
		s = q->vnet_hdr_sz;
		if (put_user(s, sp))
			return -EFAULT;
		return 0;

	case TUNSETVNETHDRSZ:
		if (get_user(s, sp))
			return -EFAULT;
		if (s < (int)sizeof(struct virtio_net_hdr))
			return -EINVAL;

		q->vnet_hdr_sz = s;
		return 0;

	case TUNSETOFFLOAD:
		/* let the user check for future flags */
		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
			    TUN_F_TSO_ECN | TUN_F_UFO))
			return -EINVAL;

		/* TODO: only accept frames with the features that
		 * got enabled for forwarded frames */
		if (!(q->flags & IFF_VNET_HDR))
			return -EINVAL;
		rtnl_lock();
		ret = set_offload(q, arg);
		rtnl_unlock();
		return ret;

	default:
		return -EINVAL;
	}
}

#ifdef CONFIG_COMPAT
static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#endif

static const struct file_operations macvtap_fops = {
	.owner		= THIS_MODULE,
	.open		= macvtap_open,
	.release	= macvtap_release,
	.aio_read	= macvtap_aio_read,
	.aio_write	= macvtap_aio_write,
	.poll		= macvtap_poll,
	.llseek		= no_llseek,
	.unlocked_ioctl	= macvtap_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= macvtap_compat_ioctl,
#endif
};
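
/*
 * Editor's note (added usage sketch, not from the original source; the
 * device path and sizes are placeholders): a minimal userspace consumer
 * of the character device could look like
 *
 *	int fd = open("/dev/tap5", O_RDWR);	// 5 == ifindex of the tap
 *	int hdrsz = sizeof(struct virtio_net_hdr);
 *	ioctl(fd, TUNSETVNETHDRSZ, &hdrsz);
 *	char buf[65536];
 *	ssize_t n = read(fd, buf, sizeof(buf));	// one frame per read
 *
 * using only ioctls handled by macvtap_ioctl() above.
 */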

static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
			   struct msghdr *m, size_t total_len)
{
	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
	return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
				m->msg_flags & MSG_DONTWAIT);
}

static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
			   struct msghdr *m, size_t total_len,
			   int flags)
{
	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
	int ret;
	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
		return -EINVAL;
	ret = macvtap_do_read(q, iocb, m->msg_iov, total_len,
			      flags & MSG_DONTWAIT);
	if (ret > total_len) {
		m->msg_flags |= MSG_TRUNC;
		ret = flags & MSG_TRUNC ? ret : total_len;
	}
	return ret;
}

/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops macvtap_socket_ops = {
	.sendmsg = macvtap_sendmsg,
	.recvmsg = macvtap_recvmsg,
};

/* Get an underlying socket object from a tap file. Returns an error unless
 * the file is attached to a device. The returned object works like a packet
 * socket; it can be used for sock_sendmsg/sock_recvmsg. The caller is
 * responsible for holding a reference to the file for as long as the socket
 * is in use.
 */
struct socket *macvtap_get_socket(struct file *file)
{
	struct macvtap_queue *q;
	if (file->f_op != &macvtap_fops)
		return ERR_PTR(-EINVAL);
	q = file->private_data;
	if (!q)
		return ERR_PTR(-EBADFD);
	return &q->sock;
}
EXPORT_SYMBOL_GPL(macvtap_get_socket);

static int macvtap_device_event(struct notifier_block *unused,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct macvlan_dev *vlan;
	struct device *classdev;
	dev_t devt;
	int err;

	if (dev->rtnl_link_ops != &macvtap_link_ops)
		return NOTIFY_DONE;

	vlan = netdev_priv(dev);

	switch (event) {
	case NETDEV_REGISTER:
		/* Create the device node here after the network device has
		 * been registered but before register_netdevice has
		 * finished running.
		 */
		err = macvtap_get_minor(vlan);
		if (err)
			return notifier_from_errno(err);

		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
		classdev = device_create(macvtap_class, &dev->dev, devt,
					 dev, "tap%d", dev->ifindex);
		if (IS_ERR(classdev)) {
			macvtap_free_minor(vlan);
			return notifier_from_errno(PTR_ERR(classdev));
		}
		break;
	case NETDEV_UNREGISTER:
		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
		device_destroy(macvtap_class, devt);
		macvtap_free_minor(vlan);
		break;
	}

	return NOTIFY_DONE;
}

static struct notifier_block macvtap_notifier_block __read_mostly = {
	.notifier_call	= macvtap_device_event,
};

static int macvtap_init(void)
{
	int err;

	err = alloc_chrdev_region(&macvtap_major, 0,
				  MACVTAP_NUM_DEVS, "macvtap");
	if (err)
		goto out1;

	cdev_init(&macvtap_cdev, &macvtap_fops);
	err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
	if (err)
		goto out2;

	macvtap_class = class_create(THIS_MODULE, "macvtap");
	if (IS_ERR(macvtap_class)) {
		err = PTR_ERR(macvtap_class);
		goto out3;
	}

	err = register_netdevice_notifier(&macvtap_notifier_block);
	if (err)
		goto out4;

	err = macvlan_link_register(&macvtap_link_ops);
	if (err)
		goto out5;

	return 0;

out5:
	unregister_netdevice_notifier(&macvtap_notifier_block);
out4:
	class_unregister(macvtap_class);
out3:
	cdev_del(&macvtap_cdev);
out2:
	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
out1:
	return err;
}
module_init(macvtap_init);

static void macvtap_exit(void)
{
	rtnl_link_unregister(&macvtap_link_ops);
	unregister_netdevice_notifier(&macvtap_notifier_block);
	class_unregister(macvtap_class);
	cdev_del(&macvtap_cdev);
	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
}
module_exit(macvtap_exit);

MODULE_ALIAS_RTNL_LINK("macvtap");
MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
MODULE_LICENSE("GPL");