#include <linux/etherdevice.h>
#include <linux/if_macvlan.h>
#include <linux/if_vlan.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/idr.h>
#include <linux/fs.h>

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <linux/virtio_net.h>

/*
 * A macvtap queue is the central object of this driver; it connects
 * an open character device to a macvlan interface. There can be
 * multiple queues on one interface, which map back to queues
 * implemented in hardware on the underlying device.
 *
 * macvtap_proto is used to allocate queues through the sock allocation
 * mechanism.
 */
struct macvtap_queue {
	struct sock sk;
	struct socket sock;
	struct socket_wq wq;
	int vnet_hdr_sz;
	struct macvlan_dev __rcu *vlan;
	struct file *file;
	unsigned int flags;
	u16 queue_index;
	bool enabled;
	struct list_head next;
};

static struct proto macvtap_proto = {
	.name = "macvtap",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct macvtap_queue),
};

/*
 * Variables for dealing with macvtap device numbers.
 */
static dev_t macvtap_major;
#define MACVTAP_NUM_DEVS (1U << MINORBITS)
static DEFINE_MUTEX(minor_lock);
static DEFINE_IDR(minor_idr);

#define GOODCOPY_LEN 128
static struct class *macvtap_class;
static struct cdev macvtap_cdev;

static const struct proto_ops macvtap_socket_ops;

#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
		      NETIF_F_TSO6 | NETIF_F_UFO)
#define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)

/*
 * RCU usage:
 * The macvtap_queue and the macvlan_dev are loosely coupled; the
 * pointers from one to the other can only be read while rcu_read_lock
 * or rtnl is held.
 *
 * Both the file and the macvlan_dev hold a reference on the macvtap_queue
 * through sock_hold(&q->sk). When the macvlan_dev goes away first,
 * q->vlan becomes inaccessible. When the file gets closed,
 * macvtap_get_queue() fails.
 *
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev. The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
 */
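/*
 * For illustration, the reader-side pattern implied by the RCU rules
 * above (and used by the data-path functions below) looks like:
 *
 *	rcu_read_lock();
 *	vlan = rcu_dereference(q->vlan);
 *	if (vlan)
 *		... use vlan->dev; must not sleep or block ...
 *	rcu_read_unlock();
 *
 * Writers instead publish with rcu_assign_pointer() while holding rtnl
 * and wait for readers with synchronize_rcu() before dropping the final
 * reference, as macvtap_put_queue() does.
 */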
static int macvtap_enable_queue(struct net_device *dev, struct file *file,
				struct macvtap_queue *q)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	int err = -EINVAL;

	ASSERT_RTNL();

	if (q->enabled)
		goto out;

	err = 0;
	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
	q->queue_index = vlan->numvtaps;
	q->enabled = true;

	vlan->numvtaps++;
out:
	return err;
}

static int macvtap_set_queue(struct net_device *dev, struct file *file,
			     struct macvtap_queue *q)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	int err = -EBUSY;

	rtnl_lock();
	if (vlan->numqueues == MAX_MACVTAP_QUEUES)
		goto out;

	err = 0;
	rcu_assign_pointer(q->vlan, vlan);
	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
	sock_hold(&q->sk);

	q->file = file;
	q->queue_index = vlan->numvtaps;
	q->enabled = true;
	file->private_data = q;
	list_add_tail(&q->next, &vlan->queue_list);

	vlan->numvtaps++;
	vlan->numqueues++;

out:
	rtnl_unlock();
	return err;
}

static int macvtap_disable_queue(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;
	struct macvtap_queue *nq;

	ASSERT_RTNL();
	if (!q->enabled)
		return -EINVAL;

	vlan = rtnl_dereference(q->vlan);

	if (vlan) {
		int index = q->queue_index;
		BUG_ON(index >= vlan->numvtaps);
		nq = rtnl_dereference(vlan->taps[vlan->numvtaps - 1]);
		nq->queue_index = index;

		rcu_assign_pointer(vlan->taps[index], nq);
		RCU_INIT_POINTER(vlan->taps[vlan->numvtaps - 1], NULL);
		q->enabled = false;

		vlan->numvtaps--;
	}

	return 0;
}
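/*
 * Worked example of the swap-with-last compaction that
 * macvtap_disable_queue() performs: with taps = { q0, q1, q2 } and
 * numvtaps == 3, disabling q1 moves q2 into slot 1 (updating its
 * queue_index to 1), clears slot 2, and leaves taps = { q0, q2 } with
 * numvtaps == 2. The array of enabled taps therefore stays dense,
 * which is what lets macvtap_get_queue() below index it with a simple
 * modulo.
 */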
/*
 * The file owning the queue got closed, give up both
 * the reference that the file holds as well as the
 * one from the macvlan_dev if that still exists.
 *
 * Holding the rtnl lock makes sure that we don't get
 * to the queue again after destroying it.
 */
static void macvtap_put_queue(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;

	rtnl_lock();
	vlan = rtnl_dereference(q->vlan);

	if (vlan) {
		if (q->enabled)
			BUG_ON(macvtap_disable_queue(q));

		vlan->numqueues--;
		RCU_INIT_POINTER(q->vlan, NULL);
		sock_put(&q->sk);
		list_del_init(&q->next);
	}

	rtnl_unlock();

	synchronize_rcu();
	sock_put(&q->sk);
}

/*
 * Select a queue for this packet: if the skb carries a flow hash, use
 * it to pick a queue; otherwise fall back to the rx queue recorded on
 * the incoming device; if all else fails, take the first queue.
 * Cache vlan->numvtaps since it can become zero during the execution
 * of this function.
 */
static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
					       struct sk_buff *skb)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *tap = NULL;
	/* Access to taps array is protected by rcu, but access to numvtaps
	 * isn't. Below we use it to look up a queue, but treat it as a hint
	 * and validate that the result isn't NULL - in case we are
	 * racing against queue removal.
	 */
	int numvtaps = ACCESS_ONCE(vlan->numvtaps);
	__u32 rxq;

	if (!numvtaps)
		goto out;

	/* Check if we can use flow to select a queue */
	rxq = skb_get_rxhash(skb);
	if (rxq) {
		tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
		goto out;
	}

	if (likely(skb_rx_queue_recorded(skb))) {
		rxq = skb_get_rx_queue(skb);

		while (unlikely(rxq >= numvtaps))
			rxq -= numvtaps;

		tap = rcu_dereference(vlan->taps[rxq]);
		goto out;
	}

	tap = rcu_dereference(vlan->taps[0]);
out:
	return tap;
}

/*
 * The net_device is going away, give up the reference
 * that it holds on all queues and safely set the pointer
 * from the queues to NULL.
 */
static void macvtap_del_queues(struct net_device *dev)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *q, *tmp, *qlist[MAX_MACVTAP_QUEUES];
	int i, j = 0;

	ASSERT_RTNL();
	list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) {
		list_del_init(&q->next);
		qlist[j++] = q;
		RCU_INIT_POINTER(q->vlan, NULL);
		if (q->enabled)
			vlan->numvtaps--;
		vlan->numqueues--;
	}
	for (i = 0; i < vlan->numvtaps; i++)
		RCU_INIT_POINTER(vlan->taps[i], NULL);
	BUG_ON(vlan->numvtaps);
	BUG_ON(vlan->numqueues);
	/* guarantee that any future macvtap_set_queue will fail */
	vlan->numvtaps = MAX_MACVTAP_QUEUES;

	for (--j; j >= 0; j--)
		sock_put(&qlist[j]->sk);
}

/*
 * Forward happens for data that gets sent from one macvlan
 * endpoint to another one in bridge mode. We just take
 * the skb and put it into the receive queue.
 */
static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvtap_queue *q = macvtap_get_queue(dev, skb);
	netdev_features_t features;
	if (!q)
		goto drop;

	if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len)
		goto drop;

	skb->dev = dev;
	/* Apply the forward feature mask so that we perform segmentation
	 * according to the user's wishes.
	 */
	features = netif_skb_features(skb) & vlan->tap_features;
	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs = __skb_gso_segment(skb, features, false);

		if (IS_ERR(segs))
			goto drop;

		if (!segs) {
			skb_queue_tail(&q->sk.sk_receive_queue, skb);
			goto wake_up;
		}

		kfree_skb(skb);
		while (segs) {
			struct sk_buff *nskb = segs->next;

			segs->next = NULL;
			skb_queue_tail(&q->sk.sk_receive_queue, segs);
			segs = nskb;
		}
	} else {
		skb_queue_tail(&q->sk.sk_receive_queue, skb);
	}

wake_up:
	wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
	return NET_RX_SUCCESS;

drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}
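/*
 * Worked example for the segmentation in macvtap_forward(): if
 * userspace cleared TUN_F_TSO4 via TUNSETOFFLOAD, vlan->tap_features
 * lacks NETIF_F_TSO, so netif_needs_gso() is true for a TCPv4 GSO skb
 * and the skb is software-segmented into MTU-sized frames before being
 * queued - the reader never sees a TSO frame it did not ask for.
 */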
/*
 * Receive is for data from the external interface (lowerdev);
 * in the case of macvtap, we can treat that the same way as
 * forward, which macvlan cannot.
 */
static int macvtap_receive(struct sk_buff *skb)
{
	skb_push(skb, ETH_HLEN);
	return macvtap_forward(skb->dev, skb);
}

static int macvtap_get_minor(struct macvlan_dev *vlan)
{
	int retval = -ENOMEM;

	mutex_lock(&minor_lock);
	retval = idr_alloc(&minor_idr, vlan, 1, MACVTAP_NUM_DEVS, GFP_KERNEL);
	if (retval >= 0) {
		vlan->minor = retval;
	} else if (retval == -ENOSPC) {
		printk(KERN_ERR "too many macvtap devices\n");
		retval = -EINVAL;
	}
	mutex_unlock(&minor_lock);
	return retval < 0 ? retval : 0;
}

static void macvtap_free_minor(struct macvlan_dev *vlan)
{
	mutex_lock(&minor_lock);
	if (vlan->minor) {
		idr_remove(&minor_idr, vlan->minor);
		vlan->minor = 0;
	}
	mutex_unlock(&minor_lock);
}

static struct net_device *dev_get_by_macvtap_minor(int minor)
{
	struct net_device *dev = NULL;
	struct macvlan_dev *vlan;

	mutex_lock(&minor_lock);
	vlan = idr_find(&minor_idr, minor);
	if (vlan) {
		dev = vlan->dev;
		dev_hold(dev);
	}
	mutex_unlock(&minor_lock);
	return dev;
}

static int macvtap_newlink(struct net *src_net,
			   struct net_device *dev,
			   struct nlattr *tb[],
			   struct nlattr *data[])
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	INIT_LIST_HEAD(&vlan->queue_list);

	/* Since macvlan supports all offloads by default, make
	 * the tap support all offloads as well.
	 */
	vlan->tap_features = TUN_OFFLOADS;

	/* Don't put anything that may fail after macvlan_common_newlink
	 * because we can't undo what it does.
	 */
	return macvlan_common_newlink(src_net, dev, tb, data,
				      macvtap_receive, macvtap_forward);
}

static void macvtap_dellink(struct net_device *dev,
			    struct list_head *head)
{
	macvtap_del_queues(dev);
	macvlan_dellink(dev, head);
}

static void macvtap_setup(struct net_device *dev)
{
	macvlan_common_setup(dev);
	dev->tx_queue_len = TUN_READQ_SIZE;
}

static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
	.kind		= "macvtap",
	.setup		= macvtap_setup,
	.newlink	= macvtap_newlink,
	.dellink	= macvtap_dellink,
};
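/*
 * Illustrative usage (not part of this file): a macvtap device is
 * normally created over rtnetlink, e.g. with iproute2:
 *
 *	ip link add link eth0 name macvtap0 type macvtap mode bridge
 *
 * which ends up in macvtap_newlink() above. The NETDEV_REGISTER
 * notifier at the bottom of this file then allocates a minor and
 * creates the matching /dev/tapN character device node.
 */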
static void macvtap_sock_write_space(struct sock *sk)
{
	wait_queue_head_t *wqueue;

	if (!sock_writeable(sk) ||
	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
		return;

	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
}

static void macvtap_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_receive_queue);
}

static int macvtap_open(struct inode *inode, struct file *file)
{
	struct net *net = current->nsproxy->net_ns;
	struct net_device *dev = dev_get_by_macvtap_minor(iminor(inode));
	struct macvtap_queue *q;
	int err;

	err = -ENODEV;
	if (!dev)
		goto out;

	err = -ENOMEM;
	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
					     &macvtap_proto);
	if (!q)
		goto out;

	RCU_INIT_POINTER(q->sock.wq, &q->wq);
	init_waitqueue_head(&q->wq.wait);
	q->sock.type = SOCK_RAW;
	q->sock.state = SS_CONNECTED;
	q->sock.file = file;
	q->sock.ops = &macvtap_socket_ops;
	sock_init_data(&q->sock, &q->sk);
	q->sk.sk_write_space = macvtap_sock_write_space;
	q->sk.sk_destruct = macvtap_sock_destruct;
	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

	/*
	 * So far only KVM's virtio_net uses macvtap; enable zero copy between
	 * guest kernel and host kernel when the lower device supports zero copy.
	 *
	 * Macvlan supports zerocopy iff the lower device supports zero
	 * copy, so we don't have to look at the lower device directly.
	 */
	if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
		sock_set_flag(&q->sk, SOCK_ZEROCOPY);

	err = macvtap_set_queue(dev, file, q);
	if (err)
		sock_put(&q->sk);

out:
	if (dev)
		dev_put(dev);

	return err;
}

static int macvtap_release(struct inode *inode, struct file *file)
{
	struct macvtap_queue *q = file->private_data;
	macvtap_put_queue(q);
	return 0;
}

static unsigned int macvtap_poll(struct file *file, poll_table *wait)
{
	struct macvtap_queue *q = file->private_data;
	unsigned int mask = POLLERR;

	if (!q)
		goto out;

	mask = 0;
	poll_wait(file, &q->wq.wait, wait);

	if (!skb_queue_empty(&q->sk.sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	if (sock_writeable(&q->sk) ||
	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
	     sock_writeable(&q->sk)))
		mask |= POLLOUT | POLLWRNORM;

out:
	return mask;
}

static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
						size_t len, size_t linear,
						int noblock, int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err);
	if (!skb)
		return NULL;

	skb_reserve(skb, prepad);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}
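/*
 * Example (following the defaults in this file): for a zerocopy write,
 * macvtap_get_user() below calls macvtap_alloc_skb() with
 * linear == copylen (vnet_hdr.hdr_len, or GOODCOPY_LEN == 128 if the
 * header gives no hint), so only the first bytes are copied into the
 * linear area; the bulk of the payload is then attached as user pages
 * by zerocopy_sg_from_iovec().
 */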
/* set skb frags from iovec, this can move to core network code for reuse */
static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
				  int offset, size_t count)
{
	int len = iov_length(from, count) - offset;
	int copy = skb_headlen(skb);
	int size, offset1 = 0;
	int i = 0;

	/* Skip over from offset */
	while (count && (offset >= from->iov_len)) {
		offset -= from->iov_len;
		++from;
		--count;
	}

	/* copy up to skb headlen */
	while (count && (copy > 0)) {
		size = min_t(unsigned int, copy, from->iov_len - offset);
		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
				   size))
			return -EFAULT;
		if (copy > size) {
			++from;
			--count;
			offset = 0;
		} else
			offset += size;
		copy -= size;
		offset1 += size;
	}

	if (len == offset1)
		return 0;

	while (count--) {
		struct page *page[MAX_SKB_FRAGS];
		int num_pages;
		unsigned long base;
		unsigned long truesize;

		len = from->iov_len - offset;
		if (!len) {
			offset = 0;
			++from;
			continue;
		}
		base = (unsigned long)from->iov_base + offset;
		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
		if (i + size > MAX_SKB_FRAGS)
			return -EMSGSIZE;
		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
		if (num_pages != size) {
			int j;

			for (j = 0; j < num_pages; j++)
				put_page(page[i + j]);
			return -EFAULT;
		}
		truesize = size * PAGE_SIZE;
		skb->data_len += len;
		skb->len += len;
		skb->truesize += truesize;
		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
		while (len) {
			int off = base & ~PAGE_MASK;
			int size = min_t(int, len, PAGE_SIZE - off);
			__skb_fill_page_desc(skb, i, page[i], off, size);
			skb_shinfo(skb)->nr_frags++;
			base += size;
			len -= size;
			i++;
		}
		offset = 0;
		++from;
	}
	return 0;
}

/*
 * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
 * be shared with the tun/tap driver.
 */
static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb,
				     struct virtio_net_hdr *vnet_hdr)
{
	unsigned short gso_type = 0;

	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
			gso_type = SKB_GSO_TCPV4;
			break;
		case VIRTIO_NET_HDR_GSO_TCPV6:
			gso_type = SKB_GSO_TCPV6;
			break;
		case VIRTIO_NET_HDR_GSO_UDP:
			gso_type = SKB_GSO_UDP;
			break;
		default:
			return -EINVAL;
		}

		if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
			gso_type |= SKB_GSO_TCP_ECN;

		if (vnet_hdr->gso_size == 0)
			return -EINVAL;
	}

	if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		if (!skb_partial_csum_set(skb, vnet_hdr->csum_start,
					  vnet_hdr->csum_offset))
			return -EINVAL;
	}

	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		skb_shinfo(skb)->gso_size = vnet_hdr->gso_size;
		skb_shinfo(skb)->gso_type = gso_type;

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}
	return 0;
}
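/*
 * For reference, the converter above and the one below operate on
 * struct virtio_net_hdr from <linux/virtio_net.h>, which is laid out
 * roughly as (field comments are an informal summary):
 *
 *	struct virtio_net_hdr {
 *		__u8 flags;		// VIRTIO_NET_HDR_F_*
 *		__u8 gso_type;		// VIRTIO_NET_HDR_GSO_*
 *		__u16 hdr_len;		// hint: bytes to keep linear
 *		__u16 gso_size;		// GSO segment size
 *		__u16 csum_start;	// where checksumming starts
 *		__u16 csum_offset;	// checksum field offset from there
 *	};
 */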
static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
				   struct virtio_net_hdr *vnet_hdr)
{
	memset(vnet_hdr, 0, sizeof(*vnet_hdr));

	if (skb_is_gso(skb)) {
		struct skb_shared_info *sinfo = skb_shinfo(skb);

		/* This is a hint as to how much should be linear. */
		vnet_hdr->hdr_len = skb_headlen(skb);
		vnet_hdr->gso_size = sinfo->gso_size;
		if (sinfo->gso_type & SKB_GSO_TCPV4)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
		else if (sinfo->gso_type & SKB_GSO_TCPV6)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
		else if (sinfo->gso_type & SKB_GSO_UDP)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
		else
			BUG();
		if (sinfo->gso_type & SKB_GSO_TCP_ECN)
			vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
	} else
		vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
		vnet_hdr->csum_start = skb_checksum_start_offset(skb);
		vnet_hdr->csum_offset = skb->csum_offset;
	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
		vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
	} /* else everything is zero */

	return 0;
}

static unsigned long iov_pages(const struct iovec *iv, int offset,
			       unsigned long nr_segs)
{
	unsigned long seg, base;
	int pages = 0, len, size;

	while (nr_segs && (offset >= iv->iov_len)) {
		offset -= iv->iov_len;
		++iv;
		--nr_segs;
	}

	for (seg = 0; seg < nr_segs; seg++) {
		base = (unsigned long)iv[seg].iov_base + offset;
		len = iv[seg].iov_len - offset;
		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
		pages += size;
		offset = 0;
	}

	return pages;
}
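/*
 * Worked example of the page-count formula used in iov_pages() (and in
 * zerocopy_sg_from_iovec() above): with 4 KiB pages, a segment whose
 * base sits at offset 0xffc into a page with len == 8 gives
 * (0xffc + 8 + 0xfff) >> 12 == 2, i.e. the buffer straddles two pages
 * even though it is only eight bytes long.
 */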
/* Get packet from user space buffer */
static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
				const struct iovec *iv, unsigned long total_len,
				size_t count, int noblock)
{
	struct sk_buff *skb;
	struct macvlan_dev *vlan;
	unsigned long len = total_len;
	int err;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int vnet_hdr_len = 0;
	int copylen = 0;
	bool zerocopy = false;
	size_t linear;

	if (q->flags & IFF_VNET_HDR) {
		vnet_hdr_len = q->vnet_hdr_sz;

		err = -EINVAL;
		if (len < vnet_hdr_len)
			goto err;
		len -= vnet_hdr_len;

		err = memcpy_fromiovecend((void *)&vnet_hdr, iv, 0,
					  sizeof(vnet_hdr));
		if (err < 0)
			goto err;
		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
							vnet_hdr.hdr_len)
			vnet_hdr.hdr_len = vnet_hdr.csum_start +
						vnet_hdr.csum_offset + 2;
		err = -EINVAL;
		if (vnet_hdr.hdr_len > len)
			goto err;
	}

	err = -EINVAL;
	if (unlikely(len < ETH_HLEN))
		goto err;

	err = -EMSGSIZE;
	if (unlikely(count > UIO_MAXIOV))
		goto err;

	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
		copylen = vnet_hdr.hdr_len ? vnet_hdr.hdr_len : GOODCOPY_LEN;
		linear = copylen;
		if (iov_pages(iv, vnet_hdr_len + copylen, count)
		    <= MAX_SKB_FRAGS)
			zerocopy = true;
	}

	if (!zerocopy) {
		copylen = len;
		linear = vnet_hdr.hdr_len;
	}

	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
				linear, noblock, &err);
	if (!skb)
		goto err;

	if (zerocopy)
		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
	else {
		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
						   len);
		if (!err && m && m->msg_control) {
			struct ubuf_info *uarg = m->msg_control;
			uarg->callback(uarg, false);
		}
	}

	if (err)
		goto err_kfree;

	skb_set_network_header(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	skb->protocol = eth_hdr(skb)->h_proto;

	if (vnet_hdr_len) {
		err = macvtap_skb_from_vnet_hdr(skb, &vnet_hdr);
		if (err)
			goto err_kfree;
	}

	skb_probe_transport_header(skb, ETH_HLEN);

	rcu_read_lock();
	vlan = rcu_dereference(q->vlan);
	/* copy skb_ubuf_info for callback when skb has no error */
	if (zerocopy) {
		skb_shinfo(skb)->destructor_arg = m->msg_control;
		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
	}
	if (vlan)
		macvlan_start_xmit(skb, vlan->dev);
	else
		kfree_skb(skb);
	rcu_read_unlock();

	return total_len;

err_kfree:
	kfree_skb(skb);

err:
	rcu_read_lock();
	vlan = rcu_dereference(q->vlan);
	if (vlan)
		vlan->dev->stats.tx_dropped++;
	rcu_read_unlock();

	return err;
}

static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
				 unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct macvtap_queue *q = file->private_data;

	return macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
				file->f_flags & O_NONBLOCK);
}
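/*
 * Illustrative buffer layout for a single write() or read() while
 * IFF_VNET_HDR is set (the default from macvtap_open()):
 *
 *	+-------------------------+--------------------------------+
 *	| struct virtio_net_hdr   | Ethernet frame                 |
 *	| (q->vnet_hdr_sz bytes)  | (dst MAC, src MAC, type, data) |
 *	+-------------------------+--------------------------------+
 *
 * macvtap_get_user() above consumes this layout; macvtap_put_user()
 * below produces it.
 */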
/* Put packet to the user space buffer */
static ssize_t macvtap_put_user(struct macvtap_queue *q,
				const struct sk_buff *skb,
				const struct iovec *iv, int len)
{
	struct macvlan_dev *vlan;
	int ret;
	int vnet_hdr_len = 0;
	int vlan_offset = 0;
	int copied;

	if (q->flags & IFF_VNET_HDR) {
		struct virtio_net_hdr vnet_hdr;
		vnet_hdr_len = q->vnet_hdr_sz;
		if ((len -= vnet_hdr_len) < 0)
			return -EINVAL;

		ret = macvtap_skb_to_vnet_hdr(skb, &vnet_hdr);
		if (ret)
			return ret;

		if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
			return -EFAULT;
	}
	copied = vnet_hdr_len;

	if (!vlan_tx_tag_present(skb))
		len = min_t(int, skb->len, len);
	else {
		int copy;
		struct {
			__be16 h_vlan_proto;
			__be16 h_vlan_TCI;
		} veth;
		veth.h_vlan_proto = skb->vlan_proto;
		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));

		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
		len = min_t(int, skb->len + VLAN_HLEN, len);

		copy = min_t(int, vlan_offset, len);
		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
		len -= copy;
		copied += copy;
		if (ret || !len)
			goto done;

		copy = min_t(int, sizeof(veth), len);
		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
		len -= copy;
		copied += copy;
		if (ret || !len)
			goto done;
	}

	ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
	copied += len;

done:
	rcu_read_lock();
	vlan = rcu_dereference(q->vlan);
	if (vlan)
		macvlan_count_rx(vlan, copied - vnet_hdr_len, ret == 0, 0);
	rcu_read_unlock();

	return ret ? ret : copied;
}

static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
			       const struct iovec *iv, unsigned long len,
			       int noblock)
{
	DEFINE_WAIT(wait);
	struct sk_buff *skb;
	ssize_t ret = 0;

	while (len) {
		if (!noblock)
			prepare_to_wait(sk_sleep(&q->sk), &wait,
					TASK_INTERRUPTIBLE);

		/* Read frames from the queue */
		skb = skb_dequeue(&q->sk.sk_receive_queue);
		if (!skb) {
			if (noblock) {
				ret = -EAGAIN;
				break;
			}
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}
			/* Nothing to read, let's sleep */
			schedule();
			continue;
		}
		ret = macvtap_put_user(q, skb, iv, len);
		kfree_skb(skb);
		break;
	}

	if (!noblock)
		finish_wait(sk_sleep(&q->sk), &wait);
	return ret;
}

static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
				unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct macvtap_queue *q = file->private_data;
	ssize_t len, ret = 0;

	len = iov_length(iv, count);
	if (len < 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = macvtap_do_read(q, iocb, iv, len, file->f_flags & O_NONBLOCK);
	ret = min_t(ssize_t, ret, len); /* XXX copied from tun.c. Why? */
out:
	return ret;
}

static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;

	ASSERT_RTNL();
	vlan = rtnl_dereference(q->vlan);
	if (vlan)
		dev_hold(vlan->dev);

	return vlan;
}

static void macvtap_put_vlan(struct macvlan_dev *vlan)
{
	dev_put(vlan->dev);
}

static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
{
	struct macvtap_queue *q = file->private_data;
	struct macvlan_dev *vlan;
	int ret;

	vlan = macvtap_get_vlan(q);
	if (!vlan)
		return -EINVAL;

	if (flags & IFF_ATTACH_QUEUE)
		ret = macvtap_enable_queue(vlan->dev, file, q);
	else if (flags & IFF_DETACH_QUEUE)
		ret = macvtap_disable_queue(q);
	else
		ret = -EINVAL;

	macvtap_put_vlan(vlan);
	return ret;
}
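/*
 * Illustrative userspace sketch (hypothetical fd): a multiqueue user
 * parks and re-enables one of its queues with TUNSETQUEUE, which is
 * routed to macvtap_ioctl_set_queue() above:
 *
 *	struct ifreq ifr = { .ifr_flags = IFF_DETACH_QUEUE };
 *	ioctl(fd, TUNSETQUEUE, &ifr);		// disable this queue
 *	ifr.ifr_flags = IFF_ATTACH_QUEUE;
 *	ioctl(fd, TUNSETQUEUE, &ifr);		// enable it again
 */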
static int set_offload(struct macvtap_queue *q, unsigned long arg)
{
	struct macvlan_dev *vlan;
	netdev_features_t features;
	netdev_features_t feature_mask = 0;

	vlan = rtnl_dereference(q->vlan);
	if (!vlan)
		return -ENOLINK;

	features = vlan->dev->features;

	if (arg & TUN_F_CSUM) {
		feature_mask = NETIF_F_HW_CSUM;

		if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
			if (arg & TUN_F_TSO_ECN)
				feature_mask |= NETIF_F_TSO_ECN;
			if (arg & TUN_F_TSO4)
				feature_mask |= NETIF_F_TSO;
			if (arg & TUN_F_TSO6)
				feature_mask |= NETIF_F_TSO6;
		}

		if (arg & TUN_F_UFO)
			feature_mask |= NETIF_F_UFO;
	}

	/* The tun/tap driver inverts the usage for TSO offloads, where
	 * setting the TSO bit means that userspace wants to
	 * accept TSO frames and turning it off means that user space
	 * does not support TSO.
	 * For macvtap, we have to invert it to mean the same thing.
	 * When user space turns off TSO, we turn off GSO/LRO so that
	 * user space will not receive TSO frames.
	 */
	if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO))
		features |= RX_OFFLOADS;
	else
		features &= ~RX_OFFLOADS;

	/* tap_features are the same as features on tun/tap and
	 * reflect user expectations.
	 */
	vlan->tap_features = vlan->dev->features &
			     (feature_mask | ~TUN_OFFLOADS);
	vlan->set_features = features;
	netdev_update_features(vlan->dev);

	return 0;
}

/*
 * provide compatibility with generic tun/tap interface
 */
static long macvtap_ioctl(struct file *file, unsigned int cmd,
			  unsigned long arg)
{
	struct macvtap_queue *q = file->private_data;
	struct macvlan_dev *vlan;
	void __user *argp = (void __user *)arg;
	struct ifreq __user *ifr = argp;
	unsigned int __user *up = argp;
	unsigned int u;
	int __user *sp = argp;
	int s;
	int ret;

	switch (cmd) {
	case TUNSETIFF:
		/* ignore the name, just look at flags */
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;

		ret = 0;
		if ((u & ~(IFF_VNET_HDR | IFF_MULTI_QUEUE)) !=
		    (IFF_NO_PI | IFF_TAP))
			ret = -EINVAL;
		else
			q->flags = u;

		return ret;

	case TUNGETIFF:
		rtnl_lock();
		vlan = macvtap_get_vlan(q);
		if (!vlan) {
			rtnl_unlock();
			return -ENOLINK;
		}

		ret = 0;
		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
		    put_user(q->flags, &ifr->ifr_flags))
			ret = -EFAULT;
		macvtap_put_vlan(vlan);
		rtnl_unlock();
		return ret;

	case TUNSETQUEUE:
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;
		rtnl_lock();
		ret = macvtap_ioctl_set_queue(file, u);
		rtnl_unlock();
		return ret;

	case TUNGETFEATURES:
		if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR |
			     IFF_MULTI_QUEUE, up))
			return -EFAULT;
		return 0;

	case TUNSETSNDBUF:
		if (get_user(u, up))
			return -EFAULT;

		q->sk.sk_sndbuf = u;
		return 0;

	case TUNGETVNETHDRSZ:
		s = q->vnet_hdr_sz;
		if (put_user(s, sp))
			return -EFAULT;
		return 0;

	case TUNSETVNETHDRSZ:
		if (get_user(s, sp))
			return -EFAULT;
		if (s < (int)sizeof(struct virtio_net_hdr))
			return -EINVAL;

		q->vnet_hdr_sz = s;
		return 0;

	case TUNSETOFFLOAD:
		/* let the user check for future flags */
		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
			    TUN_F_TSO_ECN | TUN_F_UFO))
			return -EINVAL;

		/* TODO: only accept frames with the features that
		 * got enabled for forwarded frames */
		if (!(q->flags & IFF_VNET_HDR))
			return -EINVAL;
		rtnl_lock();
		ret = set_offload(q, arg);
		rtnl_unlock();
		return ret;

	default:
		return -EINVAL;
	}
}

#ifdef CONFIG_COMPAT
static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#endif

static const struct file_operations macvtap_fops = {
	.owner		= THIS_MODULE,
	.open		= macvtap_open,
	.release	= macvtap_release,
	.aio_read	= macvtap_aio_read,
	.aio_write	= macvtap_aio_write,
	.poll		= macvtap_poll,
	.llseek		= no_llseek,
	.unlocked_ioctl	= macvtap_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= macvtap_compat_ioctl,
#endif
};
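/*
 * Illustrative userspace sketch (not part of this driver; the device
 * node name is hypothetical): the typical open sequence mirrors
 * tun/tap:
 *
 *	int fd = open("/dev/tap5", O_RDWR);
 *	struct ifreq ifr = {
 *		.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR,
 *	};
 *	ioctl(fd, TUNSETIFF, &ifr);	// flags only; the name is ignored
 *	...
 *	read(fd, buf, size);		// virtio_net_hdr + Ethernet frame
 *	write(fd, buf, size);		// same layout
 */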
static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
			   struct msghdr *m, size_t total_len)
{
	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
	return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
				m->msg_flags & MSG_DONTWAIT);
}

static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
			   struct msghdr *m, size_t total_len,
			   int flags)
{
	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
	int ret;
	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
		return -EINVAL;
	ret = macvtap_do_read(q, iocb, m->msg_iov, total_len,
			      flags & MSG_DONTWAIT);
	if (ret > total_len) {
		m->msg_flags |= MSG_TRUNC;
		ret = flags & MSG_TRUNC ? ret : total_len;
	}
	return ret;
}

/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops macvtap_socket_ops = {
	.sendmsg = macvtap_sendmsg,
	.recvmsg = macvtap_recvmsg,
};

/* Get an underlying socket object from tun file. Returns error unless file is
 * attached to a device. The returned object works like a packet socket; it
 * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for
 * holding a reference to the file for as long as the socket is in use.
 */
struct socket *macvtap_get_socket(struct file *file)
{
	struct macvtap_queue *q;
	if (file->f_op != &macvtap_fops)
		return ERR_PTR(-EINVAL);
	q = file->private_data;
	if (!q)
		return ERR_PTR(-EBADFD);
	return &q->sock;
}
EXPORT_SYMBOL_GPL(macvtap_get_socket);
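/*
 * Illustrative in-kernel consumer sketch (assumption: this mirrors how
 * a vhost-net style user consumes the export above; simplified, with
 * hypothetical variable names):
 *
 *	struct file *file = fget(userspace_fd);
 *	struct socket *sock = macvtap_get_socket(file);
 *	if (IS_ERR(sock)) {
 *		fput(file);
 *		return PTR_ERR(sock);
 *	}
 *	// sock_sendmsg()/sock_recvmsg() now move frames; keep the file
 *	// reference for as long as the socket is in use.
 */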
static int macvtap_device_event(struct notifier_block *unused,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct macvlan_dev *vlan;
	struct device *classdev;
	dev_t devt;
	int err;

	if (dev->rtnl_link_ops != &macvtap_link_ops)
		return NOTIFY_DONE;

	vlan = netdev_priv(dev);

	switch (event) {
	case NETDEV_REGISTER:
		/* Create the device node here after the network device has
		 * been registered but before register_netdevice has
		 * finished running.
		 */
		err = macvtap_get_minor(vlan);
		if (err)
			return notifier_from_errno(err);

		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
		classdev = device_create(macvtap_class, &dev->dev, devt,
					 dev, "tap%d", dev->ifindex);
		if (IS_ERR(classdev)) {
			macvtap_free_minor(vlan);
			return notifier_from_errno(PTR_ERR(classdev));
		}
		break;
	case NETDEV_UNREGISTER:
		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
		device_destroy(macvtap_class, devt);
		macvtap_free_minor(vlan);
		break;
	}

	return NOTIFY_DONE;
}

static struct notifier_block macvtap_notifier_block __read_mostly = {
	.notifier_call	= macvtap_device_event,
};

static int macvtap_init(void)
{
	int err;

	err = alloc_chrdev_region(&macvtap_major, 0,
				  MACVTAP_NUM_DEVS, "macvtap");
	if (err)
		goto out1;

	cdev_init(&macvtap_cdev, &macvtap_fops);
	err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
	if (err)
		goto out2;

	macvtap_class = class_create(THIS_MODULE, "macvtap");
	if (IS_ERR(macvtap_class)) {
		err = PTR_ERR(macvtap_class);
		goto out3;
	}

	err = register_netdevice_notifier(&macvtap_notifier_block);
	if (err)
		goto out4;

	err = macvlan_link_register(&macvtap_link_ops);
	if (err)
		goto out5;

	return 0;

out5:
	unregister_netdevice_notifier(&macvtap_notifier_block);
out4:
	class_unregister(macvtap_class);
out3:
	cdev_del(&macvtap_cdev);
out2:
	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
out1:
	return err;
}
module_init(macvtap_init);

static void macvtap_exit(void)
{
	rtnl_link_unregister(&macvtap_link_ops);
	unregister_netdevice_notifier(&macvtap_notifier_block);
	class_unregister(macvtap_class);
	cdev_del(&macvtap_cdev);
	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
}
module_exit(macvtap_exit);

MODULE_ALIAS_RTNL_LINK("macvtap");
MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
MODULE_LICENSE("GPL");