1 /* 2 * drivers/net/veth.c 3 * 4 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 5 * 6 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 8 * 9 */ 10 11 #include <linux/netdevice.h> 12 #include <linux/slab.h> 13 #include <linux/ethtool.h> 14 #include <linux/etherdevice.h> 15 #include <linux/u64_stats_sync.h> 16 17 #include <net/rtnetlink.h> 18 #include <net/dst.h> 19 #include <net/xfrm.h> 20 #include <net/xdp.h> 21 #include <linux/veth.h> 22 #include <linux/module.h> 23 #include <linux/bpf.h> 24 #include <linux/filter.h> 25 #include <linux/ptr_ring.h> 26 #include <linux/bpf_trace.h> 27 #include <linux/net_tstamp.h> 28 29 #define DRV_NAME "veth" 30 #define DRV_VERSION "1.0" 31 32 #define VETH_XDP_FLAG BIT(0) 33 #define VETH_RING_SIZE 256 34 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 35 36 /* Separating two types of XDP xmit */ 37 #define VETH_XDP_TX BIT(0) 38 #define VETH_XDP_REDIR BIT(1) 39 40 struct veth_rq_stats { 41 u64 xdp_packets; 42 u64 xdp_bytes; 43 u64 xdp_drops; 44 struct u64_stats_sync syncp; 45 }; 46 47 struct veth_rq { 48 struct napi_struct xdp_napi; 49 struct net_device *dev; 50 struct bpf_prog __rcu *xdp_prog; 51 struct xdp_mem_info xdp_mem; 52 struct veth_rq_stats stats; 53 bool rx_notify_masked; 54 struct ptr_ring xdp_ring; 55 struct xdp_rxq_info xdp_rxq; 56 }; 57 58 struct veth_priv { 59 struct net_device __rcu *peer; 60 atomic64_t dropped; 61 struct bpf_prog *_xdp_prog; 62 struct veth_rq *rq; 63 unsigned int requested_headroom; 64 }; 65 66 /* 67 * ethtool interface 68 */ 69 70 struct veth_q_stat_desc { 71 char desc[ETH_GSTRING_LEN]; 72 size_t offset; 73 }; 74 75 #define VETH_RQ_STAT(m) offsetof(struct veth_rq_stats, m) 76 77 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 78 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 79 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 80 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 81 }; 82 83 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 84 85 static struct { 86 const char string[ETH_GSTRING_LEN]; 87 } ethtool_stats_keys[] = { 88 { "peer_ifindex" }, 89 }; 90 91 static int veth_get_link_ksettings(struct net_device *dev, 92 struct ethtool_link_ksettings *cmd) 93 { 94 cmd->base.speed = SPEED_10000; 95 cmd->base.duplex = DUPLEX_FULL; 96 cmd->base.port = PORT_TP; 97 cmd->base.autoneg = AUTONEG_DISABLE; 98 return 0; 99 } 100 101 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 102 { 103 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 104 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 105 } 106 107 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 108 { 109 char *p = (char *)buf; 110 int i, j; 111 112 switch(stringset) { 113 case ETH_SS_STATS: 114 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 115 p += sizeof(ethtool_stats_keys); 116 for (i = 0; i < dev->real_num_rx_queues; i++) { 117 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 118 snprintf(p, ETH_GSTRING_LEN, "rx_queue_%u_%s", 119 i, veth_rq_stats_desc[j].desc); 120 p += ETH_GSTRING_LEN; 121 } 122 } 123 break; 124 } 125 } 126 127 static int veth_get_sset_count(struct net_device *dev, int sset) 128 { 129 switch (sset) { 130 case ETH_SS_STATS: 131 return ARRAY_SIZE(ethtool_stats_keys) + 132 VETH_RQ_STATS_LEN * dev->real_num_rx_queues; 133 default: 134 return -EOPNOTSUPP; 135 } 136 } 137 138 static void veth_get_ethtool_stats(struct net_device *dev, 139 struct ethtool_stats *stats, u64 *data) 140 { 141 struct veth_priv *priv = netdev_priv(dev); 142 struct net_device *peer = rtnl_dereference(priv->peer); 143 int i, j, idx; 144 145 data[0] = peer ? peer->ifindex : 0; 146 idx = 1; 147 for (i = 0; i < dev->real_num_rx_queues; i++) { 148 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 149 const void *stats_base = (void *)rq_stats; 150 unsigned int start; 151 size_t offset; 152 153 do { 154 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 155 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 156 offset = veth_rq_stats_desc[j].offset; 157 data[idx + j] = *(u64 *)(stats_base + offset); 158 } 159 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 160 idx += VETH_RQ_STATS_LEN; 161 } 162 } 163 164 static int veth_get_ts_info(struct net_device *dev, 165 struct ethtool_ts_info *info) 166 { 167 info->so_timestamping = 168 SOF_TIMESTAMPING_TX_SOFTWARE | 169 SOF_TIMESTAMPING_RX_SOFTWARE | 170 SOF_TIMESTAMPING_SOFTWARE; 171 info->phc_index = -1; 172 173 return 0; 174 } 175 176 static const struct ethtool_ops veth_ethtool_ops = { 177 .get_drvinfo = veth_get_drvinfo, 178 .get_link = ethtool_op_get_link, 179 .get_strings = veth_get_strings, 180 .get_sset_count = veth_get_sset_count, 181 .get_ethtool_stats = veth_get_ethtool_stats, 182 .get_link_ksettings = veth_get_link_ksettings, 183 .get_ts_info = veth_get_ts_info, 184 }; 185 186 /* general routines */ 187 188 static bool veth_is_xdp_frame(void *ptr) 189 { 190 return (unsigned long)ptr & VETH_XDP_FLAG; 191 } 192 193 static void *veth_ptr_to_xdp(void *ptr) 194 { 195 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 196 } 197 198 static void *veth_xdp_to_ptr(void *ptr) 199 { 200 return (void *)((unsigned long)ptr | VETH_XDP_FLAG); 201 } 202 203 static void veth_ptr_free(void *ptr) 204 { 205 if (veth_is_xdp_frame(ptr)) 206 xdp_return_frame(veth_ptr_to_xdp(ptr)); 207 else 208 kfree_skb(ptr); 209 } 210 211 static void __veth_xdp_flush(struct veth_rq *rq) 212 { 213 /* Write ptr_ring before reading rx_notify_masked */ 214 smp_mb(); 215 if (!rq->rx_notify_masked) { 216 rq->rx_notify_masked = true; 217 napi_schedule(&rq->xdp_napi); 218 } 219 } 220 221 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 222 { 223 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 224 dev_kfree_skb_any(skb); 225 return NET_RX_DROP; 226 } 227 228 return NET_RX_SUCCESS; 229 } 230 231 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 232 struct veth_rq *rq, bool xdp) 233 { 234 return __dev_forward_skb(dev, skb) ?: xdp ? 235 veth_xdp_rx(rq, skb) : 236 netif_rx(skb); 237 } 238 239 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 240 { 241 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 242 struct veth_rq *rq = NULL; 243 struct net_device *rcv; 244 int length = skb->len; 245 bool rcv_xdp = false; 246 int rxq; 247 248 rcu_read_lock(); 249 rcv = rcu_dereference(priv->peer); 250 if (unlikely(!rcv)) { 251 kfree_skb(skb); 252 goto drop; 253 } 254 255 rcv_priv = netdev_priv(rcv); 256 rxq = skb_get_queue_mapping(skb); 257 if (rxq < rcv->real_num_rx_queues) { 258 rq = &rcv_priv->rq[rxq]; 259 rcv_xdp = rcu_access_pointer(rq->xdp_prog); 260 if (rcv_xdp) 261 skb_record_rx_queue(skb, rxq); 262 } 263 264 skb_tx_timestamp(skb); 265 if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { 266 if (!rcv_xdp) { 267 struct pcpu_lstats *stats = this_cpu_ptr(dev->lstats); 268 269 u64_stats_update_begin(&stats->syncp); 270 stats->bytes += length; 271 stats->packets++; 272 u64_stats_update_end(&stats->syncp); 273 } 274 } else { 275 drop: 276 atomic64_inc(&priv->dropped); 277 } 278 279 if (rcv_xdp) 280 __veth_xdp_flush(rq); 281 282 rcu_read_unlock(); 283 284 return NETDEV_TX_OK; 285 } 286 287 static u64 veth_stats_tx(struct pcpu_lstats *result, struct net_device *dev) 288 { 289 struct veth_priv *priv = netdev_priv(dev); 290 int cpu; 291 292 result->packets = 0; 293 result->bytes = 0; 294 for_each_possible_cpu(cpu) { 295 struct pcpu_lstats *stats = per_cpu_ptr(dev->lstats, cpu); 296 u64 packets, bytes; 297 unsigned int start; 298 299 do { 300 start = u64_stats_fetch_begin_irq(&stats->syncp); 301 packets = stats->packets; 302 bytes = stats->bytes; 303 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 304 result->packets += packets; 305 result->bytes += bytes; 306 } 307 return atomic64_read(&priv->dropped); 308 } 309 310 static void veth_stats_rx(struct veth_rq_stats *result, struct net_device *dev) 311 { 312 struct veth_priv *priv = netdev_priv(dev); 313 int i; 314 315 result->xdp_packets = 0; 316 result->xdp_bytes = 0; 317 result->xdp_drops = 0; 318 for (i = 0; i < dev->num_rx_queues; i++) { 319 struct veth_rq_stats *stats = &priv->rq[i].stats; 320 u64 packets, bytes, drops; 321 unsigned int start; 322 323 do { 324 start = u64_stats_fetch_begin_irq(&stats->syncp); 325 packets = stats->xdp_packets; 326 bytes = stats->xdp_bytes; 327 drops = stats->xdp_drops; 328 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 329 result->xdp_packets += packets; 330 result->xdp_bytes += bytes; 331 result->xdp_drops += drops; 332 } 333 } 334 335 static void veth_get_stats64(struct net_device *dev, 336 struct rtnl_link_stats64 *tot) 337 { 338 struct veth_priv *priv = netdev_priv(dev); 339 struct net_device *peer; 340 struct veth_rq_stats rx; 341 struct pcpu_lstats tx; 342 343 tot->tx_dropped = veth_stats_tx(&tx, dev); 344 tot->tx_bytes = tx.bytes; 345 tot->tx_packets = tx.packets; 346 347 veth_stats_rx(&rx, dev); 348 tot->rx_dropped = rx.xdp_drops; 349 tot->rx_bytes = rx.xdp_bytes; 350 tot->rx_packets = rx.xdp_packets; 351 352 rcu_read_lock(); 353 peer = rcu_dereference(priv->peer); 354 if (peer) { 355 tot->rx_dropped += veth_stats_tx(&tx, peer); 356 tot->rx_bytes += tx.bytes; 357 tot->rx_packets += tx.packets; 358 359 veth_stats_rx(&rx, peer); 360 tot->tx_bytes += rx.xdp_bytes; 361 tot->tx_packets += rx.xdp_packets; 362 } 363 rcu_read_unlock(); 364 } 365 366 /* fake multicast ability */ 367 static void veth_set_multicast_list(struct net_device *dev) 368 { 369 } 370 371 static struct sk_buff *veth_build_skb(void *head, int headroom, int len, 372 int buflen) 373 { 374 struct sk_buff *skb; 375 376 if (!buflen) { 377 buflen = SKB_DATA_ALIGN(headroom + len) + 378 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 379 } 380 skb = build_skb(head, buflen); 381 if (!skb) 382 return NULL; 383 384 skb_reserve(skb, headroom); 385 skb_put(skb, len); 386 387 return skb; 388 } 389 390 static int veth_select_rxq(struct net_device *dev) 391 { 392 return smp_processor_id() % dev->real_num_rx_queues; 393 } 394 395 static int veth_xdp_xmit(struct net_device *dev, int n, 396 struct xdp_frame **frames, u32 flags) 397 { 398 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 399 struct net_device *rcv; 400 int i, ret, drops = n; 401 unsigned int max_len; 402 struct veth_rq *rq; 403 404 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) { 405 ret = -EINVAL; 406 goto drop; 407 } 408 409 rcv = rcu_dereference(priv->peer); 410 if (unlikely(!rcv)) { 411 ret = -ENXIO; 412 goto drop; 413 } 414 415 rcv_priv = netdev_priv(rcv); 416 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 417 /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive 418 * side. This means an XDP program is loaded on the peer and the peer 419 * device is up. 420 */ 421 if (!rcu_access_pointer(rq->xdp_prog)) { 422 ret = -ENXIO; 423 goto drop; 424 } 425 426 drops = 0; 427 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 428 429 spin_lock(&rq->xdp_ring.producer_lock); 430 for (i = 0; i < n; i++) { 431 struct xdp_frame *frame = frames[i]; 432 void *ptr = veth_xdp_to_ptr(frame); 433 434 if (unlikely(frame->len > max_len || 435 __ptr_ring_produce(&rq->xdp_ring, ptr))) { 436 xdp_return_frame_rx_napi(frame); 437 drops++; 438 } 439 } 440 spin_unlock(&rq->xdp_ring.producer_lock); 441 442 if (flags & XDP_XMIT_FLUSH) 443 __veth_xdp_flush(rq); 444 445 if (likely(!drops)) 446 return n; 447 448 ret = n - drops; 449 drop: 450 atomic64_add(drops, &priv->dropped); 451 452 return ret; 453 } 454 455 static void veth_xdp_flush(struct net_device *dev) 456 { 457 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 458 struct net_device *rcv; 459 struct veth_rq *rq; 460 461 rcu_read_lock(); 462 rcv = rcu_dereference(priv->peer); 463 if (unlikely(!rcv)) 464 goto out; 465 466 rcv_priv = netdev_priv(rcv); 467 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 468 /* xdp_ring is initialized on receive side? */ 469 if (unlikely(!rcu_access_pointer(rq->xdp_prog))) 470 goto out; 471 472 __veth_xdp_flush(rq); 473 out: 474 rcu_read_unlock(); 475 } 476 477 static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) 478 { 479 struct xdp_frame *frame = convert_to_xdp_frame(xdp); 480 481 if (unlikely(!frame)) 482 return -EOVERFLOW; 483 484 return veth_xdp_xmit(dev, 1, &frame, 0); 485 } 486 487 static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, 488 struct xdp_frame *frame, 489 unsigned int *xdp_xmit) 490 { 491 void *hard_start = frame->data - frame->headroom; 492 void *head = hard_start - sizeof(struct xdp_frame); 493 int len = frame->len, delta = 0; 494 struct xdp_frame orig_frame; 495 struct bpf_prog *xdp_prog; 496 unsigned int headroom; 497 struct sk_buff *skb; 498 499 rcu_read_lock(); 500 xdp_prog = rcu_dereference(rq->xdp_prog); 501 if (likely(xdp_prog)) { 502 struct xdp_buff xdp; 503 u32 act; 504 505 xdp.data_hard_start = hard_start; 506 xdp.data = frame->data; 507 xdp.data_end = frame->data + frame->len; 508 xdp.data_meta = frame->data - frame->metasize; 509 xdp.rxq = &rq->xdp_rxq; 510 511 act = bpf_prog_run_xdp(xdp_prog, &xdp); 512 513 switch (act) { 514 case XDP_PASS: 515 delta = frame->data - xdp.data; 516 len = xdp.data_end - xdp.data; 517 break; 518 case XDP_TX: 519 orig_frame = *frame; 520 xdp.data_hard_start = head; 521 xdp.rxq->mem = frame->mem; 522 if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 523 trace_xdp_exception(rq->dev, xdp_prog, act); 524 frame = &orig_frame; 525 goto err_xdp; 526 } 527 *xdp_xmit |= VETH_XDP_TX; 528 rcu_read_unlock(); 529 goto xdp_xmit; 530 case XDP_REDIRECT: 531 orig_frame = *frame; 532 xdp.data_hard_start = head; 533 xdp.rxq->mem = frame->mem; 534 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 535 frame = &orig_frame; 536 goto err_xdp; 537 } 538 *xdp_xmit |= VETH_XDP_REDIR; 539 rcu_read_unlock(); 540 goto xdp_xmit; 541 default: 542 bpf_warn_invalid_xdp_action(act); 543 case XDP_ABORTED: 544 trace_xdp_exception(rq->dev, xdp_prog, act); 545 case XDP_DROP: 546 goto err_xdp; 547 } 548 } 549 rcu_read_unlock(); 550 551 headroom = sizeof(struct xdp_frame) + frame->headroom - delta; 552 skb = veth_build_skb(head, headroom, len, 0); 553 if (!skb) { 554 xdp_return_frame(frame); 555 goto err; 556 } 557 558 xdp_scrub_frame(frame); 559 skb->protocol = eth_type_trans(skb, rq->dev); 560 err: 561 return skb; 562 err_xdp: 563 rcu_read_unlock(); 564 xdp_return_frame(frame); 565 xdp_xmit: 566 return NULL; 567 } 568 569 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, 570 unsigned int *xdp_xmit) 571 { 572 u32 pktlen, headroom, act, metalen; 573 void *orig_data, *orig_data_end; 574 struct bpf_prog *xdp_prog; 575 int mac_len, delta, off; 576 struct xdp_buff xdp; 577 578 skb_orphan(skb); 579 580 rcu_read_lock(); 581 xdp_prog = rcu_dereference(rq->xdp_prog); 582 if (unlikely(!xdp_prog)) { 583 rcu_read_unlock(); 584 goto out; 585 } 586 587 mac_len = skb->data - skb_mac_header(skb); 588 pktlen = skb->len + mac_len; 589 headroom = skb_headroom(skb) - mac_len; 590 591 if (skb_shared(skb) || skb_head_is_locked(skb) || 592 skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { 593 struct sk_buff *nskb; 594 int size, head_off; 595 void *head, *start; 596 struct page *page; 597 598 size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + 599 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 600 if (size > PAGE_SIZE) 601 goto drop; 602 603 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 604 if (!page) 605 goto drop; 606 607 head = page_address(page); 608 start = head + VETH_XDP_HEADROOM; 609 if (skb_copy_bits(skb, -mac_len, start, pktlen)) { 610 page_frag_free(head); 611 goto drop; 612 } 613 614 nskb = veth_build_skb(head, 615 VETH_XDP_HEADROOM + mac_len, skb->len, 616 PAGE_SIZE); 617 if (!nskb) { 618 page_frag_free(head); 619 goto drop; 620 } 621 622 skb_copy_header(nskb, skb); 623 head_off = skb_headroom(nskb) - skb_headroom(skb); 624 skb_headers_offset_update(nskb, head_off); 625 consume_skb(skb); 626 skb = nskb; 627 } 628 629 xdp.data_hard_start = skb->head; 630 xdp.data = skb_mac_header(skb); 631 xdp.data_end = xdp.data + pktlen; 632 xdp.data_meta = xdp.data; 633 xdp.rxq = &rq->xdp_rxq; 634 orig_data = xdp.data; 635 orig_data_end = xdp.data_end; 636 637 act = bpf_prog_run_xdp(xdp_prog, &xdp); 638 639 switch (act) { 640 case XDP_PASS: 641 break; 642 case XDP_TX: 643 get_page(virt_to_page(xdp.data)); 644 consume_skb(skb); 645 xdp.rxq->mem = rq->xdp_mem; 646 if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 647 trace_xdp_exception(rq->dev, xdp_prog, act); 648 goto err_xdp; 649 } 650 *xdp_xmit |= VETH_XDP_TX; 651 rcu_read_unlock(); 652 goto xdp_xmit; 653 case XDP_REDIRECT: 654 get_page(virt_to_page(xdp.data)); 655 consume_skb(skb); 656 xdp.rxq->mem = rq->xdp_mem; 657 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) 658 goto err_xdp; 659 *xdp_xmit |= VETH_XDP_REDIR; 660 rcu_read_unlock(); 661 goto xdp_xmit; 662 default: 663 bpf_warn_invalid_xdp_action(act); 664 case XDP_ABORTED: 665 trace_xdp_exception(rq->dev, xdp_prog, act); 666 case XDP_DROP: 667 goto drop; 668 } 669 rcu_read_unlock(); 670 671 delta = orig_data - xdp.data; 672 off = mac_len + delta; 673 if (off > 0) 674 __skb_push(skb, off); 675 else if (off < 0) 676 __skb_pull(skb, -off); 677 skb->mac_header -= delta; 678 off = xdp.data_end - orig_data_end; 679 if (off != 0) 680 __skb_put(skb, off); 681 skb->protocol = eth_type_trans(skb, rq->dev); 682 683 metalen = xdp.data - xdp.data_meta; 684 if (metalen) 685 skb_metadata_set(skb, metalen); 686 out: 687 return skb; 688 drop: 689 rcu_read_unlock(); 690 kfree_skb(skb); 691 return NULL; 692 err_xdp: 693 rcu_read_unlock(); 694 page_frag_free(xdp.data); 695 xdp_xmit: 696 return NULL; 697 } 698 699 static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit) 700 { 701 int i, done = 0, drops = 0, bytes = 0; 702 703 for (i = 0; i < budget; i++) { 704 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 705 unsigned int xdp_xmit_one = 0; 706 struct sk_buff *skb; 707 708 if (!ptr) 709 break; 710 711 if (veth_is_xdp_frame(ptr)) { 712 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 713 714 bytes += frame->len; 715 skb = veth_xdp_rcv_one(rq, frame, &xdp_xmit_one); 716 } else { 717 skb = ptr; 718 bytes += skb->len; 719 skb = veth_xdp_rcv_skb(rq, skb, &xdp_xmit_one); 720 } 721 *xdp_xmit |= xdp_xmit_one; 722 723 if (skb) 724 napi_gro_receive(&rq->xdp_napi, skb); 725 else if (!xdp_xmit_one) 726 drops++; 727 728 done++; 729 } 730 731 u64_stats_update_begin(&rq->stats.syncp); 732 rq->stats.xdp_packets += done; 733 rq->stats.xdp_bytes += bytes; 734 rq->stats.xdp_drops += drops; 735 u64_stats_update_end(&rq->stats.syncp); 736 737 return done; 738 } 739 740 static int veth_poll(struct napi_struct *napi, int budget) 741 { 742 struct veth_rq *rq = 743 container_of(napi, struct veth_rq, xdp_napi); 744 unsigned int xdp_xmit = 0; 745 int done; 746 747 xdp_set_return_frame_no_direct(); 748 done = veth_xdp_rcv(rq, budget, &xdp_xmit); 749 750 if (done < budget && napi_complete_done(napi, done)) { 751 /* Write rx_notify_masked before reading ptr_ring */ 752 smp_store_mb(rq->rx_notify_masked, false); 753 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 754 rq->rx_notify_masked = true; 755 napi_schedule(&rq->xdp_napi); 756 } 757 } 758 759 if (xdp_xmit & VETH_XDP_TX) 760 veth_xdp_flush(rq->dev); 761 if (xdp_xmit & VETH_XDP_REDIR) 762 xdp_do_flush_map(); 763 xdp_clear_return_frame_no_direct(); 764 765 return done; 766 } 767 768 static int veth_napi_add(struct net_device *dev) 769 { 770 struct veth_priv *priv = netdev_priv(dev); 771 int err, i; 772 773 for (i = 0; i < dev->real_num_rx_queues; i++) { 774 struct veth_rq *rq = &priv->rq[i]; 775 776 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 777 if (err) 778 goto err_xdp_ring; 779 } 780 781 for (i = 0; i < dev->real_num_rx_queues; i++) { 782 struct veth_rq *rq = &priv->rq[i]; 783 784 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 785 napi_enable(&rq->xdp_napi); 786 } 787 788 return 0; 789 err_xdp_ring: 790 for (i--; i >= 0; i--) 791 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 792 793 return err; 794 } 795 796 static void veth_napi_del(struct net_device *dev) 797 { 798 struct veth_priv *priv = netdev_priv(dev); 799 int i; 800 801 for (i = 0; i < dev->real_num_rx_queues; i++) { 802 struct veth_rq *rq = &priv->rq[i]; 803 804 napi_disable(&rq->xdp_napi); 805 napi_hash_del(&rq->xdp_napi); 806 } 807 synchronize_net(); 808 809 for (i = 0; i < dev->real_num_rx_queues; i++) { 810 struct veth_rq *rq = &priv->rq[i]; 811 812 netif_napi_del(&rq->xdp_napi); 813 rq->rx_notify_masked = false; 814 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 815 } 816 } 817 818 static int veth_enable_xdp(struct net_device *dev) 819 { 820 struct veth_priv *priv = netdev_priv(dev); 821 int err, i; 822 823 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 824 for (i = 0; i < dev->real_num_rx_queues; i++) { 825 struct veth_rq *rq = &priv->rq[i]; 826 827 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); 828 if (err < 0) 829 goto err_rxq_reg; 830 831 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 832 MEM_TYPE_PAGE_SHARED, 833 NULL); 834 if (err < 0) 835 goto err_reg_mem; 836 837 /* Save original mem info as it can be overwritten */ 838 rq->xdp_mem = rq->xdp_rxq.mem; 839 } 840 841 err = veth_napi_add(dev); 842 if (err) 843 goto err_rxq_reg; 844 } 845 846 for (i = 0; i < dev->real_num_rx_queues; i++) 847 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 848 849 return 0; 850 err_reg_mem: 851 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 852 err_rxq_reg: 853 for (i--; i >= 0; i--) 854 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 855 856 return err; 857 } 858 859 static void veth_disable_xdp(struct net_device *dev) 860 { 861 struct veth_priv *priv = netdev_priv(dev); 862 int i; 863 864 for (i = 0; i < dev->real_num_rx_queues; i++) 865 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 866 veth_napi_del(dev); 867 for (i = 0; i < dev->real_num_rx_queues; i++) { 868 struct veth_rq *rq = &priv->rq[i]; 869 870 rq->xdp_rxq.mem = rq->xdp_mem; 871 xdp_rxq_info_unreg(&rq->xdp_rxq); 872 } 873 } 874 875 static int veth_open(struct net_device *dev) 876 { 877 struct veth_priv *priv = netdev_priv(dev); 878 struct net_device *peer = rtnl_dereference(priv->peer); 879 int err; 880 881 if (!peer) 882 return -ENOTCONN; 883 884 if (priv->_xdp_prog) { 885 err = veth_enable_xdp(dev); 886 if (err) 887 return err; 888 } 889 890 if (peer->flags & IFF_UP) { 891 netif_carrier_on(dev); 892 netif_carrier_on(peer); 893 } 894 895 return 0; 896 } 897 898 static int veth_close(struct net_device *dev) 899 { 900 struct veth_priv *priv = netdev_priv(dev); 901 struct net_device *peer = rtnl_dereference(priv->peer); 902 903 netif_carrier_off(dev); 904 if (peer) 905 netif_carrier_off(peer); 906 907 if (priv->_xdp_prog) 908 veth_disable_xdp(dev); 909 910 return 0; 911 } 912 913 static int is_valid_veth_mtu(int mtu) 914 { 915 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 916 } 917 918 static int veth_alloc_queues(struct net_device *dev) 919 { 920 struct veth_priv *priv = netdev_priv(dev); 921 int i; 922 923 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); 924 if (!priv->rq) 925 return -ENOMEM; 926 927 for (i = 0; i < dev->num_rx_queues; i++) { 928 priv->rq[i].dev = dev; 929 u64_stats_init(&priv->rq[i].stats.syncp); 930 } 931 932 return 0; 933 } 934 935 static void veth_free_queues(struct net_device *dev) 936 { 937 struct veth_priv *priv = netdev_priv(dev); 938 939 kfree(priv->rq); 940 } 941 942 static int veth_dev_init(struct net_device *dev) 943 { 944 int err; 945 946 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 947 if (!dev->lstats) 948 return -ENOMEM; 949 950 err = veth_alloc_queues(dev); 951 if (err) { 952 free_percpu(dev->lstats); 953 return err; 954 } 955 956 return 0; 957 } 958 959 static void veth_dev_free(struct net_device *dev) 960 { 961 veth_free_queues(dev); 962 free_percpu(dev->lstats); 963 } 964 965 #ifdef CONFIG_NET_POLL_CONTROLLER 966 static void veth_poll_controller(struct net_device *dev) 967 { 968 /* veth only receives frames when its peer sends one 969 * Since it has nothing to do with disabling irqs, we are guaranteed 970 * never to have pending data when we poll for it so 971 * there is nothing to do here. 972 * 973 * We need this though so netpoll recognizes us as an interface that 974 * supports polling, which enables bridge devices in virt setups to 975 * still use netconsole 976 */ 977 } 978 #endif /* CONFIG_NET_POLL_CONTROLLER */ 979 980 static int veth_get_iflink(const struct net_device *dev) 981 { 982 struct veth_priv *priv = netdev_priv(dev); 983 struct net_device *peer; 984 int iflink; 985 986 rcu_read_lock(); 987 peer = rcu_dereference(priv->peer); 988 iflink = peer ? peer->ifindex : 0; 989 rcu_read_unlock(); 990 991 return iflink; 992 } 993 994 static netdev_features_t veth_fix_features(struct net_device *dev, 995 netdev_features_t features) 996 { 997 struct veth_priv *priv = netdev_priv(dev); 998 struct net_device *peer; 999 1000 peer = rtnl_dereference(priv->peer); 1001 if (peer) { 1002 struct veth_priv *peer_priv = netdev_priv(peer); 1003 1004 if (peer_priv->_xdp_prog) 1005 features &= ~NETIF_F_GSO_SOFTWARE; 1006 } 1007 1008 return features; 1009 } 1010 1011 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1012 { 1013 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1014 struct net_device *peer; 1015 1016 if (new_hr < 0) 1017 new_hr = 0; 1018 1019 rcu_read_lock(); 1020 peer = rcu_dereference(priv->peer); 1021 if (unlikely(!peer)) 1022 goto out; 1023 1024 peer_priv = netdev_priv(peer); 1025 priv->requested_headroom = new_hr; 1026 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1027 dev->needed_headroom = new_hr; 1028 peer->needed_headroom = new_hr; 1029 1030 out: 1031 rcu_read_unlock(); 1032 } 1033 1034 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1035 struct netlink_ext_ack *extack) 1036 { 1037 struct veth_priv *priv = netdev_priv(dev); 1038 struct bpf_prog *old_prog; 1039 struct net_device *peer; 1040 unsigned int max_mtu; 1041 int err; 1042 1043 old_prog = priv->_xdp_prog; 1044 priv->_xdp_prog = prog; 1045 peer = rtnl_dereference(priv->peer); 1046 1047 if (prog) { 1048 if (!peer) { 1049 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1050 err = -ENOTCONN; 1051 goto err; 1052 } 1053 1054 max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - 1055 peer->hard_header_len - 1056 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 1057 if (peer->mtu > max_mtu) { 1058 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1059 err = -ERANGE; 1060 goto err; 1061 } 1062 1063 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1064 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1065 err = -ENOSPC; 1066 goto err; 1067 } 1068 1069 if (dev->flags & IFF_UP) { 1070 err = veth_enable_xdp(dev); 1071 if (err) { 1072 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1073 goto err; 1074 } 1075 } 1076 1077 if (!old_prog) { 1078 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1079 peer->max_mtu = max_mtu; 1080 } 1081 } 1082 1083 if (old_prog) { 1084 if (!prog) { 1085 if (dev->flags & IFF_UP) 1086 veth_disable_xdp(dev); 1087 1088 if (peer) { 1089 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1090 peer->max_mtu = ETH_MAX_MTU; 1091 } 1092 } 1093 bpf_prog_put(old_prog); 1094 } 1095 1096 if ((!!old_prog ^ !!prog) && peer) 1097 netdev_update_features(peer); 1098 1099 return 0; 1100 err: 1101 priv->_xdp_prog = old_prog; 1102 1103 return err; 1104 } 1105 1106 static u32 veth_xdp_query(struct net_device *dev) 1107 { 1108 struct veth_priv *priv = netdev_priv(dev); 1109 const struct bpf_prog *xdp_prog; 1110 1111 xdp_prog = priv->_xdp_prog; 1112 if (xdp_prog) 1113 return xdp_prog->aux->id; 1114 1115 return 0; 1116 } 1117 1118 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1119 { 1120 switch (xdp->command) { 1121 case XDP_SETUP_PROG: 1122 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1123 case XDP_QUERY_PROG: 1124 xdp->prog_id = veth_xdp_query(dev); 1125 return 0; 1126 default: 1127 return -EINVAL; 1128 } 1129 } 1130 1131 static const struct net_device_ops veth_netdev_ops = { 1132 .ndo_init = veth_dev_init, 1133 .ndo_open = veth_open, 1134 .ndo_stop = veth_close, 1135 .ndo_start_xmit = veth_xmit, 1136 .ndo_get_stats64 = veth_get_stats64, 1137 .ndo_set_rx_mode = veth_set_multicast_list, 1138 .ndo_set_mac_address = eth_mac_addr, 1139 #ifdef CONFIG_NET_POLL_CONTROLLER 1140 .ndo_poll_controller = veth_poll_controller, 1141 #endif 1142 .ndo_get_iflink = veth_get_iflink, 1143 .ndo_fix_features = veth_fix_features, 1144 .ndo_features_check = passthru_features_check, 1145 .ndo_set_rx_headroom = veth_set_rx_headroom, 1146 .ndo_bpf = veth_xdp, 1147 .ndo_xdp_xmit = veth_xdp_xmit, 1148 }; 1149 1150 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1151 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1152 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1153 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1154 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1155 1156 static void veth_setup(struct net_device *dev) 1157 { 1158 ether_setup(dev); 1159 1160 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1161 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1162 dev->priv_flags |= IFF_NO_QUEUE; 1163 dev->priv_flags |= IFF_PHONY_HEADROOM; 1164 1165 dev->netdev_ops = &veth_netdev_ops; 1166 dev->ethtool_ops = &veth_ethtool_ops; 1167 dev->features |= NETIF_F_LLTX; 1168 dev->features |= VETH_FEATURES; 1169 dev->vlan_features = dev->features & 1170 ~(NETIF_F_HW_VLAN_CTAG_TX | 1171 NETIF_F_HW_VLAN_STAG_TX | 1172 NETIF_F_HW_VLAN_CTAG_RX | 1173 NETIF_F_HW_VLAN_STAG_RX); 1174 dev->needs_free_netdev = true; 1175 dev->priv_destructor = veth_dev_free; 1176 dev->max_mtu = ETH_MAX_MTU; 1177 1178 dev->hw_features = VETH_FEATURES; 1179 dev->hw_enc_features = VETH_FEATURES; 1180 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1181 } 1182 1183 /* 1184 * netlink interface 1185 */ 1186 1187 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1188 struct netlink_ext_ack *extack) 1189 { 1190 if (tb[IFLA_ADDRESS]) { 1191 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1192 return -EINVAL; 1193 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1194 return -EADDRNOTAVAIL; 1195 } 1196 if (tb[IFLA_MTU]) { 1197 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1198 return -EINVAL; 1199 } 1200 return 0; 1201 } 1202 1203 static struct rtnl_link_ops veth_link_ops; 1204 1205 static int veth_newlink(struct net *src_net, struct net_device *dev, 1206 struct nlattr *tb[], struct nlattr *data[], 1207 struct netlink_ext_ack *extack) 1208 { 1209 int err; 1210 struct net_device *peer; 1211 struct veth_priv *priv; 1212 char ifname[IFNAMSIZ]; 1213 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1214 unsigned char name_assign_type; 1215 struct ifinfomsg *ifmp; 1216 struct net *net; 1217 1218 /* 1219 * create and register peer first 1220 */ 1221 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1222 struct nlattr *nla_peer; 1223 1224 nla_peer = data[VETH_INFO_PEER]; 1225 ifmp = nla_data(nla_peer); 1226 err = rtnl_nla_parse_ifla(peer_tb, 1227 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1228 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1229 NULL); 1230 if (err < 0) 1231 return err; 1232 1233 err = veth_validate(peer_tb, NULL, extack); 1234 if (err < 0) 1235 return err; 1236 1237 tbp = peer_tb; 1238 } else { 1239 ifmp = NULL; 1240 tbp = tb; 1241 } 1242 1243 if (ifmp && tbp[IFLA_IFNAME]) { 1244 nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1245 name_assign_type = NET_NAME_USER; 1246 } else { 1247 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1248 name_assign_type = NET_NAME_ENUM; 1249 } 1250 1251 net = rtnl_link_get_net(src_net, tbp); 1252 if (IS_ERR(net)) 1253 return PTR_ERR(net); 1254 1255 peer = rtnl_create_link(net, ifname, name_assign_type, 1256 &veth_link_ops, tbp); 1257 if (IS_ERR(peer)) { 1258 put_net(net); 1259 return PTR_ERR(peer); 1260 } 1261 1262 if (!ifmp || !tbp[IFLA_ADDRESS]) 1263 eth_hw_addr_random(peer); 1264 1265 if (ifmp && (dev->ifindex != 0)) 1266 peer->ifindex = ifmp->ifi_index; 1267 1268 peer->gso_max_size = dev->gso_max_size; 1269 peer->gso_max_segs = dev->gso_max_segs; 1270 1271 err = register_netdevice(peer); 1272 put_net(net); 1273 net = NULL; 1274 if (err < 0) 1275 goto err_register_peer; 1276 1277 netif_carrier_off(peer); 1278 1279 err = rtnl_configure_link(peer, ifmp); 1280 if (err < 0) 1281 goto err_configure_peer; 1282 1283 /* 1284 * register dev last 1285 * 1286 * note, that since we've registered new device the dev's name 1287 * should be re-allocated 1288 */ 1289 1290 if (tb[IFLA_ADDRESS] == NULL) 1291 eth_hw_addr_random(dev); 1292 1293 if (tb[IFLA_IFNAME]) 1294 nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1295 else 1296 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1297 1298 err = register_netdevice(dev); 1299 if (err < 0) 1300 goto err_register_dev; 1301 1302 netif_carrier_off(dev); 1303 1304 /* 1305 * tie the deviced together 1306 */ 1307 1308 priv = netdev_priv(dev); 1309 rcu_assign_pointer(priv->peer, peer); 1310 1311 priv = netdev_priv(peer); 1312 rcu_assign_pointer(priv->peer, dev); 1313 1314 return 0; 1315 1316 err_register_dev: 1317 /* nothing to do */ 1318 err_configure_peer: 1319 unregister_netdevice(peer); 1320 return err; 1321 1322 err_register_peer: 1323 free_netdev(peer); 1324 return err; 1325 } 1326 1327 static void veth_dellink(struct net_device *dev, struct list_head *head) 1328 { 1329 struct veth_priv *priv; 1330 struct net_device *peer; 1331 1332 priv = netdev_priv(dev); 1333 peer = rtnl_dereference(priv->peer); 1334 1335 /* Note : dellink() is called from default_device_exit_batch(), 1336 * before a rcu_synchronize() point. The devices are guaranteed 1337 * not being freed before one RCU grace period. 1338 */ 1339 RCU_INIT_POINTER(priv->peer, NULL); 1340 unregister_netdevice_queue(dev, head); 1341 1342 if (peer) { 1343 priv = netdev_priv(peer); 1344 RCU_INIT_POINTER(priv->peer, NULL); 1345 unregister_netdevice_queue(peer, head); 1346 } 1347 } 1348 1349 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1350 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1351 }; 1352 1353 static struct net *veth_get_link_net(const struct net_device *dev) 1354 { 1355 struct veth_priv *priv = netdev_priv(dev); 1356 struct net_device *peer = rtnl_dereference(priv->peer); 1357 1358 return peer ? dev_net(peer) : dev_net(dev); 1359 } 1360 1361 static struct rtnl_link_ops veth_link_ops = { 1362 .kind = DRV_NAME, 1363 .priv_size = sizeof(struct veth_priv), 1364 .setup = veth_setup, 1365 .validate = veth_validate, 1366 .newlink = veth_newlink, 1367 .dellink = veth_dellink, 1368 .policy = veth_policy, 1369 .maxtype = VETH_INFO_MAX, 1370 .get_link_net = veth_get_link_net, 1371 }; 1372 1373 /* 1374 * init/fini 1375 */ 1376 1377 static __init int veth_init(void) 1378 { 1379 return rtnl_link_register(&veth_link_ops); 1380 } 1381 1382 static __exit void veth_exit(void) 1383 { 1384 rtnl_link_unregister(&veth_link_ops); 1385 } 1386 1387 module_init(veth_init); 1388 module_exit(veth_exit); 1389 1390 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1391 MODULE_LICENSE("GPL v2"); 1392 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1393