/*
 * drivers/net/veth.c
 *
 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
 *
 * Author: Pavel Emelianov <xemul@openvz.org>
 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
 *
 */

#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>

#include <net/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/xdp.h>
#include <linux/veth.h>
#include <linux/module.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>

#define DRV_NAME	"veth"
#define DRV_VERSION	"1.0"

#define VETH_XDP_FLAG		BIT(0)
#define VETH_RING_SIZE		256
#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)

/* Separating two types of XDP xmit */
#define VETH_XDP_TX	BIT(0)
#define VETH_XDP_REDIR	BIT(1)

struct veth_rq_stats {
	u64 xdp_packets;
	u64 xdp_bytes;
	u64 xdp_drops;
	struct u64_stats_sync syncp;
};

struct veth_rq {
	struct napi_struct xdp_napi;
	struct net_device *dev;
	struct bpf_prog __rcu *xdp_prog;
	struct xdp_mem_info xdp_mem;
	struct veth_rq_stats stats;
	bool rx_notify_masked;
	struct ptr_ring xdp_ring;
	struct xdp_rxq_info xdp_rxq;
};

struct veth_priv {
	struct net_device __rcu	*peer;
	atomic64_t dropped;
	struct bpf_prog *_xdp_prog;
	struct veth_rq *rq;
	unsigned int requested_headroom;
};

/*
 * ethtool interface
 */

struct veth_q_stat_desc {
	char desc[ETH_GSTRING_LEN];
	size_t offset;
};

#define VETH_RQ_STAT(m)	offsetof(struct veth_rq_stats, m)

static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
	{ "xdp_packets", VETH_RQ_STAT(xdp_packets) },
	{ "xdp_bytes",   VETH_RQ_STAT(xdp_bytes) },
	{ "xdp_drops",   VETH_RQ_STAT(xdp_drops) },
};

#define VETH_RQ_STATS_LEN	ARRAY_SIZE(veth_rq_stats_desc)

static struct {
	const char string[ETH_GSTRING_LEN];
} ethtool_stats_keys[] = {
	{ "peer_ifindex" },
};

static int veth_get_link_ksettings(struct net_device *dev,
				   struct ethtool_link_ksettings *cmd)
{
	cmd->base.speed = SPEED_10000;
	cmd->base.duplex = DUPLEX_FULL;
	cmd->base.port = PORT_TP;
	cmd->base.autoneg = AUTONEG_DISABLE;
	return 0;
}

static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
}

static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
{
	char *p = (char *)buf;
	int i, j;

	switch (stringset) {
	case ETH_SS_STATS:
		memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
		p += sizeof(ethtool_stats_keys);
		for (i = 0; i < dev->real_num_rx_queues; i++) {
			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN, "rx_queue_%u_%s",
					 i, veth_rq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}
		break;
	}
}

static int veth_get_sset_count(struct net_device *dev, int sset)
{
	switch (sset) {
	case ETH_SS_STATS:
		return ARRAY_SIZE(ethtool_stats_keys) +
		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues;
	default:
		return -EOPNOTSUPP;
	}
}

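/* ethtool stats layout: data[0] is the peer ifindex, followed by
 * VETH_RQ_STATS_LEN counters per rx queue in veth_rq_stats_desc order.
 * Each queue's counters are copied inside a u64_stats fetch/retry loop so
 * a consistent snapshot is reported even while the NAPI handler is
 * updating them.
 */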
static void veth_get_ethtool_stats(struct net_device *dev,
				   struct ethtool_stats *stats, u64 *data)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int i, j, idx;

	data[0] = peer ? peer->ifindex : 0;
	idx = 1;
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		const struct veth_rq_stats *rq_stats = &priv->rq[i].stats;
		const void *stats_base = (void *)rq_stats;
		unsigned int start;
		size_t offset;

		do {
			start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
				offset = veth_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
		idx += VETH_RQ_STATS_LEN;
	}
}

static int veth_get_ts_info(struct net_device *dev,
			    struct ethtool_ts_info *info)
{
	info->so_timestamping =
		SOF_TIMESTAMPING_TX_SOFTWARE |
		SOF_TIMESTAMPING_RX_SOFTWARE |
		SOF_TIMESTAMPING_SOFTWARE;
	info->phc_index = -1;

	return 0;
}

static const struct ethtool_ops veth_ethtool_ops = {
	.get_drvinfo = veth_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_strings = veth_get_strings,
	.get_sset_count = veth_get_sset_count,
	.get_ethtool_stats = veth_get_ethtool_stats,
	.get_link_ksettings = veth_get_link_ksettings,
	.get_ts_info = veth_get_ts_info,
};

/* general routines */

static bool veth_is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & VETH_XDP_FLAG;
}

static void *veth_ptr_to_xdp(void *ptr)
{
	return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
}

static void *veth_xdp_to_ptr(void *ptr)
{
	return (void *)((unsigned long)ptr | VETH_XDP_FLAG);
}

static void veth_ptr_free(void *ptr)
{
	if (veth_is_xdp_frame(ptr))
		xdp_return_frame(veth_ptr_to_xdp(ptr));
	else
		kfree_skb(ptr);
}

static void __veth_xdp_flush(struct veth_rq *rq)
{
	/* Write ptr_ring before reading rx_notify_masked */
	smp_mb();
	if (!rq->rx_notify_masked) {
		rq->rx_notify_masked = true;
		napi_schedule(&rq->xdp_napi);
	}
}

static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
{
	if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
		dev_kfree_skb_any(skb);
		return NET_RX_DROP;
	}

	return NET_RX_SUCCESS;
}

static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
			    struct veth_rq *rq, bool xdp)
{
	return __dev_forward_skb(dev, skb) ?: xdp ?
		veth_xdp_rx(rq, skb) :
		netif_rx(skb);
}

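/* ndo_start_xmit for both ends of the pair.  The skb is handed straight
 * to the peer: if the receiving queue has an XDP program attached it is
 * queued on that queue's xdp_ring and processed from the peer's NAPI
 * poll, otherwise it goes through netif_rx().  TX byte/packet counters
 * are only bumped here for the non-XDP case; XDP traffic is accounted in
 * the peer's per-queue rx stats and folded back in veth_get_stats64().
 */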
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct veth_rq *rq = NULL;
	struct net_device *rcv;
	int length = skb->len;
	bool rcv_xdp = false;
	int rxq;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv)) {
		kfree_skb(skb);
		goto drop;
	}

	rcv_priv = netdev_priv(rcv);
	rxq = skb_get_queue_mapping(skb);
	if (rxq < rcv->real_num_rx_queues) {
		rq = &rcv_priv->rq[rxq];
		rcv_xdp = rcu_access_pointer(rq->xdp_prog);
		if (rcv_xdp)
			skb_record_rx_queue(skb, rxq);
	}

	skb_tx_timestamp(skb);
	if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
		if (!rcv_xdp) {
			struct pcpu_lstats *stats = this_cpu_ptr(dev->lstats);

			u64_stats_update_begin(&stats->syncp);
			stats->bytes += length;
			stats->packets++;
			u64_stats_update_end(&stats->syncp);
		}
	} else {
drop:
		atomic64_inc(&priv->dropped);
	}

	if (rcv_xdp)
		__veth_xdp_flush(rq);

	rcu_read_unlock();

	return NETDEV_TX_OK;
}

static u64 veth_stats_tx(struct pcpu_lstats *result, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int cpu;

	result->packets = 0;
	result->bytes = 0;
	for_each_possible_cpu(cpu) {
		struct pcpu_lstats *stats = per_cpu_ptr(dev->lstats, cpu);
		u64 packets, bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
		result->packets += packets;
		result->bytes += bytes;
	}
	return atomic64_read(&priv->dropped);
}

static void veth_stats_rx(struct veth_rq_stats *result, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	result->xdp_packets = 0;
	result->xdp_bytes = 0;
	result->xdp_drops = 0;
	for (i = 0; i < dev->num_rx_queues; i++) {
		struct veth_rq_stats *stats = &priv->rq[i].stats;
		u64 packets, bytes, drops;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->xdp_packets;
			bytes = stats->xdp_bytes;
			drops = stats->xdp_drops;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
		result->xdp_packets += packets;
		result->xdp_bytes += bytes;
		result->xdp_drops += drops;
	}
}

static void veth_get_stats64(struct net_device *dev,
			     struct rtnl_link_stats64 *tot)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	struct veth_rq_stats rx;
	struct pcpu_lstats tx;

	tot->tx_dropped = veth_stats_tx(&tx, dev);
	tot->tx_bytes = tx.bytes;
	tot->tx_packets = tx.packets;

	veth_stats_rx(&rx, dev);
	tot->rx_dropped = rx.xdp_drops;
	tot->rx_bytes = rx.xdp_bytes;
	tot->rx_packets = rx.xdp_packets;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (peer) {
		tot->rx_dropped += veth_stats_tx(&tx, peer);
		tot->rx_bytes += tx.bytes;
		tot->rx_packets += tx.packets;

		veth_stats_rx(&rx, peer);
		tot->tx_bytes += rx.xdp_bytes;
		tot->tx_packets += rx.xdp_packets;
	}
	rcu_read_unlock();
}

/* fake multicast ability */
static void veth_set_multicast_list(struct net_device *dev)
{
}

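/* Build an skb around a buffer that already contains the frame.  A buflen
 * of zero means the buffer holds exactly one frame (the xdp_frame case)
 * and its size is derived from headroom + len plus the trailing
 * skb_shared_info, matching the layout build_skb() expects.
 */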
static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
				      int buflen)
{
	struct sk_buff *skb;

	if (!buflen) {
		buflen = SKB_DATA_ALIGN(headroom + len) +
			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	}
	skb = build_skb(head, buflen);
	if (!skb)
		return NULL;

	skb_reserve(skb, headroom);
	skb_put(skb, len);

	return skb;
}

static int veth_select_rxq(struct net_device *dev)
{
	return smp_processor_id() % dev->real_num_rx_queues;
}

static int veth_xdp_xmit(struct net_device *dev, int n,
			 struct xdp_frame **frames, u32 flags)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *rcv;
	int i, ret, drops = n;
	unsigned int max_len;
	struct veth_rq *rq;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
		ret = -EINVAL;
		goto drop;
	}

	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv)) {
		ret = -ENXIO;
		goto drop;
	}

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
	 * side. This means an XDP program is loaded on the peer and the peer
	 * device is up.
	 */
	if (!rcu_access_pointer(rq->xdp_prog)) {
		ret = -ENXIO;
		goto drop;
	}

	drops = 0;
	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;

	spin_lock(&rq->xdp_ring.producer_lock);
	for (i = 0; i < n; i++) {
		struct xdp_frame *frame = frames[i];
		void *ptr = veth_xdp_to_ptr(frame);

		if (unlikely(frame->len > max_len ||
			     __ptr_ring_produce(&rq->xdp_ring, ptr))) {
			xdp_return_frame_rx_napi(frame);
			drops++;
		}
	}
	spin_unlock(&rq->xdp_ring.producer_lock);

	if (flags & XDP_XMIT_FLUSH)
		__veth_xdp_flush(rq);

	if (likely(!drops))
		return n;

	ret = n - drops;
drop:
	atomic64_add(drops, &priv->dropped);

	return ret;
}

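/* Kick the peer's NAPI so that frames queued above (via ndo_xdp_xmit or
 * XDP_TX through veth_xdp_tx() below) get consumed.  This only makes
 * sense while the peer has an XDP program attached, which is also what
 * guarantees its xdp_ring is initialized.
 */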
static void veth_xdp_flush(struct net_device *dev)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *rcv;
	struct veth_rq *rq;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* xdp_ring is initialized on receive side? */
	if (unlikely(!rcu_access_pointer(rq->xdp_prog)))
		goto out;

	__veth_xdp_flush(rq);
out:
	rcu_read_unlock();
}

static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
{
	struct xdp_frame *frame = convert_to_xdp_frame(xdp);

	if (unlikely(!frame))
		return -EOVERFLOW;

	return veth_xdp_xmit(dev, 1, &frame, 0);
}

static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
					struct xdp_frame *frame,
					unsigned int *xdp_xmit)
{
	void *hard_start = frame->data - frame->headroom;
	void *head = hard_start - sizeof(struct xdp_frame);
	int len = frame->len, delta = 0;
	struct xdp_frame orig_frame;
	struct bpf_prog *xdp_prog;
	unsigned int headroom;
	struct sk_buff *skb;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (likely(xdp_prog)) {
		struct xdp_buff xdp;
		u32 act;

		xdp.data_hard_start = hard_start;
		xdp.data = frame->data;
		xdp.data_end = frame->data + frame->len;
		xdp.data_meta = frame->data - frame->metasize;
		xdp.rxq = &rq->xdp_rxq;

		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			delta = frame->data - xdp.data;
			len = xdp.data_end - xdp.data;
			break;
		case XDP_TX:
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
				trace_xdp_exception(rq->dev, xdp_prog, act);
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_TX;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_REDIR;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			/* fall through */
		case XDP_ABORTED:
			trace_xdp_exception(rq->dev, xdp_prog, act);
			/* fall through */
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	headroom = sizeof(struct xdp_frame) + frame->headroom - delta;
	skb = veth_build_skb(head, headroom, len, 0);
	if (!skb) {
		xdp_return_frame(frame);
		goto err;
	}

	xdp_scrub_frame(frame);
	skb->protocol = eth_type_trans(skb, rq->dev);
err:
	return skb;
err_xdp:
	rcu_read_unlock();
	xdp_return_frame(frame);
xdp_xmit:
	return NULL;
}

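/* Run the XDP program on an skb that was queued by the peer's regular
 * (non-XDP) transmit path.  Shared skbs, skbs whose head cannot be
 * written, nonlinear skbs, and skbs with less than XDP_PACKET_HEADROOM of
 * headroom are first copied into a freshly allocated page with
 * VETH_XDP_HEADROOM reserved, so the program can freely adjust head and
 * tail room.
 */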
static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb,
					unsigned int *xdp_xmit)
{
	u32 pktlen, headroom, act, metalen;
	void *orig_data, *orig_data_end;
	struct bpf_prog *xdp_prog;
	int mac_len, delta, off;
	struct xdp_buff xdp;

	skb_orphan(skb);

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (unlikely(!xdp_prog)) {
		rcu_read_unlock();
		goto out;
	}

	mac_len = skb->data - skb_mac_header(skb);
	pktlen = skb->len + mac_len;
	headroom = skb_headroom(skb) - mac_len;

	if (skb_shared(skb) || skb_head_is_locked(skb) ||
	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
		struct sk_buff *nskb;
		int size, head_off;
		void *head, *start;
		struct page *page;

		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (size > PAGE_SIZE)
			goto drop;

		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
		if (!page)
			goto drop;

		head = page_address(page);
		start = head + VETH_XDP_HEADROOM;
		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
			page_frag_free(head);
			goto drop;
		}

		nskb = veth_build_skb(head,
				      VETH_XDP_HEADROOM + mac_len, skb->len,
				      PAGE_SIZE);
		if (!nskb) {
			page_frag_free(head);
			goto drop;
		}

		skb_copy_header(nskb, skb);
		head_off = skb_headroom(nskb) - skb_headroom(skb);
		skb_headers_offset_update(nskb, head_off);
		consume_skb(skb);
		skb = nskb;
	}

	xdp.data_hard_start = skb->head;
	xdp.data = skb_mac_header(skb);
	xdp.data_end = xdp.data + pktlen;
	xdp.data_meta = xdp.data;
	xdp.rxq = &rq->xdp_rxq;
	orig_data = xdp.data;
	orig_data_end = xdp.data_end;

	act = bpf_prog_run_xdp(xdp_prog, &xdp);

	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
			trace_xdp_exception(rq->dev, xdp_prog, act);
			goto err_xdp;
		}
		*xdp_xmit |= VETH_XDP_TX;
		rcu_read_unlock();
		goto xdp_xmit;
	case XDP_REDIRECT:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog))
			goto err_xdp;
		*xdp_xmit |= VETH_XDP_REDIR;
		rcu_read_unlock();
		goto xdp_xmit;
	default:
		bpf_warn_invalid_xdp_action(act);
		/* fall through */
	case XDP_ABORTED:
		trace_xdp_exception(rq->dev, xdp_prog, act);
		/* fall through */
	case XDP_DROP:
		goto drop;
	}
	rcu_read_unlock();

	delta = orig_data - xdp.data;
	off = mac_len + delta;
	if (off > 0)
		__skb_push(skb, off);
	else if (off < 0)
		__skb_pull(skb, -off);
	skb->mac_header -= delta;
	off = xdp.data_end - orig_data_end;
	if (off != 0)
		__skb_put(skb, off);
	skb->protocol = eth_type_trans(skb, rq->dev);

	metalen = xdp.data - xdp.data_meta;
	if (metalen)
		skb_metadata_set(skb, metalen);
out:
	return skb;
drop:
	rcu_read_unlock();
	kfree_skb(skb);
	return NULL;
err_xdp:
	rcu_read_unlock();
	page_frag_free(xdp.data);
xdp_xmit:
	return NULL;
}

static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit)
{
	int i, done = 0, drops = 0, bytes = 0;

	for (i = 0; i < budget; i++) {
		void *ptr = __ptr_ring_consume(&rq->xdp_ring);
		unsigned int xdp_xmit_one = 0;
		struct sk_buff *skb;

		if (!ptr)
			break;

		if (veth_is_xdp_frame(ptr)) {
			struct xdp_frame *frame = veth_ptr_to_xdp(ptr);

			bytes += frame->len;
			skb = veth_xdp_rcv_one(rq, frame, &xdp_xmit_one);
		} else {
			skb = ptr;
			bytes += skb->len;
			skb = veth_xdp_rcv_skb(rq, skb, &xdp_xmit_one);
		}
		*xdp_xmit |= xdp_xmit_one;

		if (skb)
			napi_gro_receive(&rq->xdp_napi, skb);
		else if (!xdp_xmit_one)
			drops++;

		done++;
	}

	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.xdp_packets += done;
	rq->stats.xdp_bytes += bytes;
	rq->stats.xdp_drops += drops;
	u64_stats_update_end(&rq->stats.syncp);

	return done;
}

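/* NAPI handler.  Drains up to @budget entries from the xdp_ring.  On
 * completion, rx_notify_masked is cleared with smp_store_mb() and the
 * ring is re-checked, so a producer racing with napi_complete_done()
 * cannot leave entries behind without a reschedule.  Pending XDP_TX and
 * XDP_REDIRECT work is flushed before returning.
 */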
static int veth_poll(struct napi_struct *napi, int budget)
{
	struct veth_rq *rq =
		container_of(napi, struct veth_rq, xdp_napi);
	unsigned int xdp_xmit = 0;
	int done;

	xdp_set_return_frame_no_direct();
	done = veth_xdp_rcv(rq, budget, &xdp_xmit);

	if (done < budget && napi_complete_done(napi, done)) {
		/* Write rx_notify_masked before reading ptr_ring */
		smp_store_mb(rq->rx_notify_masked, false);
		if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
			rq->rx_notify_masked = true;
			napi_schedule(&rq->xdp_napi);
		}
	}

	if (xdp_xmit & VETH_XDP_TX)
		veth_xdp_flush(rq->dev);
	if (xdp_xmit & VETH_XDP_REDIR)
		xdp_do_flush_map();
	xdp_clear_return_frame_no_direct();

	return done;
}

static int veth_napi_add(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
		if (err)
			goto err_xdp_ring;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
		napi_enable(&rq->xdp_napi);
	}

	return 0;
err_xdp_ring:
	for (i--; i >= 0; i--)
		ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);

	return err;
}

static void veth_napi_del(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		napi_disable(&rq->xdp_napi);
		napi_hash_del(&rq->xdp_napi);
	}
	synchronize_net();

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_del(&rq->xdp_napi);
		rq->rx_notify_masked = false;
		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
	}
}

static int veth_enable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
		for (i = 0; i < dev->real_num_rx_queues; i++) {
			struct veth_rq *rq = &priv->rq[i];

			err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i);
			if (err < 0)
				goto err_rxq_reg;

			err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
							 MEM_TYPE_PAGE_SHARED,
							 NULL);
			if (err < 0)
				goto err_reg_mem;

			/* Save original mem info as it can be overwritten */
			rq->xdp_mem = rq->xdp_rxq.mem;
		}

		err = veth_napi_add(dev);
		if (err)
			goto err_rxq_reg;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);

	return 0;
err_reg_mem:
	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
	for (i--; i >= 0; i--)
		xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);

	return err;
}

static void veth_disable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
	veth_napi_del(dev);
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rq->xdp_rxq.mem = rq->xdp_mem;
		xdp_rxq_info_unreg(&rq->xdp_rxq);
	}
}

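/* Carrier is only asserted once both ends of the pair are up.  If an XDP
 * program was attached while the device was down, the per-queue rings and
 * NAPI contexts are set up here on open and torn down again in
 * veth_close().
 */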
static int veth_open(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int err;

	if (!peer)
		return -ENOTCONN;

	if (priv->_xdp_prog) {
		err = veth_enable_xdp(dev);
		if (err)
			return err;
	}

	if (peer->flags & IFF_UP) {
		netif_carrier_on(dev);
		netif_carrier_on(peer);
	}

	return 0;
}

static int veth_close(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	netif_carrier_off(dev);
	if (peer)
		netif_carrier_off(peer);

	if (priv->_xdp_prog)
		veth_disable_xdp(dev);

	return 0;
}

static int is_valid_veth_mtu(int mtu)
{
	return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
}

static int veth_alloc_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL);
	if (!priv->rq)
		return -ENOMEM;

	for (i = 0; i < dev->num_rx_queues; i++) {
		priv->rq[i].dev = dev;
		u64_stats_init(&priv->rq[i].stats.syncp);
	}

	return 0;
}

static void veth_free_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);

	kfree(priv->rq);
}

static int veth_dev_init(struct net_device *dev)
{
	int err;

	dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
	if (!dev->lstats)
		return -ENOMEM;

	err = veth_alloc_queues(dev);
	if (err) {
		free_percpu(dev->lstats);
		return err;
	}

	return 0;
}

static void veth_dev_free(struct net_device *dev)
{
	veth_free_queues(dev);
	free_percpu(dev->lstats);
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void veth_poll_controller(struct net_device *dev)
{
	/* veth only receives frames when its peer sends one.
	 * Since it has nothing to do with disabling irqs, we are guaranteed
	 * never to have pending data when we poll for it so
	 * there is nothing to do here.
	 *
	 * We need this though so netpoll recognizes us as an interface that
	 * supports polling, which enables bridge devices in virt setups to
	 * still use netconsole.
	 */
}
#endif	/* CONFIG_NET_POLL_CONTROLLER */

static int veth_get_iflink(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	int iflink;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	iflink = peer ? peer->ifindex : 0;
	rcu_read_unlock();

	return iflink;
}

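/* While the peer has an XDP program loaded, software GSO is masked out on
 * this side so the program never sees oversized GSO skbs, which could not
 * be linearized into a single page on the receive side.
 */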
static netdev_features_t veth_fix_features(struct net_device *dev,
					   netdev_features_t features)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	peer = rtnl_dereference(priv->peer);
	if (peer) {
		struct veth_priv *peer_priv = netdev_priv(peer);

		if (peer_priv->_xdp_prog)
			features &= ~NETIF_F_GSO_SOFTWARE;
	}

	return features;
}

static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
	struct net_device *peer;

	if (new_hr < 0)
		new_hr = 0;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (unlikely(!peer))
		goto out;

	peer_priv = netdev_priv(peer);
	priv->requested_headroom = new_hr;
	new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
	dev->needed_headroom = new_hr;
	peer->needed_headroom = new_hr;

out:
	rcu_read_unlock();
}

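/* Attach or detach an XDP program.  Attaching requires that the peer
 * exists, that the peer's MTU (plus hard header and skb_shared_info) fits
 * into a single page after reserving VETH_XDP_HEADROOM, and that this
 * device has at least as many rx queues as the peer has tx queues.  While
 * a program is attached, GSO is turned off on the peer and its max_mtu is
 * clamped accordingly; both are restored on detach.
 */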
static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			struct netlink_ext_ack *extack)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct bpf_prog *old_prog;
	struct net_device *peer;
	unsigned int max_mtu;
	int err;

	old_prog = priv->_xdp_prog;
	priv->_xdp_prog = prog;
	peer = rtnl_dereference(priv->peer);

	if (prog) {
		if (!peer) {
			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
			err = -ENOTCONN;
			goto err;
		}

		max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
			  peer->hard_header_len -
			  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (peer->mtu > max_mtu) {
			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
			err = -ERANGE;
			goto err;
		}

		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
			NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
			err = -ENOSPC;
			goto err;
		}

		if (dev->flags & IFF_UP) {
			err = veth_enable_xdp(dev);
			if (err) {
				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
				goto err;
			}
		}

		if (!old_prog) {
			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
			peer->max_mtu = max_mtu;
		}
	}

	if (old_prog) {
		if (!prog) {
			if (dev->flags & IFF_UP)
				veth_disable_xdp(dev);

			if (peer) {
				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
				peer->max_mtu = ETH_MAX_MTU;
			}
		}
		bpf_prog_put(old_prog);
	}

	if ((!!old_prog ^ !!prog) && peer)
		netdev_update_features(peer);

	return 0;
err:
	priv->_xdp_prog = old_prog;

	return err;
}

static u32 veth_xdp_query(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	const struct bpf_prog *xdp_prog;

	xdp_prog = priv->_xdp_prog;
	if (xdp_prog)
		return xdp_prog->aux->id;

	return 0;
}

static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return veth_xdp_set(dev, xdp->prog, xdp->extack);
	case XDP_QUERY_PROG:
		xdp->prog_id = veth_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops veth_netdev_ops = {
	.ndo_init = veth_dev_init,
	.ndo_open = veth_open,
	.ndo_stop = veth_close,
	.ndo_start_xmit = veth_xmit,
	.ndo_get_stats64 = veth_get_stats64,
	.ndo_set_rx_mode = veth_set_multicast_list,
	.ndo_set_mac_address = eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = veth_poll_controller,
#endif
	.ndo_get_iflink = veth_get_iflink,
	.ndo_fix_features = veth_fix_features,
	.ndo_features_check = passthru_features_check,
	.ndo_set_rx_headroom = veth_set_rx_headroom,
	.ndo_bpf = veth_xdp,
	.ndo_xdp_xmit = veth_xdp_xmit,
};

#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
		       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
		       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX)

static void veth_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_PHONY_HEADROOM;

	dev->netdev_ops = &veth_netdev_ops;
	dev->ethtool_ops = &veth_ethtool_ops;
	dev->features |= NETIF_F_LLTX;
	dev->features |= VETH_FEATURES;
	dev->vlan_features = dev->features &
			     ~(NETIF_F_HW_VLAN_CTAG_TX |
			       NETIF_F_HW_VLAN_STAG_TX |
			       NETIF_F_HW_VLAN_CTAG_RX |
			       NETIF_F_HW_VLAN_STAG_RX);
	dev->needs_free_netdev = true;
	dev->priv_destructor = veth_dev_free;
	dev->max_mtu = ETH_MAX_MTU;

	dev->hw_features = VETH_FEATURES;
	dev->hw_enc_features = VETH_FEATURES;
	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
}

/*
 * netlink interface
 */

static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}
	if (tb[IFLA_MTU]) {
		if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
			return -EINVAL;
	}
	return 0;
}

static struct rtnl_link_ops veth_link_ops;

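/* Create both ends of the pair.  The peer is created and registered first
 * (optionally in another netns and with attributes from the
 * VETH_INFO_PEER nest), then this device, and finally the two priv->peer
 * pointers are tied together under RTNL.  From userspace this is what
 * e.g. "ip link add veth0 type veth peer name veth1" ends up invoking.
 */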
static int veth_newlink(struct net *src_net, struct net_device *dev,
			struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	int err;
	struct net_device *peer;
	struct veth_priv *priv;
	char ifname[IFNAMSIZ];
	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
	unsigned char name_assign_type;
	struct ifinfomsg *ifmp;
	struct net *net;

	/*
	 * create and register peer first
	 */
	if (data != NULL && data[VETH_INFO_PEER] != NULL) {
		struct nlattr *nla_peer;

		nla_peer = data[VETH_INFO_PEER];
		ifmp = nla_data(nla_peer);
		err = rtnl_nla_parse_ifla(peer_tb,
					  nla_data(nla_peer) + sizeof(struct ifinfomsg),
					  nla_len(nla_peer) - sizeof(struct ifinfomsg),
					  NULL);
		if (err < 0)
			return err;

		err = veth_validate(peer_tb, NULL, extack);
		if (err < 0)
			return err;

		tbp = peer_tb;
	} else {
		ifmp = NULL;
		tbp = tb;
	}

	if (ifmp && tbp[IFLA_IFNAME]) {
		nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
		name_assign_type = NET_NAME_USER;
	} else {
		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
		name_assign_type = NET_NAME_ENUM;
	}

	net = rtnl_link_get_net(src_net, tbp);
	if (IS_ERR(net))
		return PTR_ERR(net);

	peer = rtnl_create_link(net, ifname, name_assign_type,
				&veth_link_ops, tbp, extack);
	if (IS_ERR(peer)) {
		put_net(net);
		return PTR_ERR(peer);
	}

	if (!ifmp || !tbp[IFLA_ADDRESS])
		eth_hw_addr_random(peer);

	if (ifmp && (dev->ifindex != 0))
		peer->ifindex = ifmp->ifi_index;

	peer->gso_max_size = dev->gso_max_size;
	peer->gso_max_segs = dev->gso_max_segs;

	err = register_netdevice(peer);
	put_net(net);
	net = NULL;
	if (err < 0)
		goto err_register_peer;

	netif_carrier_off(peer);

	err = rtnl_configure_link(peer, ifmp);
	if (err < 0)
		goto err_configure_peer;

	/*
	 * register dev last
	 *
	 * note, that since we've registered new device the dev's name
	 * should be re-allocated
	 */

	if (tb[IFLA_ADDRESS] == NULL)
		eth_hw_addr_random(dev);

	if (tb[IFLA_IFNAME])
		nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
	else
		snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");

	err = register_netdevice(dev);
	if (err < 0)
		goto err_register_dev;

	netif_carrier_off(dev);

	/*
	 * tie the devices together
	 */

	priv = netdev_priv(dev);
	rcu_assign_pointer(priv->peer, peer);

	priv = netdev_priv(peer);
	rcu_assign_pointer(priv->peer, dev);

	return 0;

err_register_dev:
	/* nothing to do */
err_configure_peer:
	unregister_netdevice(peer);
	return err;

err_register_peer:
	free_netdev(peer);
	return err;
}

static void veth_dellink(struct net_device *dev, struct list_head *head)
{
	struct veth_priv *priv;
	struct net_device *peer;

	priv = netdev_priv(dev);
	peer = rtnl_dereference(priv->peer);

	/* Note : dellink() is called from default_device_exit_batch(),
	 * before a rcu_synchronize() point. The devices are guaranteed
	 * not being freed before one RCU grace period.
	 */
	RCU_INIT_POINTER(priv->peer, NULL);
	unregister_netdevice_queue(dev, head);

	if (peer) {
		priv = netdev_priv(peer);
		RCU_INIT_POINTER(priv->peer, NULL);
		unregister_netdevice_queue(peer, head);
	}
}

static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
	[VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) },
};

static struct net *veth_get_link_net(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	return peer ? dev_net(peer) : dev_net(dev);
}

static struct rtnl_link_ops veth_link_ops = {
	.kind = DRV_NAME,
	.priv_size = sizeof(struct veth_priv),
	.setup = veth_setup,
	.validate = veth_validate,
	.newlink = veth_newlink,
	.dellink = veth_dellink,
	.policy = veth_policy,
	.maxtype = VETH_INFO_MAX,
	.get_link_net = veth_get_link_net,
};

/*
 * init/fini
 */

static __init int veth_init(void)
{
	return rtnl_link_register(&veth_link_ops);
}

static __exit void veth_exit(void)
{
	rtnl_link_unregister(&veth_link_ops);
}

module_init(veth_init);
module_exit(veth_exit);

MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);