1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/xfrm.h> 21 #include <net/xdp.h> 22 #include <linux/veth.h> 23 #include <linux/module.h> 24 #include <linux/bpf.h> 25 #include <linux/filter.h> 26 #include <linux/ptr_ring.h> 27 #include <linux/bpf_trace.h> 28 #include <linux/net_tstamp.h> 29 30 #define DRV_NAME "veth" 31 #define DRV_VERSION "1.0" 32 33 #define VETH_XDP_FLAG BIT(0) 34 #define VETH_RING_SIZE 256 35 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 36 37 /* Separating two types of XDP xmit */ 38 #define VETH_XDP_TX BIT(0) 39 #define VETH_XDP_REDIR BIT(1) 40 41 struct veth_rq_stats { 42 u64 xdp_packets; 43 u64 xdp_bytes; 44 u64 xdp_drops; 45 struct u64_stats_sync syncp; 46 }; 47 48 struct veth_rq { 49 struct napi_struct xdp_napi; 50 struct net_device *dev; 51 struct bpf_prog __rcu *xdp_prog; 52 struct xdp_mem_info xdp_mem; 53 struct veth_rq_stats stats; 54 bool rx_notify_masked; 55 struct ptr_ring xdp_ring; 56 struct xdp_rxq_info xdp_rxq; 57 }; 58 59 struct veth_priv { 60 struct net_device __rcu *peer; 61 atomic64_t dropped; 62 struct bpf_prog *_xdp_prog; 63 struct veth_rq *rq; 64 unsigned int requested_headroom; 65 }; 66 67 /* 68 * ethtool interface 69 */ 70 71 struct veth_q_stat_desc { 72 char desc[ETH_GSTRING_LEN]; 73 size_t offset; 74 }; 75 76 #define VETH_RQ_STAT(m) offsetof(struct veth_rq_stats, m) 77 78 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 79 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 80 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 81 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 82 }; 83 84 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 85 86 static struct { 87 const char string[ETH_GSTRING_LEN]; 88 } ethtool_stats_keys[] = { 89 { "peer_ifindex" }, 90 }; 91 92 static int veth_get_link_ksettings(struct net_device *dev, 93 struct ethtool_link_ksettings *cmd) 94 { 95 cmd->base.speed = SPEED_10000; 96 cmd->base.duplex = DUPLEX_FULL; 97 cmd->base.port = PORT_TP; 98 cmd->base.autoneg = AUTONEG_DISABLE; 99 return 0; 100 } 101 102 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 103 { 104 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 105 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 106 } 107 108 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 109 { 110 char *p = (char *)buf; 111 int i, j; 112 113 switch(stringset) { 114 case ETH_SS_STATS: 115 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 116 p += sizeof(ethtool_stats_keys); 117 for (i = 0; i < dev->real_num_rx_queues; i++) { 118 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 119 snprintf(p, ETH_GSTRING_LEN, 120 "rx_queue_%u_%.11s", 121 i, veth_rq_stats_desc[j].desc); 122 p += ETH_GSTRING_LEN; 123 } 124 } 125 break; 126 } 127 } 128 129 static int veth_get_sset_count(struct net_device *dev, int sset) 130 { 131 switch (sset) { 132 case ETH_SS_STATS: 133 return ARRAY_SIZE(ethtool_stats_keys) + 134 VETH_RQ_STATS_LEN * dev->real_num_rx_queues; 135 default: 136 return -EOPNOTSUPP; 137 } 138 } 139 140 static void veth_get_ethtool_stats(struct net_device *dev, 141 struct ethtool_stats *stats, u64 *data) 142 { 143 struct veth_priv *priv = netdev_priv(dev); 144 struct net_device *peer = rtnl_dereference(priv->peer); 145 int i, j, idx; 146 147 data[0] = peer ? peer->ifindex : 0; 148 idx = 1; 149 for (i = 0; i < dev->real_num_rx_queues; i++) { 150 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 151 const void *stats_base = (void *)rq_stats; 152 unsigned int start; 153 size_t offset; 154 155 do { 156 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 157 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 158 offset = veth_rq_stats_desc[j].offset; 159 data[idx + j] = *(u64 *)(stats_base + offset); 160 } 161 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 162 idx += VETH_RQ_STATS_LEN; 163 } 164 } 165 166 static const struct ethtool_ops veth_ethtool_ops = { 167 .get_drvinfo = veth_get_drvinfo, 168 .get_link = ethtool_op_get_link, 169 .get_strings = veth_get_strings, 170 .get_sset_count = veth_get_sset_count, 171 .get_ethtool_stats = veth_get_ethtool_stats, 172 .get_link_ksettings = veth_get_link_ksettings, 173 .get_ts_info = ethtool_op_get_ts_info, 174 }; 175 176 /* general routines */ 177 178 static bool veth_is_xdp_frame(void *ptr) 179 { 180 return (unsigned long)ptr & VETH_XDP_FLAG; 181 } 182 183 static void *veth_ptr_to_xdp(void *ptr) 184 { 185 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 186 } 187 188 static void *veth_xdp_to_ptr(void *ptr) 189 { 190 return (void *)((unsigned long)ptr | VETH_XDP_FLAG); 191 } 192 193 static void veth_ptr_free(void *ptr) 194 { 195 if (veth_is_xdp_frame(ptr)) 196 xdp_return_frame(veth_ptr_to_xdp(ptr)); 197 else 198 kfree_skb(ptr); 199 } 200 201 static void __veth_xdp_flush(struct veth_rq *rq) 202 { 203 /* Write ptr_ring before reading rx_notify_masked */ 204 smp_mb(); 205 if (!rq->rx_notify_masked) { 206 rq->rx_notify_masked = true; 207 napi_schedule(&rq->xdp_napi); 208 } 209 } 210 211 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 212 { 213 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 214 dev_kfree_skb_any(skb); 215 return NET_RX_DROP; 216 } 217 218 return NET_RX_SUCCESS; 219 } 220 221 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 222 struct veth_rq *rq, bool xdp) 223 { 224 return __dev_forward_skb(dev, skb) ?: xdp ? 225 veth_xdp_rx(rq, skb) : 226 netif_rx(skb); 227 } 228 229 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 230 { 231 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 232 struct veth_rq *rq = NULL; 233 struct net_device *rcv; 234 int length = skb->len; 235 bool rcv_xdp = false; 236 int rxq; 237 238 rcu_read_lock(); 239 rcv = rcu_dereference(priv->peer); 240 if (unlikely(!rcv)) { 241 kfree_skb(skb); 242 goto drop; 243 } 244 245 rcv_priv = netdev_priv(rcv); 246 rxq = skb_get_queue_mapping(skb); 247 if (rxq < rcv->real_num_rx_queues) { 248 rq = &rcv_priv->rq[rxq]; 249 rcv_xdp = rcu_access_pointer(rq->xdp_prog); 250 if (rcv_xdp) 251 skb_record_rx_queue(skb, rxq); 252 } 253 254 skb_tx_timestamp(skb); 255 if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { 256 if (!rcv_xdp) { 257 struct pcpu_lstats *stats = this_cpu_ptr(dev->lstats); 258 259 u64_stats_update_begin(&stats->syncp); 260 stats->bytes += length; 261 stats->packets++; 262 u64_stats_update_end(&stats->syncp); 263 } 264 } else { 265 drop: 266 atomic64_inc(&priv->dropped); 267 } 268 269 if (rcv_xdp) 270 __veth_xdp_flush(rq); 271 272 rcu_read_unlock(); 273 274 return NETDEV_TX_OK; 275 } 276 277 static u64 veth_stats_tx(struct pcpu_lstats *result, struct net_device *dev) 278 { 279 struct veth_priv *priv = netdev_priv(dev); 280 int cpu; 281 282 result->packets = 0; 283 result->bytes = 0; 284 for_each_possible_cpu(cpu) { 285 struct pcpu_lstats *stats = per_cpu_ptr(dev->lstats, cpu); 286 u64 packets, bytes; 287 unsigned int start; 288 289 do { 290 start = u64_stats_fetch_begin_irq(&stats->syncp); 291 packets = stats->packets; 292 bytes = stats->bytes; 293 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 294 result->packets += packets; 295 result->bytes += bytes; 296 } 297 return atomic64_read(&priv->dropped); 298 } 299 300 static void veth_stats_rx(struct veth_rq_stats *result, struct net_device *dev) 301 { 302 struct veth_priv *priv = netdev_priv(dev); 303 int i; 304 305 result->xdp_packets = 0; 306 result->xdp_bytes = 0; 307 result->xdp_drops = 0; 308 for (i = 0; i < dev->num_rx_queues; i++) { 309 struct veth_rq_stats *stats = &priv->rq[i].stats; 310 u64 packets, bytes, drops; 311 unsigned int start; 312 313 do { 314 start = u64_stats_fetch_begin_irq(&stats->syncp); 315 packets = stats->xdp_packets; 316 bytes = stats->xdp_bytes; 317 drops = stats->xdp_drops; 318 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 319 result->xdp_packets += packets; 320 result->xdp_bytes += bytes; 321 result->xdp_drops += drops; 322 } 323 } 324 325 static void veth_get_stats64(struct net_device *dev, 326 struct rtnl_link_stats64 *tot) 327 { 328 struct veth_priv *priv = netdev_priv(dev); 329 struct net_device *peer; 330 struct veth_rq_stats rx; 331 struct pcpu_lstats tx; 332 333 tot->tx_dropped = veth_stats_tx(&tx, dev); 334 tot->tx_bytes = tx.bytes; 335 tot->tx_packets = tx.packets; 336 337 veth_stats_rx(&rx, dev); 338 tot->rx_dropped = rx.xdp_drops; 339 tot->rx_bytes = rx.xdp_bytes; 340 tot->rx_packets = rx.xdp_packets; 341 342 rcu_read_lock(); 343 peer = rcu_dereference(priv->peer); 344 if (peer) { 345 tot->rx_dropped += veth_stats_tx(&tx, peer); 346 tot->rx_bytes += tx.bytes; 347 tot->rx_packets += tx.packets; 348 349 veth_stats_rx(&rx, peer); 350 tot->tx_bytes += rx.xdp_bytes; 351 tot->tx_packets += rx.xdp_packets; 352 } 353 rcu_read_unlock(); 354 } 355 356 /* fake multicast ability */ 357 static void veth_set_multicast_list(struct net_device *dev) 358 { 359 } 360 361 static struct sk_buff *veth_build_skb(void *head, int headroom, int len, 362 int buflen) 363 { 364 struct sk_buff *skb; 365 366 if (!buflen) { 367 buflen = SKB_DATA_ALIGN(headroom + len) + 368 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 369 } 370 skb = build_skb(head, buflen); 371 if (!skb) 372 return NULL; 373 374 skb_reserve(skb, headroom); 375 skb_put(skb, len); 376 377 return skb; 378 } 379 380 static int veth_select_rxq(struct net_device *dev) 381 { 382 return smp_processor_id() % dev->real_num_rx_queues; 383 } 384 385 static int veth_xdp_xmit(struct net_device *dev, int n, 386 struct xdp_frame **frames, u32 flags) 387 { 388 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 389 struct net_device *rcv; 390 int i, ret, drops = n; 391 unsigned int max_len; 392 struct veth_rq *rq; 393 394 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) { 395 ret = -EINVAL; 396 goto drop; 397 } 398 399 rcv = rcu_dereference(priv->peer); 400 if (unlikely(!rcv)) { 401 ret = -ENXIO; 402 goto drop; 403 } 404 405 rcv_priv = netdev_priv(rcv); 406 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 407 /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive 408 * side. This means an XDP program is loaded on the peer and the peer 409 * device is up. 410 */ 411 if (!rcu_access_pointer(rq->xdp_prog)) { 412 ret = -ENXIO; 413 goto drop; 414 } 415 416 drops = 0; 417 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 418 419 spin_lock(&rq->xdp_ring.producer_lock); 420 for (i = 0; i < n; i++) { 421 struct xdp_frame *frame = frames[i]; 422 void *ptr = veth_xdp_to_ptr(frame); 423 424 if (unlikely(frame->len > max_len || 425 __ptr_ring_produce(&rq->xdp_ring, ptr))) { 426 xdp_return_frame_rx_napi(frame); 427 drops++; 428 } 429 } 430 spin_unlock(&rq->xdp_ring.producer_lock); 431 432 if (flags & XDP_XMIT_FLUSH) 433 __veth_xdp_flush(rq); 434 435 if (likely(!drops)) 436 return n; 437 438 ret = n - drops; 439 drop: 440 atomic64_add(drops, &priv->dropped); 441 442 return ret; 443 } 444 445 static void veth_xdp_flush(struct net_device *dev) 446 { 447 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 448 struct net_device *rcv; 449 struct veth_rq *rq; 450 451 rcu_read_lock(); 452 rcv = rcu_dereference(priv->peer); 453 if (unlikely(!rcv)) 454 goto out; 455 456 rcv_priv = netdev_priv(rcv); 457 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 458 /* xdp_ring is initialized on receive side? */ 459 if (unlikely(!rcu_access_pointer(rq->xdp_prog))) 460 goto out; 461 462 __veth_xdp_flush(rq); 463 out: 464 rcu_read_unlock(); 465 } 466 467 static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) 468 { 469 struct xdp_frame *frame = convert_to_xdp_frame(xdp); 470 471 if (unlikely(!frame)) 472 return -EOVERFLOW; 473 474 return veth_xdp_xmit(dev, 1, &frame, 0); 475 } 476 477 static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, 478 struct xdp_frame *frame, 479 unsigned int *xdp_xmit) 480 { 481 void *hard_start = frame->data - frame->headroom; 482 void *head = hard_start - sizeof(struct xdp_frame); 483 int len = frame->len, delta = 0; 484 struct xdp_frame orig_frame; 485 struct bpf_prog *xdp_prog; 486 unsigned int headroom; 487 struct sk_buff *skb; 488 489 rcu_read_lock(); 490 xdp_prog = rcu_dereference(rq->xdp_prog); 491 if (likely(xdp_prog)) { 492 struct xdp_buff xdp; 493 u32 act; 494 495 xdp.data_hard_start = hard_start; 496 xdp.data = frame->data; 497 xdp.data_end = frame->data + frame->len; 498 xdp.data_meta = frame->data - frame->metasize; 499 xdp.rxq = &rq->xdp_rxq; 500 501 act = bpf_prog_run_xdp(xdp_prog, &xdp); 502 503 switch (act) { 504 case XDP_PASS: 505 delta = frame->data - xdp.data; 506 len = xdp.data_end - xdp.data; 507 break; 508 case XDP_TX: 509 orig_frame = *frame; 510 xdp.data_hard_start = head; 511 xdp.rxq->mem = frame->mem; 512 if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 513 trace_xdp_exception(rq->dev, xdp_prog, act); 514 frame = &orig_frame; 515 goto err_xdp; 516 } 517 *xdp_xmit |= VETH_XDP_TX; 518 rcu_read_unlock(); 519 goto xdp_xmit; 520 case XDP_REDIRECT: 521 orig_frame = *frame; 522 xdp.data_hard_start = head; 523 xdp.rxq->mem = frame->mem; 524 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 525 frame = &orig_frame; 526 goto err_xdp; 527 } 528 *xdp_xmit |= VETH_XDP_REDIR; 529 rcu_read_unlock(); 530 goto xdp_xmit; 531 default: 532 bpf_warn_invalid_xdp_action(act); 533 /* fall through */ 534 case XDP_ABORTED: 535 trace_xdp_exception(rq->dev, xdp_prog, act); 536 /* fall through */ 537 case XDP_DROP: 538 goto err_xdp; 539 } 540 } 541 rcu_read_unlock(); 542 543 headroom = sizeof(struct xdp_frame) + frame->headroom - delta; 544 skb = veth_build_skb(head, headroom, len, 0); 545 if (!skb) { 546 xdp_return_frame(frame); 547 goto err; 548 } 549 550 xdp_release_frame(frame); 551 xdp_scrub_frame(frame); 552 skb->protocol = eth_type_trans(skb, rq->dev); 553 err: 554 return skb; 555 err_xdp: 556 rcu_read_unlock(); 557 xdp_return_frame(frame); 558 xdp_xmit: 559 return NULL; 560 } 561 562 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, 563 unsigned int *xdp_xmit) 564 { 565 u32 pktlen, headroom, act, metalen; 566 void *orig_data, *orig_data_end; 567 struct bpf_prog *xdp_prog; 568 int mac_len, delta, off; 569 struct xdp_buff xdp; 570 571 skb_orphan(skb); 572 573 rcu_read_lock(); 574 xdp_prog = rcu_dereference(rq->xdp_prog); 575 if (unlikely(!xdp_prog)) { 576 rcu_read_unlock(); 577 goto out; 578 } 579 580 mac_len = skb->data - skb_mac_header(skb); 581 pktlen = skb->len + mac_len; 582 headroom = skb_headroom(skb) - mac_len; 583 584 if (skb_shared(skb) || skb_head_is_locked(skb) || 585 skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { 586 struct sk_buff *nskb; 587 int size, head_off; 588 void *head, *start; 589 struct page *page; 590 591 size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + 592 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 593 if (size > PAGE_SIZE) 594 goto drop; 595 596 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 597 if (!page) 598 goto drop; 599 600 head = page_address(page); 601 start = head + VETH_XDP_HEADROOM; 602 if (skb_copy_bits(skb, -mac_len, start, pktlen)) { 603 page_frag_free(head); 604 goto drop; 605 } 606 607 nskb = veth_build_skb(head, 608 VETH_XDP_HEADROOM + mac_len, skb->len, 609 PAGE_SIZE); 610 if (!nskb) { 611 page_frag_free(head); 612 goto drop; 613 } 614 615 skb_copy_header(nskb, skb); 616 head_off = skb_headroom(nskb) - skb_headroom(skb); 617 skb_headers_offset_update(nskb, head_off); 618 consume_skb(skb); 619 skb = nskb; 620 } 621 622 xdp.data_hard_start = skb->head; 623 xdp.data = skb_mac_header(skb); 624 xdp.data_end = xdp.data + pktlen; 625 xdp.data_meta = xdp.data; 626 xdp.rxq = &rq->xdp_rxq; 627 orig_data = xdp.data; 628 orig_data_end = xdp.data_end; 629 630 act = bpf_prog_run_xdp(xdp_prog, &xdp); 631 632 switch (act) { 633 case XDP_PASS: 634 break; 635 case XDP_TX: 636 get_page(virt_to_page(xdp.data)); 637 consume_skb(skb); 638 xdp.rxq->mem = rq->xdp_mem; 639 if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 640 trace_xdp_exception(rq->dev, xdp_prog, act); 641 goto err_xdp; 642 } 643 *xdp_xmit |= VETH_XDP_TX; 644 rcu_read_unlock(); 645 goto xdp_xmit; 646 case XDP_REDIRECT: 647 get_page(virt_to_page(xdp.data)); 648 consume_skb(skb); 649 xdp.rxq->mem = rq->xdp_mem; 650 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) 651 goto err_xdp; 652 *xdp_xmit |= VETH_XDP_REDIR; 653 rcu_read_unlock(); 654 goto xdp_xmit; 655 default: 656 bpf_warn_invalid_xdp_action(act); 657 /* fall through */ 658 case XDP_ABORTED: 659 trace_xdp_exception(rq->dev, xdp_prog, act); 660 /* fall through */ 661 case XDP_DROP: 662 goto drop; 663 } 664 rcu_read_unlock(); 665 666 delta = orig_data - xdp.data; 667 off = mac_len + delta; 668 if (off > 0) 669 __skb_push(skb, off); 670 else if (off < 0) 671 __skb_pull(skb, -off); 672 skb->mac_header -= delta; 673 off = xdp.data_end - orig_data_end; 674 if (off != 0) 675 __skb_put(skb, off); 676 skb->protocol = eth_type_trans(skb, rq->dev); 677 678 metalen = xdp.data - xdp.data_meta; 679 if (metalen) 680 skb_metadata_set(skb, metalen); 681 out: 682 return skb; 683 drop: 684 rcu_read_unlock(); 685 kfree_skb(skb); 686 return NULL; 687 err_xdp: 688 rcu_read_unlock(); 689 page_frag_free(xdp.data); 690 xdp_xmit: 691 return NULL; 692 } 693 694 static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit) 695 { 696 int i, done = 0, drops = 0, bytes = 0; 697 698 for (i = 0; i < budget; i++) { 699 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 700 unsigned int xdp_xmit_one = 0; 701 struct sk_buff *skb; 702 703 if (!ptr) 704 break; 705 706 if (veth_is_xdp_frame(ptr)) { 707 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 708 709 bytes += frame->len; 710 skb = veth_xdp_rcv_one(rq, frame, &xdp_xmit_one); 711 } else { 712 skb = ptr; 713 bytes += skb->len; 714 skb = veth_xdp_rcv_skb(rq, skb, &xdp_xmit_one); 715 } 716 *xdp_xmit |= xdp_xmit_one; 717 718 if (skb) 719 napi_gro_receive(&rq->xdp_napi, skb); 720 else if (!xdp_xmit_one) 721 drops++; 722 723 done++; 724 } 725 726 u64_stats_update_begin(&rq->stats.syncp); 727 rq->stats.xdp_packets += done; 728 rq->stats.xdp_bytes += bytes; 729 rq->stats.xdp_drops += drops; 730 u64_stats_update_end(&rq->stats.syncp); 731 732 return done; 733 } 734 735 static int veth_poll(struct napi_struct *napi, int budget) 736 { 737 struct veth_rq *rq = 738 container_of(napi, struct veth_rq, xdp_napi); 739 unsigned int xdp_xmit = 0; 740 int done; 741 742 xdp_set_return_frame_no_direct(); 743 done = veth_xdp_rcv(rq, budget, &xdp_xmit); 744 745 if (done < budget && napi_complete_done(napi, done)) { 746 /* Write rx_notify_masked before reading ptr_ring */ 747 smp_store_mb(rq->rx_notify_masked, false); 748 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 749 rq->rx_notify_masked = true; 750 napi_schedule(&rq->xdp_napi); 751 } 752 } 753 754 if (xdp_xmit & VETH_XDP_TX) 755 veth_xdp_flush(rq->dev); 756 if (xdp_xmit & VETH_XDP_REDIR) 757 xdp_do_flush_map(); 758 xdp_clear_return_frame_no_direct(); 759 760 return done; 761 } 762 763 static int veth_napi_add(struct net_device *dev) 764 { 765 struct veth_priv *priv = netdev_priv(dev); 766 int err, i; 767 768 for (i = 0; i < dev->real_num_rx_queues; i++) { 769 struct veth_rq *rq = &priv->rq[i]; 770 771 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 772 if (err) 773 goto err_xdp_ring; 774 } 775 776 for (i = 0; i < dev->real_num_rx_queues; i++) { 777 struct veth_rq *rq = &priv->rq[i]; 778 779 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 780 napi_enable(&rq->xdp_napi); 781 } 782 783 return 0; 784 err_xdp_ring: 785 for (i--; i >= 0; i--) 786 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 787 788 return err; 789 } 790 791 static void veth_napi_del(struct net_device *dev) 792 { 793 struct veth_priv *priv = netdev_priv(dev); 794 int i; 795 796 for (i = 0; i < dev->real_num_rx_queues; i++) { 797 struct veth_rq *rq = &priv->rq[i]; 798 799 napi_disable(&rq->xdp_napi); 800 napi_hash_del(&rq->xdp_napi); 801 } 802 synchronize_net(); 803 804 for (i = 0; i < dev->real_num_rx_queues; i++) { 805 struct veth_rq *rq = &priv->rq[i]; 806 807 netif_napi_del(&rq->xdp_napi); 808 rq->rx_notify_masked = false; 809 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 810 } 811 } 812 813 static int veth_enable_xdp(struct net_device *dev) 814 { 815 struct veth_priv *priv = netdev_priv(dev); 816 int err, i; 817 818 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 819 for (i = 0; i < dev->real_num_rx_queues; i++) { 820 struct veth_rq *rq = &priv->rq[i]; 821 822 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); 823 if (err < 0) 824 goto err_rxq_reg; 825 826 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 827 MEM_TYPE_PAGE_SHARED, 828 NULL); 829 if (err < 0) 830 goto err_reg_mem; 831 832 /* Save original mem info as it can be overwritten */ 833 rq->xdp_mem = rq->xdp_rxq.mem; 834 } 835 836 err = veth_napi_add(dev); 837 if (err) 838 goto err_rxq_reg; 839 } 840 841 for (i = 0; i < dev->real_num_rx_queues; i++) 842 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 843 844 return 0; 845 err_reg_mem: 846 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 847 err_rxq_reg: 848 for (i--; i >= 0; i--) 849 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 850 851 return err; 852 } 853 854 static void veth_disable_xdp(struct net_device *dev) 855 { 856 struct veth_priv *priv = netdev_priv(dev); 857 int i; 858 859 for (i = 0; i < dev->real_num_rx_queues; i++) 860 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 861 veth_napi_del(dev); 862 for (i = 0; i < dev->real_num_rx_queues; i++) { 863 struct veth_rq *rq = &priv->rq[i]; 864 865 rq->xdp_rxq.mem = rq->xdp_mem; 866 xdp_rxq_info_unreg(&rq->xdp_rxq); 867 } 868 } 869 870 static int veth_open(struct net_device *dev) 871 { 872 struct veth_priv *priv = netdev_priv(dev); 873 struct net_device *peer = rtnl_dereference(priv->peer); 874 int err; 875 876 if (!peer) 877 return -ENOTCONN; 878 879 if (priv->_xdp_prog) { 880 err = veth_enable_xdp(dev); 881 if (err) 882 return err; 883 } 884 885 if (peer->flags & IFF_UP) { 886 netif_carrier_on(dev); 887 netif_carrier_on(peer); 888 } 889 890 return 0; 891 } 892 893 static int veth_close(struct net_device *dev) 894 { 895 struct veth_priv *priv = netdev_priv(dev); 896 struct net_device *peer = rtnl_dereference(priv->peer); 897 898 netif_carrier_off(dev); 899 if (peer) 900 netif_carrier_off(peer); 901 902 if (priv->_xdp_prog) 903 veth_disable_xdp(dev); 904 905 return 0; 906 } 907 908 static int is_valid_veth_mtu(int mtu) 909 { 910 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 911 } 912 913 static int veth_alloc_queues(struct net_device *dev) 914 { 915 struct veth_priv *priv = netdev_priv(dev); 916 int i; 917 918 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); 919 if (!priv->rq) 920 return -ENOMEM; 921 922 for (i = 0; i < dev->num_rx_queues; i++) { 923 priv->rq[i].dev = dev; 924 u64_stats_init(&priv->rq[i].stats.syncp); 925 } 926 927 return 0; 928 } 929 930 static void veth_free_queues(struct net_device *dev) 931 { 932 struct veth_priv *priv = netdev_priv(dev); 933 934 kfree(priv->rq); 935 } 936 937 static int veth_dev_init(struct net_device *dev) 938 { 939 int err; 940 941 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 942 if (!dev->lstats) 943 return -ENOMEM; 944 945 err = veth_alloc_queues(dev); 946 if (err) { 947 free_percpu(dev->lstats); 948 return err; 949 } 950 951 return 0; 952 } 953 954 static void veth_dev_free(struct net_device *dev) 955 { 956 veth_free_queues(dev); 957 free_percpu(dev->lstats); 958 } 959 960 #ifdef CONFIG_NET_POLL_CONTROLLER 961 static void veth_poll_controller(struct net_device *dev) 962 { 963 /* veth only receives frames when its peer sends one 964 * Since it has nothing to do with disabling irqs, we are guaranteed 965 * never to have pending data when we poll for it so 966 * there is nothing to do here. 967 * 968 * We need this though so netpoll recognizes us as an interface that 969 * supports polling, which enables bridge devices in virt setups to 970 * still use netconsole 971 */ 972 } 973 #endif /* CONFIG_NET_POLL_CONTROLLER */ 974 975 static int veth_get_iflink(const struct net_device *dev) 976 { 977 struct veth_priv *priv = netdev_priv(dev); 978 struct net_device *peer; 979 int iflink; 980 981 rcu_read_lock(); 982 peer = rcu_dereference(priv->peer); 983 iflink = peer ? peer->ifindex : 0; 984 rcu_read_unlock(); 985 986 return iflink; 987 } 988 989 static netdev_features_t veth_fix_features(struct net_device *dev, 990 netdev_features_t features) 991 { 992 struct veth_priv *priv = netdev_priv(dev); 993 struct net_device *peer; 994 995 peer = rtnl_dereference(priv->peer); 996 if (peer) { 997 struct veth_priv *peer_priv = netdev_priv(peer); 998 999 if (peer_priv->_xdp_prog) 1000 features &= ~NETIF_F_GSO_SOFTWARE; 1001 } 1002 1003 return features; 1004 } 1005 1006 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1007 { 1008 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1009 struct net_device *peer; 1010 1011 if (new_hr < 0) 1012 new_hr = 0; 1013 1014 rcu_read_lock(); 1015 peer = rcu_dereference(priv->peer); 1016 if (unlikely(!peer)) 1017 goto out; 1018 1019 peer_priv = netdev_priv(peer); 1020 priv->requested_headroom = new_hr; 1021 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1022 dev->needed_headroom = new_hr; 1023 peer->needed_headroom = new_hr; 1024 1025 out: 1026 rcu_read_unlock(); 1027 } 1028 1029 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1030 struct netlink_ext_ack *extack) 1031 { 1032 struct veth_priv *priv = netdev_priv(dev); 1033 struct bpf_prog *old_prog; 1034 struct net_device *peer; 1035 unsigned int max_mtu; 1036 int err; 1037 1038 old_prog = priv->_xdp_prog; 1039 priv->_xdp_prog = prog; 1040 peer = rtnl_dereference(priv->peer); 1041 1042 if (prog) { 1043 if (!peer) { 1044 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1045 err = -ENOTCONN; 1046 goto err; 1047 } 1048 1049 max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - 1050 peer->hard_header_len - 1051 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 1052 if (peer->mtu > max_mtu) { 1053 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1054 err = -ERANGE; 1055 goto err; 1056 } 1057 1058 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1059 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1060 err = -ENOSPC; 1061 goto err; 1062 } 1063 1064 if (dev->flags & IFF_UP) { 1065 err = veth_enable_xdp(dev); 1066 if (err) { 1067 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1068 goto err; 1069 } 1070 } 1071 1072 if (!old_prog) { 1073 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1074 peer->max_mtu = max_mtu; 1075 } 1076 } 1077 1078 if (old_prog) { 1079 if (!prog) { 1080 if (dev->flags & IFF_UP) 1081 veth_disable_xdp(dev); 1082 1083 if (peer) { 1084 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1085 peer->max_mtu = ETH_MAX_MTU; 1086 } 1087 } 1088 bpf_prog_put(old_prog); 1089 } 1090 1091 if ((!!old_prog ^ !!prog) && peer) 1092 netdev_update_features(peer); 1093 1094 return 0; 1095 err: 1096 priv->_xdp_prog = old_prog; 1097 1098 return err; 1099 } 1100 1101 static u32 veth_xdp_query(struct net_device *dev) 1102 { 1103 struct veth_priv *priv = netdev_priv(dev); 1104 const struct bpf_prog *xdp_prog; 1105 1106 xdp_prog = priv->_xdp_prog; 1107 if (xdp_prog) 1108 return xdp_prog->aux->id; 1109 1110 return 0; 1111 } 1112 1113 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1114 { 1115 switch (xdp->command) { 1116 case XDP_SETUP_PROG: 1117 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1118 case XDP_QUERY_PROG: 1119 xdp->prog_id = veth_xdp_query(dev); 1120 return 0; 1121 default: 1122 return -EINVAL; 1123 } 1124 } 1125 1126 static const struct net_device_ops veth_netdev_ops = { 1127 .ndo_init = veth_dev_init, 1128 .ndo_open = veth_open, 1129 .ndo_stop = veth_close, 1130 .ndo_start_xmit = veth_xmit, 1131 .ndo_get_stats64 = veth_get_stats64, 1132 .ndo_set_rx_mode = veth_set_multicast_list, 1133 .ndo_set_mac_address = eth_mac_addr, 1134 #ifdef CONFIG_NET_POLL_CONTROLLER 1135 .ndo_poll_controller = veth_poll_controller, 1136 #endif 1137 .ndo_get_iflink = veth_get_iflink, 1138 .ndo_fix_features = veth_fix_features, 1139 .ndo_features_check = passthru_features_check, 1140 .ndo_set_rx_headroom = veth_set_rx_headroom, 1141 .ndo_bpf = veth_xdp, 1142 .ndo_xdp_xmit = veth_xdp_xmit, 1143 }; 1144 1145 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1146 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1147 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1148 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1149 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1150 1151 static void veth_setup(struct net_device *dev) 1152 { 1153 ether_setup(dev); 1154 1155 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1156 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1157 dev->priv_flags |= IFF_NO_QUEUE; 1158 dev->priv_flags |= IFF_PHONY_HEADROOM; 1159 1160 dev->netdev_ops = &veth_netdev_ops; 1161 dev->ethtool_ops = &veth_ethtool_ops; 1162 dev->features |= NETIF_F_LLTX; 1163 dev->features |= VETH_FEATURES; 1164 dev->vlan_features = dev->features & 1165 ~(NETIF_F_HW_VLAN_CTAG_TX | 1166 NETIF_F_HW_VLAN_STAG_TX | 1167 NETIF_F_HW_VLAN_CTAG_RX | 1168 NETIF_F_HW_VLAN_STAG_RX); 1169 dev->needs_free_netdev = true; 1170 dev->priv_destructor = veth_dev_free; 1171 dev->max_mtu = ETH_MAX_MTU; 1172 1173 dev->hw_features = VETH_FEATURES; 1174 dev->hw_enc_features = VETH_FEATURES; 1175 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1176 } 1177 1178 /* 1179 * netlink interface 1180 */ 1181 1182 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1183 struct netlink_ext_ack *extack) 1184 { 1185 if (tb[IFLA_ADDRESS]) { 1186 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1187 return -EINVAL; 1188 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1189 return -EADDRNOTAVAIL; 1190 } 1191 if (tb[IFLA_MTU]) { 1192 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1193 return -EINVAL; 1194 } 1195 return 0; 1196 } 1197 1198 static struct rtnl_link_ops veth_link_ops; 1199 1200 static int veth_newlink(struct net *src_net, struct net_device *dev, 1201 struct nlattr *tb[], struct nlattr *data[], 1202 struct netlink_ext_ack *extack) 1203 { 1204 int err; 1205 struct net_device *peer; 1206 struct veth_priv *priv; 1207 char ifname[IFNAMSIZ]; 1208 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1209 unsigned char name_assign_type; 1210 struct ifinfomsg *ifmp; 1211 struct net *net; 1212 1213 /* 1214 * create and register peer first 1215 */ 1216 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1217 struct nlattr *nla_peer; 1218 1219 nla_peer = data[VETH_INFO_PEER]; 1220 ifmp = nla_data(nla_peer); 1221 err = rtnl_nla_parse_ifla(peer_tb, 1222 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1223 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1224 NULL); 1225 if (err < 0) 1226 return err; 1227 1228 err = veth_validate(peer_tb, NULL, extack); 1229 if (err < 0) 1230 return err; 1231 1232 tbp = peer_tb; 1233 } else { 1234 ifmp = NULL; 1235 tbp = tb; 1236 } 1237 1238 if (ifmp && tbp[IFLA_IFNAME]) { 1239 nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1240 name_assign_type = NET_NAME_USER; 1241 } else { 1242 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1243 name_assign_type = NET_NAME_ENUM; 1244 } 1245 1246 net = rtnl_link_get_net(src_net, tbp); 1247 if (IS_ERR(net)) 1248 return PTR_ERR(net); 1249 1250 peer = rtnl_create_link(net, ifname, name_assign_type, 1251 &veth_link_ops, tbp, extack); 1252 if (IS_ERR(peer)) { 1253 put_net(net); 1254 return PTR_ERR(peer); 1255 } 1256 1257 if (!ifmp || !tbp[IFLA_ADDRESS]) 1258 eth_hw_addr_random(peer); 1259 1260 if (ifmp && (dev->ifindex != 0)) 1261 peer->ifindex = ifmp->ifi_index; 1262 1263 peer->gso_max_size = dev->gso_max_size; 1264 peer->gso_max_segs = dev->gso_max_segs; 1265 1266 err = register_netdevice(peer); 1267 put_net(net); 1268 net = NULL; 1269 if (err < 0) 1270 goto err_register_peer; 1271 1272 netif_carrier_off(peer); 1273 1274 err = rtnl_configure_link(peer, ifmp); 1275 if (err < 0) 1276 goto err_configure_peer; 1277 1278 /* 1279 * register dev last 1280 * 1281 * note, that since we've registered new device the dev's name 1282 * should be re-allocated 1283 */ 1284 1285 if (tb[IFLA_ADDRESS] == NULL) 1286 eth_hw_addr_random(dev); 1287 1288 if (tb[IFLA_IFNAME]) 1289 nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1290 else 1291 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1292 1293 err = register_netdevice(dev); 1294 if (err < 0) 1295 goto err_register_dev; 1296 1297 netif_carrier_off(dev); 1298 1299 /* 1300 * tie the deviced together 1301 */ 1302 1303 priv = netdev_priv(dev); 1304 rcu_assign_pointer(priv->peer, peer); 1305 1306 priv = netdev_priv(peer); 1307 rcu_assign_pointer(priv->peer, dev); 1308 1309 return 0; 1310 1311 err_register_dev: 1312 /* nothing to do */ 1313 err_configure_peer: 1314 unregister_netdevice(peer); 1315 return err; 1316 1317 err_register_peer: 1318 free_netdev(peer); 1319 return err; 1320 } 1321 1322 static void veth_dellink(struct net_device *dev, struct list_head *head) 1323 { 1324 struct veth_priv *priv; 1325 struct net_device *peer; 1326 1327 priv = netdev_priv(dev); 1328 peer = rtnl_dereference(priv->peer); 1329 1330 /* Note : dellink() is called from default_device_exit_batch(), 1331 * before a rcu_synchronize() point. The devices are guaranteed 1332 * not being freed before one RCU grace period. 1333 */ 1334 RCU_INIT_POINTER(priv->peer, NULL); 1335 unregister_netdevice_queue(dev, head); 1336 1337 if (peer) { 1338 priv = netdev_priv(peer); 1339 RCU_INIT_POINTER(priv->peer, NULL); 1340 unregister_netdevice_queue(peer, head); 1341 } 1342 } 1343 1344 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1345 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1346 }; 1347 1348 static struct net *veth_get_link_net(const struct net_device *dev) 1349 { 1350 struct veth_priv *priv = netdev_priv(dev); 1351 struct net_device *peer = rtnl_dereference(priv->peer); 1352 1353 return peer ? dev_net(peer) : dev_net(dev); 1354 } 1355 1356 static struct rtnl_link_ops veth_link_ops = { 1357 .kind = DRV_NAME, 1358 .priv_size = sizeof(struct veth_priv), 1359 .setup = veth_setup, 1360 .validate = veth_validate, 1361 .newlink = veth_newlink, 1362 .dellink = veth_dellink, 1363 .policy = veth_policy, 1364 .maxtype = VETH_INFO_MAX, 1365 .get_link_net = veth_get_link_net, 1366 }; 1367 1368 /* 1369 * init/fini 1370 */ 1371 1372 static __init int veth_init(void) 1373 { 1374 return rtnl_link_register(&veth_link_ops); 1375 } 1376 1377 static __exit void veth_exit(void) 1378 { 1379 rtnl_link_unregister(&veth_link_ops); 1380 } 1381 1382 module_init(veth_init); 1383 module_exit(veth_exit); 1384 1385 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1386 MODULE_LICENSE("GPL v2"); 1387 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1388