1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/xfrm.h> 21 #include <net/xdp.h> 22 #include <linux/veth.h> 23 #include <linux/module.h> 24 #include <linux/bpf.h> 25 #include <linux/filter.h> 26 #include <linux/ptr_ring.h> 27 #include <linux/bpf_trace.h> 28 #include <linux/net_tstamp.h> 29 30 #define DRV_NAME "veth" 31 #define DRV_VERSION "1.0" 32 33 #define VETH_XDP_FLAG BIT(0) 34 #define VETH_RING_SIZE 256 35 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 36 37 #define VETH_XDP_TX_BULK_SIZE 16 38 #define VETH_XDP_BATCH 16 39 40 struct veth_stats { 41 u64 rx_drops; 42 /* xdp */ 43 u64 xdp_packets; 44 u64 xdp_bytes; 45 u64 xdp_redirect; 46 u64 xdp_drops; 47 u64 xdp_tx; 48 u64 xdp_tx_err; 49 u64 peer_tq_xdp_xmit; 50 u64 peer_tq_xdp_xmit_err; 51 }; 52 53 struct veth_rq_stats { 54 struct veth_stats vs; 55 struct u64_stats_sync syncp; 56 }; 57 58 struct veth_rq { 59 struct napi_struct xdp_napi; 60 struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ 61 struct net_device *dev; 62 struct bpf_prog __rcu *xdp_prog; 63 struct xdp_mem_info xdp_mem; 64 struct veth_rq_stats stats; 65 bool rx_notify_masked; 66 struct ptr_ring xdp_ring; 67 struct xdp_rxq_info xdp_rxq; 68 }; 69 70 struct veth_priv { 71 struct net_device __rcu *peer; 72 atomic64_t dropped; 73 struct bpf_prog *_xdp_prog; 74 struct veth_rq *rq; 75 unsigned int requested_headroom; 76 }; 77 78 struct veth_xdp_tx_bq { 79 struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; 80 unsigned int count; 81 }; 82 83 /* 84 * ethtool interface 85 */ 86 87 struct veth_q_stat_desc { 88 char desc[ETH_GSTRING_LEN]; 89 size_t offset; 90 }; 91 92 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 93 94 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 95 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 96 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 97 { "drops", VETH_RQ_STAT(rx_drops) }, 98 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 99 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 100 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 101 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 102 }; 103 104 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 105 106 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 107 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 108 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 109 }; 110 111 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 112 113 static struct { 114 const char string[ETH_GSTRING_LEN]; 115 } ethtool_stats_keys[] = { 116 { "peer_ifindex" }, 117 }; 118 119 static int veth_get_link_ksettings(struct net_device *dev, 120 struct ethtool_link_ksettings *cmd) 121 { 122 cmd->base.speed = SPEED_10000; 123 cmd->base.duplex = DUPLEX_FULL; 124 cmd->base.port = PORT_TP; 125 cmd->base.autoneg = AUTONEG_DISABLE; 126 return 0; 127 } 128 129 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 130 { 131 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 132 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 133 } 134 135 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 136 { 137 char *p = (char *)buf; 138 int i, j; 139 140 switch(stringset) { 141 case ETH_SS_STATS: 142 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 143 p += sizeof(ethtool_stats_keys); 144 for (i = 0; i < dev->real_num_rx_queues; i++) { 145 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 146 snprintf(p, ETH_GSTRING_LEN, 147 "rx_queue_%u_%.18s", 148 i, veth_rq_stats_desc[j].desc); 149 p += ETH_GSTRING_LEN; 150 } 151 } 152 for (i = 0; i < dev->real_num_tx_queues; i++) { 153 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 154 snprintf(p, ETH_GSTRING_LEN, 155 "tx_queue_%u_%.18s", 156 i, veth_tq_stats_desc[j].desc); 157 p += ETH_GSTRING_LEN; 158 } 159 } 160 break; 161 } 162 } 163 164 static int veth_get_sset_count(struct net_device *dev, int sset) 165 { 166 switch (sset) { 167 case ETH_SS_STATS: 168 return ARRAY_SIZE(ethtool_stats_keys) + 169 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 170 VETH_TQ_STATS_LEN * dev->real_num_tx_queues; 171 default: 172 return -EOPNOTSUPP; 173 } 174 } 175 176 static void veth_get_ethtool_stats(struct net_device *dev, 177 struct ethtool_stats *stats, u64 *data) 178 { 179 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 180 struct net_device *peer = rtnl_dereference(priv->peer); 181 int i, j, idx; 182 183 data[0] = peer ? peer->ifindex : 0; 184 idx = 1; 185 for (i = 0; i < dev->real_num_rx_queues; i++) { 186 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 187 const void *stats_base = (void *)&rq_stats->vs; 188 unsigned int start; 189 size_t offset; 190 191 do { 192 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 193 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 194 offset = veth_rq_stats_desc[j].offset; 195 data[idx + j] = *(u64 *)(stats_base + offset); 196 } 197 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 198 idx += VETH_RQ_STATS_LEN; 199 } 200 201 if (!peer) 202 return; 203 204 rcv_priv = netdev_priv(peer); 205 for (i = 0; i < peer->real_num_rx_queues; i++) { 206 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 207 const void *base = (void *)&rq_stats->vs; 208 unsigned int start, tx_idx = idx; 209 size_t offset; 210 211 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 212 do { 213 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 214 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 215 offset = veth_tq_stats_desc[j].offset; 216 data[tx_idx + j] += *(u64 *)(base + offset); 217 } 218 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 219 } 220 } 221 222 static void veth_get_channels(struct net_device *dev, 223 struct ethtool_channels *channels) 224 { 225 channels->tx_count = dev->real_num_tx_queues; 226 channels->rx_count = dev->real_num_rx_queues; 227 channels->max_tx = dev->real_num_tx_queues; 228 channels->max_rx = dev->real_num_rx_queues; 229 channels->combined_count = min(dev->real_num_rx_queues, dev->real_num_tx_queues); 230 channels->max_combined = min(dev->real_num_rx_queues, dev->real_num_tx_queues); 231 } 232 233 static const struct ethtool_ops veth_ethtool_ops = { 234 .get_drvinfo = veth_get_drvinfo, 235 .get_link = ethtool_op_get_link, 236 .get_strings = veth_get_strings, 237 .get_sset_count = veth_get_sset_count, 238 .get_ethtool_stats = veth_get_ethtool_stats, 239 .get_link_ksettings = veth_get_link_ksettings, 240 .get_ts_info = ethtool_op_get_ts_info, 241 .get_channels = veth_get_channels, 242 }; 243 244 /* general routines */ 245 246 static bool veth_is_xdp_frame(void *ptr) 247 { 248 return (unsigned long)ptr & VETH_XDP_FLAG; 249 } 250 251 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 252 { 253 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 254 } 255 256 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 257 { 258 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 259 } 260 261 static void veth_ptr_free(void *ptr) 262 { 263 if (veth_is_xdp_frame(ptr)) 264 xdp_return_frame(veth_ptr_to_xdp(ptr)); 265 else 266 kfree_skb(ptr); 267 } 268 269 static void __veth_xdp_flush(struct veth_rq *rq) 270 { 271 /* Write ptr_ring before reading rx_notify_masked */ 272 smp_mb(); 273 if (!rq->rx_notify_masked) { 274 rq->rx_notify_masked = true; 275 napi_schedule(&rq->xdp_napi); 276 } 277 } 278 279 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 280 { 281 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 282 dev_kfree_skb_any(skb); 283 return NET_RX_DROP; 284 } 285 286 return NET_RX_SUCCESS; 287 } 288 289 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 290 struct veth_rq *rq, bool xdp) 291 { 292 return __dev_forward_skb(dev, skb) ?: xdp ? 293 veth_xdp_rx(rq, skb) : 294 netif_rx(skb); 295 } 296 297 /* return true if the specified skb has chances of GRO aggregation 298 * Don't strive for accuracy, but try to avoid GRO overhead in the most 299 * common scenarios. 300 * When XDP is enabled, all traffic is considered eligible, as the xmit 301 * device has TSO off. 302 * When TSO is enabled on the xmit device, we are likely interested only 303 * in UDP aggregation, explicitly check for that if the skb is suspected 304 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - 305 * to belong to locally generated UDP traffic. 306 */ 307 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, 308 const struct net_device *rcv, 309 const struct sk_buff *skb) 310 { 311 return !(dev->features & NETIF_F_ALL_TSO) || 312 (skb->destructor == sock_wfree && 313 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); 314 } 315 316 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 317 { 318 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 319 struct veth_rq *rq = NULL; 320 struct net_device *rcv; 321 int length = skb->len; 322 bool use_napi = false; 323 int rxq; 324 325 rcu_read_lock(); 326 rcv = rcu_dereference(priv->peer); 327 if (unlikely(!rcv)) { 328 kfree_skb(skb); 329 goto drop; 330 } 331 332 rcv_priv = netdev_priv(rcv); 333 rxq = skb_get_queue_mapping(skb); 334 if (rxq < rcv->real_num_rx_queues) { 335 rq = &rcv_priv->rq[rxq]; 336 337 /* The napi pointer is available when an XDP program is 338 * attached or when GRO is enabled 339 * Don't bother with napi/GRO if the skb can't be aggregated 340 */ 341 use_napi = rcu_access_pointer(rq->napi) && 342 veth_skb_is_eligible_for_gro(dev, rcv, skb); 343 skb_record_rx_queue(skb, rxq); 344 } 345 346 skb_tx_timestamp(skb); 347 if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { 348 if (!use_napi) 349 dev_lstats_add(dev, length); 350 } else { 351 drop: 352 atomic64_inc(&priv->dropped); 353 } 354 355 if (use_napi) 356 __veth_xdp_flush(rq); 357 358 rcu_read_unlock(); 359 360 return NETDEV_TX_OK; 361 } 362 363 static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) 364 { 365 struct veth_priv *priv = netdev_priv(dev); 366 367 dev_lstats_read(dev, packets, bytes); 368 return atomic64_read(&priv->dropped); 369 } 370 371 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 372 { 373 struct veth_priv *priv = netdev_priv(dev); 374 int i; 375 376 result->peer_tq_xdp_xmit_err = 0; 377 result->xdp_packets = 0; 378 result->xdp_tx_err = 0; 379 result->xdp_bytes = 0; 380 result->rx_drops = 0; 381 for (i = 0; i < dev->num_rx_queues; i++) { 382 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 383 struct veth_rq_stats *stats = &priv->rq[i].stats; 384 unsigned int start; 385 386 do { 387 start = u64_stats_fetch_begin_irq(&stats->syncp); 388 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 389 xdp_tx_err = stats->vs.xdp_tx_err; 390 packets = stats->vs.xdp_packets; 391 bytes = stats->vs.xdp_bytes; 392 drops = stats->vs.rx_drops; 393 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 394 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 395 result->xdp_tx_err += xdp_tx_err; 396 result->xdp_packets += packets; 397 result->xdp_bytes += bytes; 398 result->rx_drops += drops; 399 } 400 } 401 402 static void veth_get_stats64(struct net_device *dev, 403 struct rtnl_link_stats64 *tot) 404 { 405 struct veth_priv *priv = netdev_priv(dev); 406 struct net_device *peer; 407 struct veth_stats rx; 408 u64 packets, bytes; 409 410 tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); 411 tot->tx_bytes = bytes; 412 tot->tx_packets = packets; 413 414 veth_stats_rx(&rx, dev); 415 tot->tx_dropped += rx.xdp_tx_err; 416 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 417 tot->rx_bytes = rx.xdp_bytes; 418 tot->rx_packets = rx.xdp_packets; 419 420 rcu_read_lock(); 421 peer = rcu_dereference(priv->peer); 422 if (peer) { 423 veth_stats_tx(peer, &packets, &bytes); 424 tot->rx_bytes += bytes; 425 tot->rx_packets += packets; 426 427 veth_stats_rx(&rx, peer); 428 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 429 tot->rx_dropped += rx.xdp_tx_err; 430 tot->tx_bytes += rx.xdp_bytes; 431 tot->tx_packets += rx.xdp_packets; 432 } 433 rcu_read_unlock(); 434 } 435 436 /* fake multicast ability */ 437 static void veth_set_multicast_list(struct net_device *dev) 438 { 439 } 440 441 static struct sk_buff *veth_build_skb(void *head, int headroom, int len, 442 int buflen) 443 { 444 struct sk_buff *skb; 445 446 skb = build_skb(head, buflen); 447 if (!skb) 448 return NULL; 449 450 skb_reserve(skb, headroom); 451 skb_put(skb, len); 452 453 return skb; 454 } 455 456 static int veth_select_rxq(struct net_device *dev) 457 { 458 return smp_processor_id() % dev->real_num_rx_queues; 459 } 460 461 static struct net_device *veth_peer_dev(struct net_device *dev) 462 { 463 struct veth_priv *priv = netdev_priv(dev); 464 465 /* Callers must be under RCU read side. */ 466 return rcu_dereference(priv->peer); 467 } 468 469 static int veth_xdp_xmit(struct net_device *dev, int n, 470 struct xdp_frame **frames, 471 u32 flags, bool ndo_xmit) 472 { 473 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 474 int i, ret = -ENXIO, nxmit = 0; 475 struct net_device *rcv; 476 unsigned int max_len; 477 struct veth_rq *rq; 478 479 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 480 return -EINVAL; 481 482 rcu_read_lock(); 483 rcv = rcu_dereference(priv->peer); 484 if (unlikely(!rcv)) 485 goto out; 486 487 rcv_priv = netdev_priv(rcv); 488 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 489 /* The napi pointer is set if NAPI is enabled, which ensures that 490 * xdp_ring is initialized on receive side and the peer device is up. 491 */ 492 if (!rcu_access_pointer(rq->napi)) 493 goto out; 494 495 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 496 497 spin_lock(&rq->xdp_ring.producer_lock); 498 for (i = 0; i < n; i++) { 499 struct xdp_frame *frame = frames[i]; 500 void *ptr = veth_xdp_to_ptr(frame); 501 502 if (unlikely(frame->len > max_len || 503 __ptr_ring_produce(&rq->xdp_ring, ptr))) 504 break; 505 nxmit++; 506 } 507 spin_unlock(&rq->xdp_ring.producer_lock); 508 509 if (flags & XDP_XMIT_FLUSH) 510 __veth_xdp_flush(rq); 511 512 ret = nxmit; 513 if (ndo_xmit) { 514 u64_stats_update_begin(&rq->stats.syncp); 515 rq->stats.vs.peer_tq_xdp_xmit += nxmit; 516 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; 517 u64_stats_update_end(&rq->stats.syncp); 518 } 519 520 out: 521 rcu_read_unlock(); 522 523 return ret; 524 } 525 526 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 527 struct xdp_frame **frames, u32 flags) 528 { 529 int err; 530 531 err = veth_xdp_xmit(dev, n, frames, flags, true); 532 if (err < 0) { 533 struct veth_priv *priv = netdev_priv(dev); 534 535 atomic64_add(n, &priv->dropped); 536 } 537 538 return err; 539 } 540 541 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 542 { 543 int sent, i, err = 0, drops; 544 545 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 546 if (sent < 0) { 547 err = sent; 548 sent = 0; 549 } 550 551 for (i = sent; unlikely(i < bq->count); i++) 552 xdp_return_frame(bq->q[i]); 553 554 drops = bq->count - sent; 555 trace_xdp_bulk_tx(rq->dev, sent, drops, err); 556 557 u64_stats_update_begin(&rq->stats.syncp); 558 rq->stats.vs.xdp_tx += sent; 559 rq->stats.vs.xdp_tx_err += drops; 560 u64_stats_update_end(&rq->stats.syncp); 561 562 bq->count = 0; 563 } 564 565 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 566 { 567 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 568 struct net_device *rcv; 569 struct veth_rq *rcv_rq; 570 571 rcu_read_lock(); 572 veth_xdp_flush_bq(rq, bq); 573 rcv = rcu_dereference(priv->peer); 574 if (unlikely(!rcv)) 575 goto out; 576 577 rcv_priv = netdev_priv(rcv); 578 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 579 /* xdp_ring is initialized on receive side? */ 580 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 581 goto out; 582 583 __veth_xdp_flush(rcv_rq); 584 out: 585 rcu_read_unlock(); 586 } 587 588 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 589 struct veth_xdp_tx_bq *bq) 590 { 591 struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); 592 593 if (unlikely(!frame)) 594 return -EOVERFLOW; 595 596 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 597 veth_xdp_flush_bq(rq, bq); 598 599 bq->q[bq->count++] = frame; 600 601 return 0; 602 } 603 604 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, 605 struct xdp_frame *frame, 606 struct veth_xdp_tx_bq *bq, 607 struct veth_stats *stats) 608 { 609 struct xdp_frame orig_frame; 610 struct bpf_prog *xdp_prog; 611 612 rcu_read_lock(); 613 xdp_prog = rcu_dereference(rq->xdp_prog); 614 if (likely(xdp_prog)) { 615 struct xdp_buff xdp; 616 u32 act; 617 618 xdp_convert_frame_to_buff(frame, &xdp); 619 xdp.rxq = &rq->xdp_rxq; 620 621 act = bpf_prog_run_xdp(xdp_prog, &xdp); 622 623 switch (act) { 624 case XDP_PASS: 625 if (xdp_update_frame_from_buff(&xdp, frame)) 626 goto err_xdp; 627 break; 628 case XDP_TX: 629 orig_frame = *frame; 630 xdp.rxq->mem = frame->mem; 631 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 632 trace_xdp_exception(rq->dev, xdp_prog, act); 633 frame = &orig_frame; 634 stats->rx_drops++; 635 goto err_xdp; 636 } 637 stats->xdp_tx++; 638 rcu_read_unlock(); 639 goto xdp_xmit; 640 case XDP_REDIRECT: 641 orig_frame = *frame; 642 xdp.rxq->mem = frame->mem; 643 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 644 frame = &orig_frame; 645 stats->rx_drops++; 646 goto err_xdp; 647 } 648 stats->xdp_redirect++; 649 rcu_read_unlock(); 650 goto xdp_xmit; 651 default: 652 bpf_warn_invalid_xdp_action(act); 653 fallthrough; 654 case XDP_ABORTED: 655 trace_xdp_exception(rq->dev, xdp_prog, act); 656 fallthrough; 657 case XDP_DROP: 658 stats->xdp_drops++; 659 goto err_xdp; 660 } 661 } 662 rcu_read_unlock(); 663 664 return frame; 665 err_xdp: 666 rcu_read_unlock(); 667 xdp_return_frame(frame); 668 xdp_xmit: 669 return NULL; 670 } 671 672 /* frames array contains VETH_XDP_BATCH at most */ 673 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, 674 int n_xdpf, struct veth_xdp_tx_bq *bq, 675 struct veth_stats *stats) 676 { 677 void *skbs[VETH_XDP_BATCH]; 678 int i; 679 680 if (xdp_alloc_skb_bulk(skbs, n_xdpf, 681 GFP_ATOMIC | __GFP_ZERO) < 0) { 682 for (i = 0; i < n_xdpf; i++) 683 xdp_return_frame(frames[i]); 684 stats->rx_drops += n_xdpf; 685 686 return; 687 } 688 689 for (i = 0; i < n_xdpf; i++) { 690 struct sk_buff *skb = skbs[i]; 691 692 skb = __xdp_build_skb_from_frame(frames[i], skb, 693 rq->dev); 694 if (!skb) { 695 xdp_return_frame(frames[i]); 696 stats->rx_drops++; 697 continue; 698 } 699 napi_gro_receive(&rq->xdp_napi, skb); 700 } 701 } 702 703 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 704 struct sk_buff *skb, 705 struct veth_xdp_tx_bq *bq, 706 struct veth_stats *stats) 707 { 708 u32 pktlen, headroom, act, metalen, frame_sz; 709 void *orig_data, *orig_data_end; 710 struct bpf_prog *xdp_prog; 711 int mac_len, delta, off; 712 struct xdp_buff xdp; 713 714 skb_orphan_partial(skb); 715 716 rcu_read_lock(); 717 xdp_prog = rcu_dereference(rq->xdp_prog); 718 if (unlikely(!xdp_prog)) { 719 rcu_read_unlock(); 720 goto out; 721 } 722 723 mac_len = skb->data - skb_mac_header(skb); 724 pktlen = skb->len + mac_len; 725 headroom = skb_headroom(skb) - mac_len; 726 727 if (skb_shared(skb) || skb_head_is_locked(skb) || 728 skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { 729 struct sk_buff *nskb; 730 int size, head_off; 731 void *head, *start; 732 struct page *page; 733 734 size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + 735 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 736 if (size > PAGE_SIZE) 737 goto drop; 738 739 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 740 if (!page) 741 goto drop; 742 743 head = page_address(page); 744 start = head + VETH_XDP_HEADROOM; 745 if (skb_copy_bits(skb, -mac_len, start, pktlen)) { 746 page_frag_free(head); 747 goto drop; 748 } 749 750 nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len, 751 skb->len, PAGE_SIZE); 752 if (!nskb) { 753 page_frag_free(head); 754 goto drop; 755 } 756 757 skb_copy_header(nskb, skb); 758 head_off = skb_headroom(nskb) - skb_headroom(skb); 759 skb_headers_offset_update(nskb, head_off); 760 consume_skb(skb); 761 skb = nskb; 762 } 763 764 /* SKB "head" area always have tailroom for skb_shared_info */ 765 frame_sz = skb_end_pointer(skb) - skb->head; 766 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 767 xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq); 768 xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true); 769 770 orig_data = xdp.data; 771 orig_data_end = xdp.data_end; 772 773 act = bpf_prog_run_xdp(xdp_prog, &xdp); 774 775 switch (act) { 776 case XDP_PASS: 777 break; 778 case XDP_TX: 779 get_page(virt_to_page(xdp.data)); 780 consume_skb(skb); 781 xdp.rxq->mem = rq->xdp_mem; 782 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 783 trace_xdp_exception(rq->dev, xdp_prog, act); 784 stats->rx_drops++; 785 goto err_xdp; 786 } 787 stats->xdp_tx++; 788 rcu_read_unlock(); 789 goto xdp_xmit; 790 case XDP_REDIRECT: 791 get_page(virt_to_page(xdp.data)); 792 consume_skb(skb); 793 xdp.rxq->mem = rq->xdp_mem; 794 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 795 stats->rx_drops++; 796 goto err_xdp; 797 } 798 stats->xdp_redirect++; 799 rcu_read_unlock(); 800 goto xdp_xmit; 801 default: 802 bpf_warn_invalid_xdp_action(act); 803 fallthrough; 804 case XDP_ABORTED: 805 trace_xdp_exception(rq->dev, xdp_prog, act); 806 fallthrough; 807 case XDP_DROP: 808 stats->xdp_drops++; 809 goto xdp_drop; 810 } 811 rcu_read_unlock(); 812 813 /* check if bpf_xdp_adjust_head was used */ 814 delta = orig_data - xdp.data; 815 off = mac_len + delta; 816 if (off > 0) 817 __skb_push(skb, off); 818 else if (off < 0) 819 __skb_pull(skb, -off); 820 skb->mac_header -= delta; 821 822 /* check if bpf_xdp_adjust_tail was used */ 823 off = xdp.data_end - orig_data_end; 824 if (off != 0) 825 __skb_put(skb, off); /* positive on grow, negative on shrink */ 826 skb->protocol = eth_type_trans(skb, rq->dev); 827 828 metalen = xdp.data - xdp.data_meta; 829 if (metalen) 830 skb_metadata_set(skb, metalen); 831 out: 832 return skb; 833 drop: 834 stats->rx_drops++; 835 xdp_drop: 836 rcu_read_unlock(); 837 kfree_skb(skb); 838 return NULL; 839 err_xdp: 840 rcu_read_unlock(); 841 page_frag_free(xdp.data); 842 xdp_xmit: 843 return NULL; 844 } 845 846 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 847 struct veth_xdp_tx_bq *bq, 848 struct veth_stats *stats) 849 { 850 int i, done = 0, n_xdpf = 0; 851 void *xdpf[VETH_XDP_BATCH]; 852 853 for (i = 0; i < budget; i++) { 854 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 855 856 if (!ptr) 857 break; 858 859 if (veth_is_xdp_frame(ptr)) { 860 /* ndo_xdp_xmit */ 861 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 862 863 stats->xdp_bytes += frame->len; 864 frame = veth_xdp_rcv_one(rq, frame, bq, stats); 865 if (frame) { 866 /* XDP_PASS */ 867 xdpf[n_xdpf++] = frame; 868 if (n_xdpf == VETH_XDP_BATCH) { 869 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, 870 bq, stats); 871 n_xdpf = 0; 872 } 873 } 874 } else { 875 /* ndo_start_xmit */ 876 struct sk_buff *skb = ptr; 877 878 stats->xdp_bytes += skb->len; 879 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 880 if (skb) 881 napi_gro_receive(&rq->xdp_napi, skb); 882 } 883 done++; 884 } 885 886 if (n_xdpf) 887 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); 888 889 u64_stats_update_begin(&rq->stats.syncp); 890 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 891 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 892 rq->stats.vs.xdp_drops += stats->xdp_drops; 893 rq->stats.vs.rx_drops += stats->rx_drops; 894 rq->stats.vs.xdp_packets += done; 895 u64_stats_update_end(&rq->stats.syncp); 896 897 return done; 898 } 899 900 static int veth_poll(struct napi_struct *napi, int budget) 901 { 902 struct veth_rq *rq = 903 container_of(napi, struct veth_rq, xdp_napi); 904 struct veth_stats stats = {}; 905 struct veth_xdp_tx_bq bq; 906 int done; 907 908 bq.count = 0; 909 910 xdp_set_return_frame_no_direct(); 911 done = veth_xdp_rcv(rq, budget, &bq, &stats); 912 913 if (done < budget && napi_complete_done(napi, done)) { 914 /* Write rx_notify_masked before reading ptr_ring */ 915 smp_store_mb(rq->rx_notify_masked, false); 916 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 917 rq->rx_notify_masked = true; 918 napi_schedule(&rq->xdp_napi); 919 } 920 } 921 922 if (stats.xdp_tx > 0) 923 veth_xdp_flush(rq, &bq); 924 if (stats.xdp_redirect > 0) 925 xdp_do_flush(); 926 xdp_clear_return_frame_no_direct(); 927 928 return done; 929 } 930 931 static int __veth_napi_enable(struct net_device *dev) 932 { 933 struct veth_priv *priv = netdev_priv(dev); 934 int err, i; 935 936 for (i = 0; i < dev->real_num_rx_queues; i++) { 937 struct veth_rq *rq = &priv->rq[i]; 938 939 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 940 if (err) 941 goto err_xdp_ring; 942 } 943 944 for (i = 0; i < dev->real_num_rx_queues; i++) { 945 struct veth_rq *rq = &priv->rq[i]; 946 947 napi_enable(&rq->xdp_napi); 948 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 949 } 950 951 return 0; 952 err_xdp_ring: 953 for (i--; i >= 0; i--) 954 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 955 956 return err; 957 } 958 959 static void veth_napi_del(struct net_device *dev) 960 { 961 struct veth_priv *priv = netdev_priv(dev); 962 int i; 963 964 for (i = 0; i < dev->real_num_rx_queues; i++) { 965 struct veth_rq *rq = &priv->rq[i]; 966 967 rcu_assign_pointer(priv->rq[i].napi, NULL); 968 napi_disable(&rq->xdp_napi); 969 __netif_napi_del(&rq->xdp_napi); 970 } 971 synchronize_net(); 972 973 for (i = 0; i < dev->real_num_rx_queues; i++) { 974 struct veth_rq *rq = &priv->rq[i]; 975 976 rq->rx_notify_masked = false; 977 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 978 } 979 } 980 981 static bool veth_gro_requested(const struct net_device *dev) 982 { 983 return !!(dev->wanted_features & NETIF_F_GRO); 984 } 985 986 static int veth_enable_xdp(struct net_device *dev) 987 { 988 bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); 989 struct veth_priv *priv = netdev_priv(dev); 990 int err, i; 991 992 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 993 for (i = 0; i < dev->real_num_rx_queues; i++) { 994 struct veth_rq *rq = &priv->rq[i]; 995 996 if (!napi_already_on) 997 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 998 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 999 if (err < 0) 1000 goto err_rxq_reg; 1001 1002 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 1003 MEM_TYPE_PAGE_SHARED, 1004 NULL); 1005 if (err < 0) 1006 goto err_reg_mem; 1007 1008 /* Save original mem info as it can be overwritten */ 1009 rq->xdp_mem = rq->xdp_rxq.mem; 1010 } 1011 1012 if (!napi_already_on) { 1013 err = __veth_napi_enable(dev); 1014 if (err) 1015 goto err_rxq_reg; 1016 1017 if (!veth_gro_requested(dev)) { 1018 /* user-space did not require GRO, but adding XDP 1019 * is supposed to get GRO working 1020 */ 1021 dev->features |= NETIF_F_GRO; 1022 netdev_features_change(dev); 1023 } 1024 } 1025 } 1026 1027 for (i = 0; i < dev->real_num_rx_queues; i++) { 1028 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 1029 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1030 } 1031 1032 return 0; 1033 err_reg_mem: 1034 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 1035 err_rxq_reg: 1036 for (i--; i >= 0; i--) { 1037 struct veth_rq *rq = &priv->rq[i]; 1038 1039 xdp_rxq_info_unreg(&rq->xdp_rxq); 1040 if (!napi_already_on) 1041 netif_napi_del(&rq->xdp_napi); 1042 } 1043 1044 return err; 1045 } 1046 1047 static void veth_disable_xdp(struct net_device *dev) 1048 { 1049 struct veth_priv *priv = netdev_priv(dev); 1050 int i; 1051 1052 for (i = 0; i < dev->real_num_rx_queues; i++) 1053 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 1054 1055 if (!netif_running(dev) || !veth_gro_requested(dev)) { 1056 veth_napi_del(dev); 1057 1058 /* if user-space did not require GRO, since adding XDP 1059 * enabled it, clear it now 1060 */ 1061 if (!veth_gro_requested(dev) && netif_running(dev)) { 1062 dev->features &= ~NETIF_F_GRO; 1063 netdev_features_change(dev); 1064 } 1065 } 1066 1067 for (i = 0; i < dev->real_num_rx_queues; i++) { 1068 struct veth_rq *rq = &priv->rq[i]; 1069 1070 rq->xdp_rxq.mem = rq->xdp_mem; 1071 xdp_rxq_info_unreg(&rq->xdp_rxq); 1072 } 1073 } 1074 1075 static int veth_napi_enable(struct net_device *dev) 1076 { 1077 struct veth_priv *priv = netdev_priv(dev); 1078 int err, i; 1079 1080 for (i = 0; i < dev->real_num_rx_queues; i++) { 1081 struct veth_rq *rq = &priv->rq[i]; 1082 1083 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 1084 } 1085 1086 err = __veth_napi_enable(dev); 1087 if (err) { 1088 for (i = 0; i < dev->real_num_rx_queues; i++) { 1089 struct veth_rq *rq = &priv->rq[i]; 1090 1091 netif_napi_del(&rq->xdp_napi); 1092 } 1093 return err; 1094 } 1095 return err; 1096 } 1097 1098 static int veth_open(struct net_device *dev) 1099 { 1100 struct veth_priv *priv = netdev_priv(dev); 1101 struct net_device *peer = rtnl_dereference(priv->peer); 1102 int err; 1103 1104 if (!peer) 1105 return -ENOTCONN; 1106 1107 if (priv->_xdp_prog) { 1108 err = veth_enable_xdp(dev); 1109 if (err) 1110 return err; 1111 } else if (veth_gro_requested(dev)) { 1112 err = veth_napi_enable(dev); 1113 if (err) 1114 return err; 1115 } 1116 1117 if (peer->flags & IFF_UP) { 1118 netif_carrier_on(dev); 1119 netif_carrier_on(peer); 1120 } 1121 1122 return 0; 1123 } 1124 1125 static int veth_close(struct net_device *dev) 1126 { 1127 struct veth_priv *priv = netdev_priv(dev); 1128 struct net_device *peer = rtnl_dereference(priv->peer); 1129 1130 netif_carrier_off(dev); 1131 if (peer) 1132 netif_carrier_off(peer); 1133 1134 if (priv->_xdp_prog) 1135 veth_disable_xdp(dev); 1136 else if (veth_gro_requested(dev)) 1137 veth_napi_del(dev); 1138 1139 return 0; 1140 } 1141 1142 static int is_valid_veth_mtu(int mtu) 1143 { 1144 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1145 } 1146 1147 static int veth_alloc_queues(struct net_device *dev) 1148 { 1149 struct veth_priv *priv = netdev_priv(dev); 1150 int i; 1151 1152 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); 1153 if (!priv->rq) 1154 return -ENOMEM; 1155 1156 for (i = 0; i < dev->num_rx_queues; i++) { 1157 priv->rq[i].dev = dev; 1158 u64_stats_init(&priv->rq[i].stats.syncp); 1159 } 1160 1161 return 0; 1162 } 1163 1164 static void veth_free_queues(struct net_device *dev) 1165 { 1166 struct veth_priv *priv = netdev_priv(dev); 1167 1168 kfree(priv->rq); 1169 } 1170 1171 static int veth_dev_init(struct net_device *dev) 1172 { 1173 int err; 1174 1175 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 1176 if (!dev->lstats) 1177 return -ENOMEM; 1178 1179 err = veth_alloc_queues(dev); 1180 if (err) { 1181 free_percpu(dev->lstats); 1182 return err; 1183 } 1184 1185 return 0; 1186 } 1187 1188 static void veth_dev_free(struct net_device *dev) 1189 { 1190 veth_free_queues(dev); 1191 free_percpu(dev->lstats); 1192 } 1193 1194 #ifdef CONFIG_NET_POLL_CONTROLLER 1195 static void veth_poll_controller(struct net_device *dev) 1196 { 1197 /* veth only receives frames when its peer sends one 1198 * Since it has nothing to do with disabling irqs, we are guaranteed 1199 * never to have pending data when we poll for it so 1200 * there is nothing to do here. 1201 * 1202 * We need this though so netpoll recognizes us as an interface that 1203 * supports polling, which enables bridge devices in virt setups to 1204 * still use netconsole 1205 */ 1206 } 1207 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1208 1209 static int veth_get_iflink(const struct net_device *dev) 1210 { 1211 struct veth_priv *priv = netdev_priv(dev); 1212 struct net_device *peer; 1213 int iflink; 1214 1215 rcu_read_lock(); 1216 peer = rcu_dereference(priv->peer); 1217 iflink = peer ? peer->ifindex : 0; 1218 rcu_read_unlock(); 1219 1220 return iflink; 1221 } 1222 1223 static netdev_features_t veth_fix_features(struct net_device *dev, 1224 netdev_features_t features) 1225 { 1226 struct veth_priv *priv = netdev_priv(dev); 1227 struct net_device *peer; 1228 1229 peer = rtnl_dereference(priv->peer); 1230 if (peer) { 1231 struct veth_priv *peer_priv = netdev_priv(peer); 1232 1233 if (peer_priv->_xdp_prog) 1234 features &= ~NETIF_F_GSO_SOFTWARE; 1235 } 1236 if (priv->_xdp_prog) 1237 features |= NETIF_F_GRO; 1238 1239 return features; 1240 } 1241 1242 static int veth_set_features(struct net_device *dev, 1243 netdev_features_t features) 1244 { 1245 netdev_features_t changed = features ^ dev->features; 1246 struct veth_priv *priv = netdev_priv(dev); 1247 int err; 1248 1249 if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) 1250 return 0; 1251 1252 if (features & NETIF_F_GRO) { 1253 err = veth_napi_enable(dev); 1254 if (err) 1255 return err; 1256 } else { 1257 veth_napi_del(dev); 1258 } 1259 return 0; 1260 } 1261 1262 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1263 { 1264 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1265 struct net_device *peer; 1266 1267 if (new_hr < 0) 1268 new_hr = 0; 1269 1270 rcu_read_lock(); 1271 peer = rcu_dereference(priv->peer); 1272 if (unlikely(!peer)) 1273 goto out; 1274 1275 peer_priv = netdev_priv(peer); 1276 priv->requested_headroom = new_hr; 1277 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1278 dev->needed_headroom = new_hr; 1279 peer->needed_headroom = new_hr; 1280 1281 out: 1282 rcu_read_unlock(); 1283 } 1284 1285 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1286 struct netlink_ext_ack *extack) 1287 { 1288 struct veth_priv *priv = netdev_priv(dev); 1289 struct bpf_prog *old_prog; 1290 struct net_device *peer; 1291 unsigned int max_mtu; 1292 int err; 1293 1294 old_prog = priv->_xdp_prog; 1295 priv->_xdp_prog = prog; 1296 peer = rtnl_dereference(priv->peer); 1297 1298 if (prog) { 1299 if (!peer) { 1300 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1301 err = -ENOTCONN; 1302 goto err; 1303 } 1304 1305 max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - 1306 peer->hard_header_len - 1307 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 1308 if (peer->mtu > max_mtu) { 1309 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1310 err = -ERANGE; 1311 goto err; 1312 } 1313 1314 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1315 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1316 err = -ENOSPC; 1317 goto err; 1318 } 1319 1320 if (dev->flags & IFF_UP) { 1321 err = veth_enable_xdp(dev); 1322 if (err) { 1323 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1324 goto err; 1325 } 1326 } 1327 1328 if (!old_prog) { 1329 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1330 peer->max_mtu = max_mtu; 1331 } 1332 } 1333 1334 if (old_prog) { 1335 if (!prog) { 1336 if (dev->flags & IFF_UP) 1337 veth_disable_xdp(dev); 1338 1339 if (peer) { 1340 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1341 peer->max_mtu = ETH_MAX_MTU; 1342 } 1343 } 1344 bpf_prog_put(old_prog); 1345 } 1346 1347 if ((!!old_prog ^ !!prog) && peer) 1348 netdev_update_features(peer); 1349 1350 return 0; 1351 err: 1352 priv->_xdp_prog = old_prog; 1353 1354 return err; 1355 } 1356 1357 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1358 { 1359 switch (xdp->command) { 1360 case XDP_SETUP_PROG: 1361 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1362 default: 1363 return -EINVAL; 1364 } 1365 } 1366 1367 static const struct net_device_ops veth_netdev_ops = { 1368 .ndo_init = veth_dev_init, 1369 .ndo_open = veth_open, 1370 .ndo_stop = veth_close, 1371 .ndo_start_xmit = veth_xmit, 1372 .ndo_get_stats64 = veth_get_stats64, 1373 .ndo_set_rx_mode = veth_set_multicast_list, 1374 .ndo_set_mac_address = eth_mac_addr, 1375 #ifdef CONFIG_NET_POLL_CONTROLLER 1376 .ndo_poll_controller = veth_poll_controller, 1377 #endif 1378 .ndo_get_iflink = veth_get_iflink, 1379 .ndo_fix_features = veth_fix_features, 1380 .ndo_set_features = veth_set_features, 1381 .ndo_features_check = passthru_features_check, 1382 .ndo_set_rx_headroom = veth_set_rx_headroom, 1383 .ndo_bpf = veth_xdp, 1384 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1385 .ndo_get_peer_dev = veth_peer_dev, 1386 }; 1387 1388 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1389 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1390 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1391 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1392 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1393 1394 static void veth_setup(struct net_device *dev) 1395 { 1396 ether_setup(dev); 1397 1398 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1399 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1400 dev->priv_flags |= IFF_NO_QUEUE; 1401 dev->priv_flags |= IFF_PHONY_HEADROOM; 1402 1403 dev->netdev_ops = &veth_netdev_ops; 1404 dev->ethtool_ops = &veth_ethtool_ops; 1405 dev->features |= NETIF_F_LLTX; 1406 dev->features |= VETH_FEATURES; 1407 dev->vlan_features = dev->features & 1408 ~(NETIF_F_HW_VLAN_CTAG_TX | 1409 NETIF_F_HW_VLAN_STAG_TX | 1410 NETIF_F_HW_VLAN_CTAG_RX | 1411 NETIF_F_HW_VLAN_STAG_RX); 1412 dev->needs_free_netdev = true; 1413 dev->priv_destructor = veth_dev_free; 1414 dev->max_mtu = ETH_MAX_MTU; 1415 1416 dev->hw_features = VETH_FEATURES; 1417 dev->hw_enc_features = VETH_FEATURES; 1418 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1419 } 1420 1421 /* 1422 * netlink interface 1423 */ 1424 1425 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1426 struct netlink_ext_ack *extack) 1427 { 1428 if (tb[IFLA_ADDRESS]) { 1429 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1430 return -EINVAL; 1431 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1432 return -EADDRNOTAVAIL; 1433 } 1434 if (tb[IFLA_MTU]) { 1435 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1436 return -EINVAL; 1437 } 1438 return 0; 1439 } 1440 1441 static struct rtnl_link_ops veth_link_ops; 1442 1443 static void veth_disable_gro(struct net_device *dev) 1444 { 1445 dev->features &= ~NETIF_F_GRO; 1446 dev->wanted_features &= ~NETIF_F_GRO; 1447 netdev_update_features(dev); 1448 } 1449 1450 static int veth_newlink(struct net *src_net, struct net_device *dev, 1451 struct nlattr *tb[], struct nlattr *data[], 1452 struct netlink_ext_ack *extack) 1453 { 1454 int err; 1455 struct net_device *peer; 1456 struct veth_priv *priv; 1457 char ifname[IFNAMSIZ]; 1458 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1459 unsigned char name_assign_type; 1460 struct ifinfomsg *ifmp; 1461 struct net *net; 1462 1463 /* 1464 * create and register peer first 1465 */ 1466 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1467 struct nlattr *nla_peer; 1468 1469 nla_peer = data[VETH_INFO_PEER]; 1470 ifmp = nla_data(nla_peer); 1471 err = rtnl_nla_parse_ifla(peer_tb, 1472 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1473 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1474 NULL); 1475 if (err < 0) 1476 return err; 1477 1478 err = veth_validate(peer_tb, NULL, extack); 1479 if (err < 0) 1480 return err; 1481 1482 tbp = peer_tb; 1483 } else { 1484 ifmp = NULL; 1485 tbp = tb; 1486 } 1487 1488 if (ifmp && tbp[IFLA_IFNAME]) { 1489 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1490 name_assign_type = NET_NAME_USER; 1491 } else { 1492 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1493 name_assign_type = NET_NAME_ENUM; 1494 } 1495 1496 net = rtnl_link_get_net(src_net, tbp); 1497 if (IS_ERR(net)) 1498 return PTR_ERR(net); 1499 1500 peer = rtnl_create_link(net, ifname, name_assign_type, 1501 &veth_link_ops, tbp, extack); 1502 if (IS_ERR(peer)) { 1503 put_net(net); 1504 return PTR_ERR(peer); 1505 } 1506 1507 if (!ifmp || !tbp[IFLA_ADDRESS]) 1508 eth_hw_addr_random(peer); 1509 1510 if (ifmp && (dev->ifindex != 0)) 1511 peer->ifindex = ifmp->ifi_index; 1512 1513 peer->gso_max_size = dev->gso_max_size; 1514 peer->gso_max_segs = dev->gso_max_segs; 1515 1516 err = register_netdevice(peer); 1517 put_net(net); 1518 net = NULL; 1519 if (err < 0) 1520 goto err_register_peer; 1521 1522 /* keep GRO disabled by default to be consistent with the established 1523 * veth behavior 1524 */ 1525 veth_disable_gro(peer); 1526 netif_carrier_off(peer); 1527 1528 err = rtnl_configure_link(peer, ifmp); 1529 if (err < 0) 1530 goto err_configure_peer; 1531 1532 /* 1533 * register dev last 1534 * 1535 * note, that since we've registered new device the dev's name 1536 * should be re-allocated 1537 */ 1538 1539 if (tb[IFLA_ADDRESS] == NULL) 1540 eth_hw_addr_random(dev); 1541 1542 if (tb[IFLA_IFNAME]) 1543 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1544 else 1545 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1546 1547 err = register_netdevice(dev); 1548 if (err < 0) 1549 goto err_register_dev; 1550 1551 netif_carrier_off(dev); 1552 1553 /* 1554 * tie the deviced together 1555 */ 1556 1557 priv = netdev_priv(dev); 1558 rcu_assign_pointer(priv->peer, peer); 1559 1560 priv = netdev_priv(peer); 1561 rcu_assign_pointer(priv->peer, dev); 1562 1563 veth_disable_gro(dev); 1564 return 0; 1565 1566 err_register_dev: 1567 /* nothing to do */ 1568 err_configure_peer: 1569 unregister_netdevice(peer); 1570 return err; 1571 1572 err_register_peer: 1573 free_netdev(peer); 1574 return err; 1575 } 1576 1577 static void veth_dellink(struct net_device *dev, struct list_head *head) 1578 { 1579 struct veth_priv *priv; 1580 struct net_device *peer; 1581 1582 priv = netdev_priv(dev); 1583 peer = rtnl_dereference(priv->peer); 1584 1585 /* Note : dellink() is called from default_device_exit_batch(), 1586 * before a rcu_synchronize() point. The devices are guaranteed 1587 * not being freed before one RCU grace period. 1588 */ 1589 RCU_INIT_POINTER(priv->peer, NULL); 1590 unregister_netdevice_queue(dev, head); 1591 1592 if (peer) { 1593 priv = netdev_priv(peer); 1594 RCU_INIT_POINTER(priv->peer, NULL); 1595 unregister_netdevice_queue(peer, head); 1596 } 1597 } 1598 1599 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1600 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1601 }; 1602 1603 static struct net *veth_get_link_net(const struct net_device *dev) 1604 { 1605 struct veth_priv *priv = netdev_priv(dev); 1606 struct net_device *peer = rtnl_dereference(priv->peer); 1607 1608 return peer ? dev_net(peer) : dev_net(dev); 1609 } 1610 1611 static struct rtnl_link_ops veth_link_ops = { 1612 .kind = DRV_NAME, 1613 .priv_size = sizeof(struct veth_priv), 1614 .setup = veth_setup, 1615 .validate = veth_validate, 1616 .newlink = veth_newlink, 1617 .dellink = veth_dellink, 1618 .policy = veth_policy, 1619 .maxtype = VETH_INFO_MAX, 1620 .get_link_net = veth_get_link_net, 1621 }; 1622 1623 /* 1624 * init/fini 1625 */ 1626 1627 static __init int veth_init(void) 1628 { 1629 return rtnl_link_register(&veth_link_ops); 1630 } 1631 1632 static __exit void veth_exit(void) 1633 { 1634 rtnl_link_unregister(&veth_link_ops); 1635 } 1636 1637 module_init(veth_init); 1638 module_exit(veth_exit); 1639 1640 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1641 MODULE_LICENSE("GPL v2"); 1642 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1643