1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/xfrm.h> 21 #include <net/xdp.h> 22 #include <linux/veth.h> 23 #include <linux/module.h> 24 #include <linux/bpf.h> 25 #include <linux/filter.h> 26 #include <linux/ptr_ring.h> 27 #include <linux/bpf_trace.h> 28 #include <linux/net_tstamp.h> 29 30 #define DRV_NAME "veth" 31 #define DRV_VERSION "1.0" 32 33 #define VETH_XDP_FLAG BIT(0) 34 #define VETH_RING_SIZE 256 35 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 36 37 #define VETH_XDP_TX_BULK_SIZE 16 38 #define VETH_XDP_BATCH 16 39 40 struct veth_stats { 41 u64 rx_drops; 42 /* xdp */ 43 u64 xdp_packets; 44 u64 xdp_bytes; 45 u64 xdp_redirect; 46 u64 xdp_drops; 47 u64 xdp_tx; 48 u64 xdp_tx_err; 49 u64 peer_tq_xdp_xmit; 50 u64 peer_tq_xdp_xmit_err; 51 }; 52 53 struct veth_rq_stats { 54 struct veth_stats vs; 55 struct u64_stats_sync syncp; 56 }; 57 58 struct veth_rq { 59 struct napi_struct xdp_napi; 60 struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ 61 struct net_device *dev; 62 struct bpf_prog __rcu *xdp_prog; 63 struct xdp_mem_info xdp_mem; 64 struct veth_rq_stats stats; 65 bool rx_notify_masked; 66 struct ptr_ring xdp_ring; 67 struct xdp_rxq_info xdp_rxq; 68 }; 69 70 struct veth_priv { 71 struct net_device __rcu *peer; 72 atomic64_t dropped; 73 struct bpf_prog *_xdp_prog; 74 struct veth_rq *rq; 75 unsigned int requested_headroom; 76 }; 77 78 struct veth_xdp_tx_bq { 79 struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; 80 unsigned int count; 81 }; 82 83 /* 84 * ethtool interface 85 */ 86 87 struct veth_q_stat_desc { 88 char desc[ETH_GSTRING_LEN]; 89 size_t offset; 90 }; 91 92 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 93 94 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 95 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 96 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 97 { "drops", VETH_RQ_STAT(rx_drops) }, 98 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 99 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 100 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 101 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 102 }; 103 104 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 105 106 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 107 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 108 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 109 }; 110 111 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 112 113 static struct { 114 const char string[ETH_GSTRING_LEN]; 115 } ethtool_stats_keys[] = { 116 { "peer_ifindex" }, 117 }; 118 119 static int veth_get_link_ksettings(struct net_device *dev, 120 struct ethtool_link_ksettings *cmd) 121 { 122 cmd->base.speed = SPEED_10000; 123 cmd->base.duplex = DUPLEX_FULL; 124 cmd->base.port = PORT_TP; 125 cmd->base.autoneg = AUTONEG_DISABLE; 126 return 0; 127 } 128 129 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 130 { 131 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 132 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 133 } 134 135 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 136 { 137 u8 *p = buf; 138 int i, j; 139 140 switch(stringset) { 141 case ETH_SS_STATS: 142 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 143 p += sizeof(ethtool_stats_keys); 144 for (i = 0; i < dev->real_num_rx_queues; i++) 145 for (j = 0; j < VETH_RQ_STATS_LEN; j++) 146 ethtool_sprintf(&p, "rx_queue_%u_%.18s", 147 i, veth_rq_stats_desc[j].desc); 148 149 for (i = 0; i < dev->real_num_tx_queues; i++) 150 for (j = 0; j < VETH_TQ_STATS_LEN; j++) 151 ethtool_sprintf(&p, "tx_queue_%u_%.18s", 152 i, veth_tq_stats_desc[j].desc); 153 break; 154 } 155 } 156 157 static int veth_get_sset_count(struct net_device *dev, int sset) 158 { 159 switch (sset) { 160 case ETH_SS_STATS: 161 return ARRAY_SIZE(ethtool_stats_keys) + 162 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 163 VETH_TQ_STATS_LEN * dev->real_num_tx_queues; 164 default: 165 return -EOPNOTSUPP; 166 } 167 } 168 169 static void veth_get_ethtool_stats(struct net_device *dev, 170 struct ethtool_stats *stats, u64 *data) 171 { 172 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 173 struct net_device *peer = rtnl_dereference(priv->peer); 174 int i, j, idx; 175 176 data[0] = peer ? peer->ifindex : 0; 177 idx = 1; 178 for (i = 0; i < dev->real_num_rx_queues; i++) { 179 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 180 const void *stats_base = (void *)&rq_stats->vs; 181 unsigned int start; 182 size_t offset; 183 184 do { 185 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 186 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 187 offset = veth_rq_stats_desc[j].offset; 188 data[idx + j] = *(u64 *)(stats_base + offset); 189 } 190 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 191 idx += VETH_RQ_STATS_LEN; 192 } 193 194 if (!peer) 195 return; 196 197 rcv_priv = netdev_priv(peer); 198 for (i = 0; i < peer->real_num_rx_queues; i++) { 199 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 200 const void *base = (void *)&rq_stats->vs; 201 unsigned int start, tx_idx = idx; 202 size_t offset; 203 204 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 205 do { 206 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 207 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 208 offset = veth_tq_stats_desc[j].offset; 209 data[tx_idx + j] += *(u64 *)(base + offset); 210 } 211 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 212 } 213 } 214 215 static void veth_get_channels(struct net_device *dev, 216 struct ethtool_channels *channels) 217 { 218 channels->tx_count = dev->real_num_tx_queues; 219 channels->rx_count = dev->real_num_rx_queues; 220 channels->max_tx = dev->num_tx_queues; 221 channels->max_rx = dev->num_rx_queues; 222 } 223 224 static int veth_set_channels(struct net_device *dev, 225 struct ethtool_channels *ch); 226 227 static const struct ethtool_ops veth_ethtool_ops = { 228 .get_drvinfo = veth_get_drvinfo, 229 .get_link = ethtool_op_get_link, 230 .get_strings = veth_get_strings, 231 .get_sset_count = veth_get_sset_count, 232 .get_ethtool_stats = veth_get_ethtool_stats, 233 .get_link_ksettings = veth_get_link_ksettings, 234 .get_ts_info = ethtool_op_get_ts_info, 235 .get_channels = veth_get_channels, 236 .set_channels = veth_set_channels, 237 }; 238 239 /* general routines */ 240 241 static bool veth_is_xdp_frame(void *ptr) 242 { 243 return (unsigned long)ptr & VETH_XDP_FLAG; 244 } 245 246 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 247 { 248 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 249 } 250 251 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 252 { 253 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 254 } 255 256 static void veth_ptr_free(void *ptr) 257 { 258 if (veth_is_xdp_frame(ptr)) 259 xdp_return_frame(veth_ptr_to_xdp(ptr)); 260 else 261 kfree_skb(ptr); 262 } 263 264 static void __veth_xdp_flush(struct veth_rq *rq) 265 { 266 /* Write ptr_ring before reading rx_notify_masked */ 267 smp_mb(); 268 if (!READ_ONCE(rq->rx_notify_masked) && 269 napi_schedule_prep(&rq->xdp_napi)) { 270 WRITE_ONCE(rq->rx_notify_masked, true); 271 __napi_schedule(&rq->xdp_napi); 272 } 273 } 274 275 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 276 { 277 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 278 dev_kfree_skb_any(skb); 279 return NET_RX_DROP; 280 } 281 282 return NET_RX_SUCCESS; 283 } 284 285 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 286 struct veth_rq *rq, bool xdp) 287 { 288 return __dev_forward_skb(dev, skb) ?: xdp ? 289 veth_xdp_rx(rq, skb) : 290 __netif_rx(skb); 291 } 292 293 /* return true if the specified skb has chances of GRO aggregation 294 * Don't strive for accuracy, but try to avoid GRO overhead in the most 295 * common scenarios. 296 * When XDP is enabled, all traffic is considered eligible, as the xmit 297 * device has TSO off. 298 * When TSO is enabled on the xmit device, we are likely interested only 299 * in UDP aggregation, explicitly check for that if the skb is suspected 300 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - 301 * to belong to locally generated UDP traffic. 302 */ 303 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, 304 const struct net_device *rcv, 305 const struct sk_buff *skb) 306 { 307 return !(dev->features & NETIF_F_ALL_TSO) || 308 (skb->destructor == sock_wfree && 309 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); 310 } 311 312 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 313 { 314 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 315 struct veth_rq *rq = NULL; 316 struct net_device *rcv; 317 int length = skb->len; 318 bool use_napi = false; 319 int rxq; 320 321 rcu_read_lock(); 322 rcv = rcu_dereference(priv->peer); 323 if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { 324 kfree_skb(skb); 325 goto drop; 326 } 327 328 rcv_priv = netdev_priv(rcv); 329 rxq = skb_get_queue_mapping(skb); 330 if (rxq < rcv->real_num_rx_queues) { 331 rq = &rcv_priv->rq[rxq]; 332 333 /* The napi pointer is available when an XDP program is 334 * attached or when GRO is enabled 335 * Don't bother with napi/GRO if the skb can't be aggregated 336 */ 337 use_napi = rcu_access_pointer(rq->napi) && 338 veth_skb_is_eligible_for_gro(dev, rcv, skb); 339 } 340 341 skb_tx_timestamp(skb); 342 if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { 343 if (!use_napi) 344 dev_lstats_add(dev, length); 345 } else { 346 drop: 347 atomic64_inc(&priv->dropped); 348 } 349 350 if (use_napi) 351 __veth_xdp_flush(rq); 352 353 rcu_read_unlock(); 354 355 return NETDEV_TX_OK; 356 } 357 358 static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) 359 { 360 struct veth_priv *priv = netdev_priv(dev); 361 362 dev_lstats_read(dev, packets, bytes); 363 return atomic64_read(&priv->dropped); 364 } 365 366 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 367 { 368 struct veth_priv *priv = netdev_priv(dev); 369 int i; 370 371 result->peer_tq_xdp_xmit_err = 0; 372 result->xdp_packets = 0; 373 result->xdp_tx_err = 0; 374 result->xdp_bytes = 0; 375 result->rx_drops = 0; 376 for (i = 0; i < dev->num_rx_queues; i++) { 377 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 378 struct veth_rq_stats *stats = &priv->rq[i].stats; 379 unsigned int start; 380 381 do { 382 start = u64_stats_fetch_begin_irq(&stats->syncp); 383 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 384 xdp_tx_err = stats->vs.xdp_tx_err; 385 packets = stats->vs.xdp_packets; 386 bytes = stats->vs.xdp_bytes; 387 drops = stats->vs.rx_drops; 388 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 389 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 390 result->xdp_tx_err += xdp_tx_err; 391 result->xdp_packets += packets; 392 result->xdp_bytes += bytes; 393 result->rx_drops += drops; 394 } 395 } 396 397 static void veth_get_stats64(struct net_device *dev, 398 struct rtnl_link_stats64 *tot) 399 { 400 struct veth_priv *priv = netdev_priv(dev); 401 struct net_device *peer; 402 struct veth_stats rx; 403 u64 packets, bytes; 404 405 tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); 406 tot->tx_bytes = bytes; 407 tot->tx_packets = packets; 408 409 veth_stats_rx(&rx, dev); 410 tot->tx_dropped += rx.xdp_tx_err; 411 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 412 tot->rx_bytes = rx.xdp_bytes; 413 tot->rx_packets = rx.xdp_packets; 414 415 rcu_read_lock(); 416 peer = rcu_dereference(priv->peer); 417 if (peer) { 418 veth_stats_tx(peer, &packets, &bytes); 419 tot->rx_bytes += bytes; 420 tot->rx_packets += packets; 421 422 veth_stats_rx(&rx, peer); 423 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 424 tot->rx_dropped += rx.xdp_tx_err; 425 tot->tx_bytes += rx.xdp_bytes; 426 tot->tx_packets += rx.xdp_packets; 427 } 428 rcu_read_unlock(); 429 } 430 431 /* fake multicast ability */ 432 static void veth_set_multicast_list(struct net_device *dev) 433 { 434 } 435 436 static int veth_select_rxq(struct net_device *dev) 437 { 438 return smp_processor_id() % dev->real_num_rx_queues; 439 } 440 441 static struct net_device *veth_peer_dev(struct net_device *dev) 442 { 443 struct veth_priv *priv = netdev_priv(dev); 444 445 /* Callers must be under RCU read side. */ 446 return rcu_dereference(priv->peer); 447 } 448 449 static int veth_xdp_xmit(struct net_device *dev, int n, 450 struct xdp_frame **frames, 451 u32 flags, bool ndo_xmit) 452 { 453 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 454 int i, ret = -ENXIO, nxmit = 0; 455 struct net_device *rcv; 456 unsigned int max_len; 457 struct veth_rq *rq; 458 459 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 460 return -EINVAL; 461 462 rcu_read_lock(); 463 rcv = rcu_dereference(priv->peer); 464 if (unlikely(!rcv)) 465 goto out; 466 467 rcv_priv = netdev_priv(rcv); 468 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 469 /* The napi pointer is set if NAPI is enabled, which ensures that 470 * xdp_ring is initialized on receive side and the peer device is up. 471 */ 472 if (!rcu_access_pointer(rq->napi)) 473 goto out; 474 475 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 476 477 spin_lock(&rq->xdp_ring.producer_lock); 478 for (i = 0; i < n; i++) { 479 struct xdp_frame *frame = frames[i]; 480 void *ptr = veth_xdp_to_ptr(frame); 481 482 if (unlikely(xdp_get_frame_len(frame) > max_len || 483 __ptr_ring_produce(&rq->xdp_ring, ptr))) 484 break; 485 nxmit++; 486 } 487 spin_unlock(&rq->xdp_ring.producer_lock); 488 489 if (flags & XDP_XMIT_FLUSH) 490 __veth_xdp_flush(rq); 491 492 ret = nxmit; 493 if (ndo_xmit) { 494 u64_stats_update_begin(&rq->stats.syncp); 495 rq->stats.vs.peer_tq_xdp_xmit += nxmit; 496 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; 497 u64_stats_update_end(&rq->stats.syncp); 498 } 499 500 out: 501 rcu_read_unlock(); 502 503 return ret; 504 } 505 506 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 507 struct xdp_frame **frames, u32 flags) 508 { 509 int err; 510 511 err = veth_xdp_xmit(dev, n, frames, flags, true); 512 if (err < 0) { 513 struct veth_priv *priv = netdev_priv(dev); 514 515 atomic64_add(n, &priv->dropped); 516 } 517 518 return err; 519 } 520 521 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 522 { 523 int sent, i, err = 0, drops; 524 525 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 526 if (sent < 0) { 527 err = sent; 528 sent = 0; 529 } 530 531 for (i = sent; unlikely(i < bq->count); i++) 532 xdp_return_frame(bq->q[i]); 533 534 drops = bq->count - sent; 535 trace_xdp_bulk_tx(rq->dev, sent, drops, err); 536 537 u64_stats_update_begin(&rq->stats.syncp); 538 rq->stats.vs.xdp_tx += sent; 539 rq->stats.vs.xdp_tx_err += drops; 540 u64_stats_update_end(&rq->stats.syncp); 541 542 bq->count = 0; 543 } 544 545 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 546 { 547 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 548 struct net_device *rcv; 549 struct veth_rq *rcv_rq; 550 551 rcu_read_lock(); 552 veth_xdp_flush_bq(rq, bq); 553 rcv = rcu_dereference(priv->peer); 554 if (unlikely(!rcv)) 555 goto out; 556 557 rcv_priv = netdev_priv(rcv); 558 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 559 /* xdp_ring is initialized on receive side? */ 560 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 561 goto out; 562 563 __veth_xdp_flush(rcv_rq); 564 out: 565 rcu_read_unlock(); 566 } 567 568 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 569 struct veth_xdp_tx_bq *bq) 570 { 571 struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); 572 573 if (unlikely(!frame)) 574 return -EOVERFLOW; 575 576 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 577 veth_xdp_flush_bq(rq, bq); 578 579 bq->q[bq->count++] = frame; 580 581 return 0; 582 } 583 584 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, 585 struct xdp_frame *frame, 586 struct veth_xdp_tx_bq *bq, 587 struct veth_stats *stats) 588 { 589 struct xdp_frame orig_frame; 590 struct bpf_prog *xdp_prog; 591 592 rcu_read_lock(); 593 xdp_prog = rcu_dereference(rq->xdp_prog); 594 if (likely(xdp_prog)) { 595 struct xdp_buff xdp; 596 u32 act; 597 598 xdp_convert_frame_to_buff(frame, &xdp); 599 xdp.rxq = &rq->xdp_rxq; 600 601 act = bpf_prog_run_xdp(xdp_prog, &xdp); 602 603 switch (act) { 604 case XDP_PASS: 605 if (xdp_update_frame_from_buff(&xdp, frame)) 606 goto err_xdp; 607 break; 608 case XDP_TX: 609 orig_frame = *frame; 610 xdp.rxq->mem = frame->mem; 611 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 612 trace_xdp_exception(rq->dev, xdp_prog, act); 613 frame = &orig_frame; 614 stats->rx_drops++; 615 goto err_xdp; 616 } 617 stats->xdp_tx++; 618 rcu_read_unlock(); 619 goto xdp_xmit; 620 case XDP_REDIRECT: 621 orig_frame = *frame; 622 xdp.rxq->mem = frame->mem; 623 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 624 frame = &orig_frame; 625 stats->rx_drops++; 626 goto err_xdp; 627 } 628 stats->xdp_redirect++; 629 rcu_read_unlock(); 630 goto xdp_xmit; 631 default: 632 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 633 fallthrough; 634 case XDP_ABORTED: 635 trace_xdp_exception(rq->dev, xdp_prog, act); 636 fallthrough; 637 case XDP_DROP: 638 stats->xdp_drops++; 639 goto err_xdp; 640 } 641 } 642 rcu_read_unlock(); 643 644 return frame; 645 err_xdp: 646 rcu_read_unlock(); 647 xdp_return_frame(frame); 648 xdp_xmit: 649 return NULL; 650 } 651 652 /* frames array contains VETH_XDP_BATCH at most */ 653 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, 654 int n_xdpf, struct veth_xdp_tx_bq *bq, 655 struct veth_stats *stats) 656 { 657 void *skbs[VETH_XDP_BATCH]; 658 int i; 659 660 if (xdp_alloc_skb_bulk(skbs, n_xdpf, 661 GFP_ATOMIC | __GFP_ZERO) < 0) { 662 for (i = 0; i < n_xdpf; i++) 663 xdp_return_frame(frames[i]); 664 stats->rx_drops += n_xdpf; 665 666 return; 667 } 668 669 for (i = 0; i < n_xdpf; i++) { 670 struct sk_buff *skb = skbs[i]; 671 672 skb = __xdp_build_skb_from_frame(frames[i], skb, 673 rq->dev); 674 if (!skb) { 675 xdp_return_frame(frames[i]); 676 stats->rx_drops++; 677 continue; 678 } 679 napi_gro_receive(&rq->xdp_napi, skb); 680 } 681 } 682 683 static void veth_xdp_get(struct xdp_buff *xdp) 684 { 685 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); 686 int i; 687 688 get_page(virt_to_page(xdp->data)); 689 if (likely(!xdp_buff_has_frags(xdp))) 690 return; 691 692 for (i = 0; i < sinfo->nr_frags; i++) 693 __skb_frag_ref(&sinfo->frags[i]); 694 } 695 696 static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, 697 struct xdp_buff *xdp, 698 struct sk_buff **pskb) 699 { 700 struct sk_buff *skb = *pskb; 701 u32 frame_sz; 702 703 if (skb_shared(skb) || skb_head_is_locked(skb) || 704 skb_shinfo(skb)->nr_frags) { 705 u32 size, len, max_head_size, off; 706 struct sk_buff *nskb; 707 struct page *page; 708 int i, head_off; 709 710 /* We need a private copy of the skb and data buffers since 711 * the ebpf program can modify it. We segment the original skb 712 * into order-0 pages without linearize it. 713 * 714 * Make sure we have enough space for linear and paged area 715 */ 716 max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - 717 VETH_XDP_HEADROOM); 718 if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size) 719 goto drop; 720 721 /* Allocate skb head */ 722 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 723 if (!page) 724 goto drop; 725 726 nskb = build_skb(page_address(page), PAGE_SIZE); 727 if (!nskb) { 728 put_page(page); 729 goto drop; 730 } 731 732 skb_reserve(nskb, VETH_XDP_HEADROOM); 733 size = min_t(u32, skb->len, max_head_size); 734 if (skb_copy_bits(skb, 0, nskb->data, size)) { 735 consume_skb(nskb); 736 goto drop; 737 } 738 skb_put(nskb, size); 739 740 skb_copy_header(nskb, skb); 741 head_off = skb_headroom(nskb) - skb_headroom(skb); 742 skb_headers_offset_update(nskb, head_off); 743 744 /* Allocate paged area of new skb */ 745 off = size; 746 len = skb->len - off; 747 748 for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { 749 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 750 if (!page) { 751 consume_skb(nskb); 752 goto drop; 753 } 754 755 size = min_t(u32, len, PAGE_SIZE); 756 skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE); 757 if (skb_copy_bits(skb, off, page_address(page), 758 size)) { 759 consume_skb(nskb); 760 goto drop; 761 } 762 763 len -= size; 764 off += size; 765 } 766 767 consume_skb(skb); 768 skb = nskb; 769 } else if (skb_headroom(skb) < XDP_PACKET_HEADROOM && 770 pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) { 771 goto drop; 772 } 773 774 /* SKB "head" area always have tailroom for skb_shared_info */ 775 frame_sz = skb_end_pointer(skb) - skb->head; 776 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 777 xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); 778 xdp_prepare_buff(xdp, skb->head, skb_headroom(skb), 779 skb_headlen(skb), true); 780 781 if (skb_is_nonlinear(skb)) { 782 skb_shinfo(skb)->xdp_frags_size = skb->data_len; 783 xdp_buff_set_frags_flag(xdp); 784 } else { 785 xdp_buff_clear_frags_flag(xdp); 786 } 787 *pskb = skb; 788 789 return 0; 790 drop: 791 consume_skb(skb); 792 *pskb = NULL; 793 794 return -ENOMEM; 795 } 796 797 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 798 struct sk_buff *skb, 799 struct veth_xdp_tx_bq *bq, 800 struct veth_stats *stats) 801 { 802 void *orig_data, *orig_data_end; 803 struct bpf_prog *xdp_prog; 804 struct xdp_buff xdp; 805 u32 act, metalen; 806 int off; 807 808 skb_prepare_for_gro(skb); 809 810 rcu_read_lock(); 811 xdp_prog = rcu_dereference(rq->xdp_prog); 812 if (unlikely(!xdp_prog)) { 813 rcu_read_unlock(); 814 goto out; 815 } 816 817 __skb_push(skb, skb->data - skb_mac_header(skb)); 818 if (veth_convert_skb_to_xdp_buff(rq, &xdp, &skb)) 819 goto drop; 820 821 orig_data = xdp.data; 822 orig_data_end = xdp.data_end; 823 824 act = bpf_prog_run_xdp(xdp_prog, &xdp); 825 826 switch (act) { 827 case XDP_PASS: 828 break; 829 case XDP_TX: 830 veth_xdp_get(&xdp); 831 consume_skb(skb); 832 xdp.rxq->mem = rq->xdp_mem; 833 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 834 trace_xdp_exception(rq->dev, xdp_prog, act); 835 stats->rx_drops++; 836 goto err_xdp; 837 } 838 stats->xdp_tx++; 839 rcu_read_unlock(); 840 goto xdp_xmit; 841 case XDP_REDIRECT: 842 veth_xdp_get(&xdp); 843 consume_skb(skb); 844 xdp.rxq->mem = rq->xdp_mem; 845 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 846 stats->rx_drops++; 847 goto err_xdp; 848 } 849 stats->xdp_redirect++; 850 rcu_read_unlock(); 851 goto xdp_xmit; 852 default: 853 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 854 fallthrough; 855 case XDP_ABORTED: 856 trace_xdp_exception(rq->dev, xdp_prog, act); 857 fallthrough; 858 case XDP_DROP: 859 stats->xdp_drops++; 860 goto xdp_drop; 861 } 862 rcu_read_unlock(); 863 864 /* check if bpf_xdp_adjust_head was used */ 865 off = orig_data - xdp.data; 866 if (off > 0) 867 __skb_push(skb, off); 868 else if (off < 0) 869 __skb_pull(skb, -off); 870 871 skb_reset_mac_header(skb); 872 873 /* check if bpf_xdp_adjust_tail was used */ 874 off = xdp.data_end - orig_data_end; 875 if (off != 0) 876 __skb_put(skb, off); /* positive on grow, negative on shrink */ 877 878 /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers 879 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. 880 */ 881 if (xdp_buff_has_frags(&xdp)) 882 skb->data_len = skb_shinfo(skb)->xdp_frags_size; 883 else 884 skb->data_len = 0; 885 886 skb->protocol = eth_type_trans(skb, rq->dev); 887 888 metalen = xdp.data - xdp.data_meta; 889 if (metalen) 890 skb_metadata_set(skb, metalen); 891 out: 892 return skb; 893 drop: 894 stats->rx_drops++; 895 xdp_drop: 896 rcu_read_unlock(); 897 kfree_skb(skb); 898 return NULL; 899 err_xdp: 900 rcu_read_unlock(); 901 xdp_return_buff(&xdp); 902 xdp_xmit: 903 return NULL; 904 } 905 906 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 907 struct veth_xdp_tx_bq *bq, 908 struct veth_stats *stats) 909 { 910 int i, done = 0, n_xdpf = 0; 911 void *xdpf[VETH_XDP_BATCH]; 912 913 for (i = 0; i < budget; i++) { 914 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 915 916 if (!ptr) 917 break; 918 919 if (veth_is_xdp_frame(ptr)) { 920 /* ndo_xdp_xmit */ 921 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 922 923 stats->xdp_bytes += xdp_get_frame_len(frame); 924 frame = veth_xdp_rcv_one(rq, frame, bq, stats); 925 if (frame) { 926 /* XDP_PASS */ 927 xdpf[n_xdpf++] = frame; 928 if (n_xdpf == VETH_XDP_BATCH) { 929 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, 930 bq, stats); 931 n_xdpf = 0; 932 } 933 } 934 } else { 935 /* ndo_start_xmit */ 936 struct sk_buff *skb = ptr; 937 938 stats->xdp_bytes += skb->len; 939 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 940 if (skb) { 941 if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) 942 netif_receive_skb(skb); 943 else 944 napi_gro_receive(&rq->xdp_napi, skb); 945 } 946 } 947 done++; 948 } 949 950 if (n_xdpf) 951 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); 952 953 u64_stats_update_begin(&rq->stats.syncp); 954 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 955 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 956 rq->stats.vs.xdp_drops += stats->xdp_drops; 957 rq->stats.vs.rx_drops += stats->rx_drops; 958 rq->stats.vs.xdp_packets += done; 959 u64_stats_update_end(&rq->stats.syncp); 960 961 return done; 962 } 963 964 static int veth_poll(struct napi_struct *napi, int budget) 965 { 966 struct veth_rq *rq = 967 container_of(napi, struct veth_rq, xdp_napi); 968 struct veth_stats stats = {}; 969 struct veth_xdp_tx_bq bq; 970 int done; 971 972 bq.count = 0; 973 974 xdp_set_return_frame_no_direct(); 975 done = veth_xdp_rcv(rq, budget, &bq, &stats); 976 977 if (done < budget && napi_complete_done(napi, done)) { 978 /* Write rx_notify_masked before reading ptr_ring */ 979 smp_store_mb(rq->rx_notify_masked, false); 980 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 981 if (napi_schedule_prep(&rq->xdp_napi)) { 982 WRITE_ONCE(rq->rx_notify_masked, true); 983 __napi_schedule(&rq->xdp_napi); 984 } 985 } 986 } 987 988 if (stats.xdp_tx > 0) 989 veth_xdp_flush(rq, &bq); 990 if (stats.xdp_redirect > 0) 991 xdp_do_flush(); 992 xdp_clear_return_frame_no_direct(); 993 994 return done; 995 } 996 997 static int __veth_napi_enable_range(struct net_device *dev, int start, int end) 998 { 999 struct veth_priv *priv = netdev_priv(dev); 1000 int err, i; 1001 1002 for (i = start; i < end; i++) { 1003 struct veth_rq *rq = &priv->rq[i]; 1004 1005 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 1006 if (err) 1007 goto err_xdp_ring; 1008 } 1009 1010 for (i = start; i < end; i++) { 1011 struct veth_rq *rq = &priv->rq[i]; 1012 1013 napi_enable(&rq->xdp_napi); 1014 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1015 } 1016 1017 return 0; 1018 1019 err_xdp_ring: 1020 for (i--; i >= start; i--) 1021 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 1022 1023 return err; 1024 } 1025 1026 static int __veth_napi_enable(struct net_device *dev) 1027 { 1028 return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1029 } 1030 1031 static void veth_napi_del_range(struct net_device *dev, int start, int end) 1032 { 1033 struct veth_priv *priv = netdev_priv(dev); 1034 int i; 1035 1036 for (i = start; i < end; i++) { 1037 struct veth_rq *rq = &priv->rq[i]; 1038 1039 rcu_assign_pointer(priv->rq[i].napi, NULL); 1040 napi_disable(&rq->xdp_napi); 1041 __netif_napi_del(&rq->xdp_napi); 1042 } 1043 synchronize_net(); 1044 1045 for (i = start; i < end; i++) { 1046 struct veth_rq *rq = &priv->rq[i]; 1047 1048 rq->rx_notify_masked = false; 1049 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 1050 } 1051 } 1052 1053 static void veth_napi_del(struct net_device *dev) 1054 { 1055 veth_napi_del_range(dev, 0, dev->real_num_rx_queues); 1056 } 1057 1058 static bool veth_gro_requested(const struct net_device *dev) 1059 { 1060 return !!(dev->wanted_features & NETIF_F_GRO); 1061 } 1062 1063 static int veth_enable_xdp_range(struct net_device *dev, int start, int end, 1064 bool napi_already_on) 1065 { 1066 struct veth_priv *priv = netdev_priv(dev); 1067 int err, i; 1068 1069 for (i = start; i < end; i++) { 1070 struct veth_rq *rq = &priv->rq[i]; 1071 1072 if (!napi_already_on) 1073 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 1074 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 1075 if (err < 0) 1076 goto err_rxq_reg; 1077 1078 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 1079 MEM_TYPE_PAGE_SHARED, 1080 NULL); 1081 if (err < 0) 1082 goto err_reg_mem; 1083 1084 /* Save original mem info as it can be overwritten */ 1085 rq->xdp_mem = rq->xdp_rxq.mem; 1086 } 1087 return 0; 1088 1089 err_reg_mem: 1090 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 1091 err_rxq_reg: 1092 for (i--; i >= start; i--) { 1093 struct veth_rq *rq = &priv->rq[i]; 1094 1095 xdp_rxq_info_unreg(&rq->xdp_rxq); 1096 if (!napi_already_on) 1097 netif_napi_del(&rq->xdp_napi); 1098 } 1099 1100 return err; 1101 } 1102 1103 static void veth_disable_xdp_range(struct net_device *dev, int start, int end, 1104 bool delete_napi) 1105 { 1106 struct veth_priv *priv = netdev_priv(dev); 1107 int i; 1108 1109 for (i = start; i < end; i++) { 1110 struct veth_rq *rq = &priv->rq[i]; 1111 1112 rq->xdp_rxq.mem = rq->xdp_mem; 1113 xdp_rxq_info_unreg(&rq->xdp_rxq); 1114 1115 if (delete_napi) 1116 netif_napi_del(&rq->xdp_napi); 1117 } 1118 } 1119 1120 static int veth_enable_xdp(struct net_device *dev) 1121 { 1122 bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); 1123 struct veth_priv *priv = netdev_priv(dev); 1124 int err, i; 1125 1126 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 1127 err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); 1128 if (err) 1129 return err; 1130 1131 if (!napi_already_on) { 1132 err = __veth_napi_enable(dev); 1133 if (err) { 1134 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); 1135 return err; 1136 } 1137 1138 if (!veth_gro_requested(dev)) { 1139 /* user-space did not require GRO, but adding XDP 1140 * is supposed to get GRO working 1141 */ 1142 dev->features |= NETIF_F_GRO; 1143 netdev_features_change(dev); 1144 } 1145 } 1146 } 1147 1148 for (i = 0; i < dev->real_num_rx_queues; i++) { 1149 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 1150 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1151 } 1152 1153 return 0; 1154 } 1155 1156 static void veth_disable_xdp(struct net_device *dev) 1157 { 1158 struct veth_priv *priv = netdev_priv(dev); 1159 int i; 1160 1161 for (i = 0; i < dev->real_num_rx_queues; i++) 1162 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 1163 1164 if (!netif_running(dev) || !veth_gro_requested(dev)) { 1165 veth_napi_del(dev); 1166 1167 /* if user-space did not require GRO, since adding XDP 1168 * enabled it, clear it now 1169 */ 1170 if (!veth_gro_requested(dev) && netif_running(dev)) { 1171 dev->features &= ~NETIF_F_GRO; 1172 netdev_features_change(dev); 1173 } 1174 } 1175 1176 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); 1177 } 1178 1179 static int veth_napi_enable_range(struct net_device *dev, int start, int end) 1180 { 1181 struct veth_priv *priv = netdev_priv(dev); 1182 int err, i; 1183 1184 for (i = start; i < end; i++) { 1185 struct veth_rq *rq = &priv->rq[i]; 1186 1187 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 1188 } 1189 1190 err = __veth_napi_enable_range(dev, start, end); 1191 if (err) { 1192 for (i = start; i < end; i++) { 1193 struct veth_rq *rq = &priv->rq[i]; 1194 1195 netif_napi_del(&rq->xdp_napi); 1196 } 1197 return err; 1198 } 1199 return err; 1200 } 1201 1202 static int veth_napi_enable(struct net_device *dev) 1203 { 1204 return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1205 } 1206 1207 static void veth_disable_range_safe(struct net_device *dev, int start, int end) 1208 { 1209 struct veth_priv *priv = netdev_priv(dev); 1210 1211 if (start >= end) 1212 return; 1213 1214 if (priv->_xdp_prog) { 1215 veth_napi_del_range(dev, start, end); 1216 veth_disable_xdp_range(dev, start, end, false); 1217 } else if (veth_gro_requested(dev)) { 1218 veth_napi_del_range(dev, start, end); 1219 } 1220 } 1221 1222 static int veth_enable_range_safe(struct net_device *dev, int start, int end) 1223 { 1224 struct veth_priv *priv = netdev_priv(dev); 1225 int err; 1226 1227 if (start >= end) 1228 return 0; 1229 1230 if (priv->_xdp_prog) { 1231 /* these channels are freshly initialized, napi is not on there even 1232 * when GRO is requeste 1233 */ 1234 err = veth_enable_xdp_range(dev, start, end, false); 1235 if (err) 1236 return err; 1237 1238 err = __veth_napi_enable_range(dev, start, end); 1239 if (err) { 1240 /* on error always delete the newly added napis */ 1241 veth_disable_xdp_range(dev, start, end, true); 1242 return err; 1243 } 1244 } else if (veth_gro_requested(dev)) { 1245 return veth_napi_enable_range(dev, start, end); 1246 } 1247 return 0; 1248 } 1249 1250 static int veth_set_channels(struct net_device *dev, 1251 struct ethtool_channels *ch) 1252 { 1253 struct veth_priv *priv = netdev_priv(dev); 1254 unsigned int old_rx_count, new_rx_count; 1255 struct veth_priv *peer_priv; 1256 struct net_device *peer; 1257 int err; 1258 1259 /* sanity check. Upper bounds are already enforced by the caller */ 1260 if (!ch->rx_count || !ch->tx_count) 1261 return -EINVAL; 1262 1263 /* avoid braking XDP, if that is enabled */ 1264 peer = rtnl_dereference(priv->peer); 1265 peer_priv = peer ? netdev_priv(peer) : NULL; 1266 if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) 1267 return -EINVAL; 1268 1269 if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) 1270 return -EINVAL; 1271 1272 old_rx_count = dev->real_num_rx_queues; 1273 new_rx_count = ch->rx_count; 1274 if (netif_running(dev)) { 1275 /* turn device off */ 1276 netif_carrier_off(dev); 1277 if (peer) 1278 netif_carrier_off(peer); 1279 1280 /* try to allocate new resurces, as needed*/ 1281 err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); 1282 if (err) 1283 goto out; 1284 } 1285 1286 err = netif_set_real_num_rx_queues(dev, ch->rx_count); 1287 if (err) 1288 goto revert; 1289 1290 err = netif_set_real_num_tx_queues(dev, ch->tx_count); 1291 if (err) { 1292 int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); 1293 1294 /* this error condition could happen only if rx and tx change 1295 * in opposite directions (e.g. tx nr raises, rx nr decreases) 1296 * and we can't do anything to fully restore the original 1297 * status 1298 */ 1299 if (err2) 1300 pr_warn("Can't restore rx queues config %d -> %d %d", 1301 new_rx_count, old_rx_count, err2); 1302 else 1303 goto revert; 1304 } 1305 1306 out: 1307 if (netif_running(dev)) { 1308 /* note that we need to swap the arguments WRT the enable part 1309 * to identify the range we have to disable 1310 */ 1311 veth_disable_range_safe(dev, new_rx_count, old_rx_count); 1312 netif_carrier_on(dev); 1313 if (peer) 1314 netif_carrier_on(peer); 1315 } 1316 return err; 1317 1318 revert: 1319 new_rx_count = old_rx_count; 1320 old_rx_count = ch->rx_count; 1321 goto out; 1322 } 1323 1324 static int veth_open(struct net_device *dev) 1325 { 1326 struct veth_priv *priv = netdev_priv(dev); 1327 struct net_device *peer = rtnl_dereference(priv->peer); 1328 int err; 1329 1330 if (!peer) 1331 return -ENOTCONN; 1332 1333 if (priv->_xdp_prog) { 1334 err = veth_enable_xdp(dev); 1335 if (err) 1336 return err; 1337 } else if (veth_gro_requested(dev)) { 1338 err = veth_napi_enable(dev); 1339 if (err) 1340 return err; 1341 } 1342 1343 if (peer->flags & IFF_UP) { 1344 netif_carrier_on(dev); 1345 netif_carrier_on(peer); 1346 } 1347 1348 return 0; 1349 } 1350 1351 static int veth_close(struct net_device *dev) 1352 { 1353 struct veth_priv *priv = netdev_priv(dev); 1354 struct net_device *peer = rtnl_dereference(priv->peer); 1355 1356 netif_carrier_off(dev); 1357 if (peer) 1358 netif_carrier_off(peer); 1359 1360 if (priv->_xdp_prog) 1361 veth_disable_xdp(dev); 1362 else if (veth_gro_requested(dev)) 1363 veth_napi_del(dev); 1364 1365 return 0; 1366 } 1367 1368 static int is_valid_veth_mtu(int mtu) 1369 { 1370 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1371 } 1372 1373 static int veth_alloc_queues(struct net_device *dev) 1374 { 1375 struct veth_priv *priv = netdev_priv(dev); 1376 int i; 1377 1378 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT); 1379 if (!priv->rq) 1380 return -ENOMEM; 1381 1382 for (i = 0; i < dev->num_rx_queues; i++) { 1383 priv->rq[i].dev = dev; 1384 u64_stats_init(&priv->rq[i].stats.syncp); 1385 } 1386 1387 return 0; 1388 } 1389 1390 static void veth_free_queues(struct net_device *dev) 1391 { 1392 struct veth_priv *priv = netdev_priv(dev); 1393 1394 kfree(priv->rq); 1395 } 1396 1397 static int veth_dev_init(struct net_device *dev) 1398 { 1399 int err; 1400 1401 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 1402 if (!dev->lstats) 1403 return -ENOMEM; 1404 1405 err = veth_alloc_queues(dev); 1406 if (err) { 1407 free_percpu(dev->lstats); 1408 return err; 1409 } 1410 1411 return 0; 1412 } 1413 1414 static void veth_dev_free(struct net_device *dev) 1415 { 1416 veth_free_queues(dev); 1417 free_percpu(dev->lstats); 1418 } 1419 1420 #ifdef CONFIG_NET_POLL_CONTROLLER 1421 static void veth_poll_controller(struct net_device *dev) 1422 { 1423 /* veth only receives frames when its peer sends one 1424 * Since it has nothing to do with disabling irqs, we are guaranteed 1425 * never to have pending data when we poll for it so 1426 * there is nothing to do here. 1427 * 1428 * We need this though so netpoll recognizes us as an interface that 1429 * supports polling, which enables bridge devices in virt setups to 1430 * still use netconsole 1431 */ 1432 } 1433 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1434 1435 static int veth_get_iflink(const struct net_device *dev) 1436 { 1437 struct veth_priv *priv = netdev_priv(dev); 1438 struct net_device *peer; 1439 int iflink; 1440 1441 rcu_read_lock(); 1442 peer = rcu_dereference(priv->peer); 1443 iflink = peer ? peer->ifindex : 0; 1444 rcu_read_unlock(); 1445 1446 return iflink; 1447 } 1448 1449 static netdev_features_t veth_fix_features(struct net_device *dev, 1450 netdev_features_t features) 1451 { 1452 struct veth_priv *priv = netdev_priv(dev); 1453 struct net_device *peer; 1454 1455 peer = rtnl_dereference(priv->peer); 1456 if (peer) { 1457 struct veth_priv *peer_priv = netdev_priv(peer); 1458 1459 if (peer_priv->_xdp_prog) 1460 features &= ~NETIF_F_GSO_SOFTWARE; 1461 } 1462 if (priv->_xdp_prog) 1463 features |= NETIF_F_GRO; 1464 1465 return features; 1466 } 1467 1468 static int veth_set_features(struct net_device *dev, 1469 netdev_features_t features) 1470 { 1471 netdev_features_t changed = features ^ dev->features; 1472 struct veth_priv *priv = netdev_priv(dev); 1473 int err; 1474 1475 if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) 1476 return 0; 1477 1478 if (features & NETIF_F_GRO) { 1479 err = veth_napi_enable(dev); 1480 if (err) 1481 return err; 1482 } else { 1483 veth_napi_del(dev); 1484 } 1485 return 0; 1486 } 1487 1488 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1489 { 1490 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1491 struct net_device *peer; 1492 1493 if (new_hr < 0) 1494 new_hr = 0; 1495 1496 rcu_read_lock(); 1497 peer = rcu_dereference(priv->peer); 1498 if (unlikely(!peer)) 1499 goto out; 1500 1501 peer_priv = netdev_priv(peer); 1502 priv->requested_headroom = new_hr; 1503 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1504 dev->needed_headroom = new_hr; 1505 peer->needed_headroom = new_hr; 1506 1507 out: 1508 rcu_read_unlock(); 1509 } 1510 1511 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1512 struct netlink_ext_ack *extack) 1513 { 1514 struct veth_priv *priv = netdev_priv(dev); 1515 struct bpf_prog *old_prog; 1516 struct net_device *peer; 1517 unsigned int max_mtu; 1518 int err; 1519 1520 old_prog = priv->_xdp_prog; 1521 priv->_xdp_prog = prog; 1522 peer = rtnl_dereference(priv->peer); 1523 1524 if (prog) { 1525 if (!peer) { 1526 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1527 err = -ENOTCONN; 1528 goto err; 1529 } 1530 1531 max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) - 1532 peer->hard_header_len; 1533 /* Allow increasing the max_mtu if the program supports 1534 * XDP fragments. 1535 */ 1536 if (prog->aux->xdp_has_frags) 1537 max_mtu += PAGE_SIZE * MAX_SKB_FRAGS; 1538 1539 if (peer->mtu > max_mtu) { 1540 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1541 err = -ERANGE; 1542 goto err; 1543 } 1544 1545 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1546 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1547 err = -ENOSPC; 1548 goto err; 1549 } 1550 1551 if (dev->flags & IFF_UP) { 1552 err = veth_enable_xdp(dev); 1553 if (err) { 1554 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1555 goto err; 1556 } 1557 } 1558 1559 if (!old_prog) { 1560 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1561 peer->max_mtu = max_mtu; 1562 } 1563 } 1564 1565 if (old_prog) { 1566 if (!prog) { 1567 if (dev->flags & IFF_UP) 1568 veth_disable_xdp(dev); 1569 1570 if (peer) { 1571 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1572 peer->max_mtu = ETH_MAX_MTU; 1573 } 1574 } 1575 bpf_prog_put(old_prog); 1576 } 1577 1578 if ((!!old_prog ^ !!prog) && peer) 1579 netdev_update_features(peer); 1580 1581 return 0; 1582 err: 1583 priv->_xdp_prog = old_prog; 1584 1585 return err; 1586 } 1587 1588 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1589 { 1590 switch (xdp->command) { 1591 case XDP_SETUP_PROG: 1592 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1593 default: 1594 return -EINVAL; 1595 } 1596 } 1597 1598 static const struct net_device_ops veth_netdev_ops = { 1599 .ndo_init = veth_dev_init, 1600 .ndo_open = veth_open, 1601 .ndo_stop = veth_close, 1602 .ndo_start_xmit = veth_xmit, 1603 .ndo_get_stats64 = veth_get_stats64, 1604 .ndo_set_rx_mode = veth_set_multicast_list, 1605 .ndo_set_mac_address = eth_mac_addr, 1606 #ifdef CONFIG_NET_POLL_CONTROLLER 1607 .ndo_poll_controller = veth_poll_controller, 1608 #endif 1609 .ndo_get_iflink = veth_get_iflink, 1610 .ndo_fix_features = veth_fix_features, 1611 .ndo_set_features = veth_set_features, 1612 .ndo_features_check = passthru_features_check, 1613 .ndo_set_rx_headroom = veth_set_rx_headroom, 1614 .ndo_bpf = veth_xdp, 1615 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1616 .ndo_get_peer_dev = veth_peer_dev, 1617 }; 1618 1619 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1620 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1621 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1622 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1623 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1624 1625 static void veth_setup(struct net_device *dev) 1626 { 1627 ether_setup(dev); 1628 1629 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1630 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1631 dev->priv_flags |= IFF_NO_QUEUE; 1632 dev->priv_flags |= IFF_PHONY_HEADROOM; 1633 1634 dev->netdev_ops = &veth_netdev_ops; 1635 dev->ethtool_ops = &veth_ethtool_ops; 1636 dev->features |= NETIF_F_LLTX; 1637 dev->features |= VETH_FEATURES; 1638 dev->vlan_features = dev->features & 1639 ~(NETIF_F_HW_VLAN_CTAG_TX | 1640 NETIF_F_HW_VLAN_STAG_TX | 1641 NETIF_F_HW_VLAN_CTAG_RX | 1642 NETIF_F_HW_VLAN_STAG_RX); 1643 dev->needs_free_netdev = true; 1644 dev->priv_destructor = veth_dev_free; 1645 dev->max_mtu = ETH_MAX_MTU; 1646 1647 dev->hw_features = VETH_FEATURES; 1648 dev->hw_enc_features = VETH_FEATURES; 1649 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1650 netif_set_tso_max_size(dev, GSO_MAX_SIZE); 1651 } 1652 1653 /* 1654 * netlink interface 1655 */ 1656 1657 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1658 struct netlink_ext_ack *extack) 1659 { 1660 if (tb[IFLA_ADDRESS]) { 1661 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1662 return -EINVAL; 1663 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1664 return -EADDRNOTAVAIL; 1665 } 1666 if (tb[IFLA_MTU]) { 1667 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1668 return -EINVAL; 1669 } 1670 return 0; 1671 } 1672 1673 static struct rtnl_link_ops veth_link_ops; 1674 1675 static void veth_disable_gro(struct net_device *dev) 1676 { 1677 dev->features &= ~NETIF_F_GRO; 1678 dev->wanted_features &= ~NETIF_F_GRO; 1679 netdev_update_features(dev); 1680 } 1681 1682 static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) 1683 { 1684 int err; 1685 1686 if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { 1687 err = netif_set_real_num_tx_queues(dev, 1); 1688 if (err) 1689 return err; 1690 } 1691 if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { 1692 err = netif_set_real_num_rx_queues(dev, 1); 1693 if (err) 1694 return err; 1695 } 1696 return 0; 1697 } 1698 1699 static int veth_newlink(struct net *src_net, struct net_device *dev, 1700 struct nlattr *tb[], struct nlattr *data[], 1701 struct netlink_ext_ack *extack) 1702 { 1703 int err; 1704 struct net_device *peer; 1705 struct veth_priv *priv; 1706 char ifname[IFNAMSIZ]; 1707 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1708 unsigned char name_assign_type; 1709 struct ifinfomsg *ifmp; 1710 struct net *net; 1711 1712 /* 1713 * create and register peer first 1714 */ 1715 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1716 struct nlattr *nla_peer; 1717 1718 nla_peer = data[VETH_INFO_PEER]; 1719 ifmp = nla_data(nla_peer); 1720 err = rtnl_nla_parse_ifla(peer_tb, 1721 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1722 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1723 NULL); 1724 if (err < 0) 1725 return err; 1726 1727 err = veth_validate(peer_tb, NULL, extack); 1728 if (err < 0) 1729 return err; 1730 1731 tbp = peer_tb; 1732 } else { 1733 ifmp = NULL; 1734 tbp = tb; 1735 } 1736 1737 if (ifmp && tbp[IFLA_IFNAME]) { 1738 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1739 name_assign_type = NET_NAME_USER; 1740 } else { 1741 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1742 name_assign_type = NET_NAME_ENUM; 1743 } 1744 1745 net = rtnl_link_get_net(src_net, tbp); 1746 if (IS_ERR(net)) 1747 return PTR_ERR(net); 1748 1749 peer = rtnl_create_link(net, ifname, name_assign_type, 1750 &veth_link_ops, tbp, extack); 1751 if (IS_ERR(peer)) { 1752 put_net(net); 1753 return PTR_ERR(peer); 1754 } 1755 1756 if (!ifmp || !tbp[IFLA_ADDRESS]) 1757 eth_hw_addr_random(peer); 1758 1759 if (ifmp && (dev->ifindex != 0)) 1760 peer->ifindex = ifmp->ifi_index; 1761 1762 netif_inherit_tso_max(peer, dev); 1763 1764 err = register_netdevice(peer); 1765 put_net(net); 1766 net = NULL; 1767 if (err < 0) 1768 goto err_register_peer; 1769 1770 /* keep GRO disabled by default to be consistent with the established 1771 * veth behavior 1772 */ 1773 veth_disable_gro(peer); 1774 netif_carrier_off(peer); 1775 1776 err = rtnl_configure_link(peer, ifmp); 1777 if (err < 0) 1778 goto err_configure_peer; 1779 1780 /* 1781 * register dev last 1782 * 1783 * note, that since we've registered new device the dev's name 1784 * should be re-allocated 1785 */ 1786 1787 if (tb[IFLA_ADDRESS] == NULL) 1788 eth_hw_addr_random(dev); 1789 1790 if (tb[IFLA_IFNAME]) 1791 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1792 else 1793 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1794 1795 err = register_netdevice(dev); 1796 if (err < 0) 1797 goto err_register_dev; 1798 1799 netif_carrier_off(dev); 1800 1801 /* 1802 * tie the deviced together 1803 */ 1804 1805 priv = netdev_priv(dev); 1806 rcu_assign_pointer(priv->peer, peer); 1807 err = veth_init_queues(dev, tb); 1808 if (err) 1809 goto err_queues; 1810 1811 priv = netdev_priv(peer); 1812 rcu_assign_pointer(priv->peer, dev); 1813 err = veth_init_queues(peer, tb); 1814 if (err) 1815 goto err_queues; 1816 1817 veth_disable_gro(dev); 1818 return 0; 1819 1820 err_queues: 1821 unregister_netdevice(dev); 1822 err_register_dev: 1823 /* nothing to do */ 1824 err_configure_peer: 1825 unregister_netdevice(peer); 1826 return err; 1827 1828 err_register_peer: 1829 free_netdev(peer); 1830 return err; 1831 } 1832 1833 static void veth_dellink(struct net_device *dev, struct list_head *head) 1834 { 1835 struct veth_priv *priv; 1836 struct net_device *peer; 1837 1838 priv = netdev_priv(dev); 1839 peer = rtnl_dereference(priv->peer); 1840 1841 /* Note : dellink() is called from default_device_exit_batch(), 1842 * before a rcu_synchronize() point. The devices are guaranteed 1843 * not being freed before one RCU grace period. 1844 */ 1845 RCU_INIT_POINTER(priv->peer, NULL); 1846 unregister_netdevice_queue(dev, head); 1847 1848 if (peer) { 1849 priv = netdev_priv(peer); 1850 RCU_INIT_POINTER(priv->peer, NULL); 1851 unregister_netdevice_queue(peer, head); 1852 } 1853 } 1854 1855 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1856 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1857 }; 1858 1859 static struct net *veth_get_link_net(const struct net_device *dev) 1860 { 1861 struct veth_priv *priv = netdev_priv(dev); 1862 struct net_device *peer = rtnl_dereference(priv->peer); 1863 1864 return peer ? dev_net(peer) : dev_net(dev); 1865 } 1866 1867 static unsigned int veth_get_num_queues(void) 1868 { 1869 /* enforce the same queue limit as rtnl_create_link */ 1870 int queues = num_possible_cpus(); 1871 1872 if (queues > 4096) 1873 queues = 4096; 1874 return queues; 1875 } 1876 1877 static struct rtnl_link_ops veth_link_ops = { 1878 .kind = DRV_NAME, 1879 .priv_size = sizeof(struct veth_priv), 1880 .setup = veth_setup, 1881 .validate = veth_validate, 1882 .newlink = veth_newlink, 1883 .dellink = veth_dellink, 1884 .policy = veth_policy, 1885 .maxtype = VETH_INFO_MAX, 1886 .get_link_net = veth_get_link_net, 1887 .get_num_tx_queues = veth_get_num_queues, 1888 .get_num_rx_queues = veth_get_num_queues, 1889 }; 1890 1891 /* 1892 * init/fini 1893 */ 1894 1895 static __init int veth_init(void) 1896 { 1897 return rtnl_link_register(&veth_link_ops); 1898 } 1899 1900 static __exit void veth_exit(void) 1901 { 1902 rtnl_link_unregister(&veth_link_ops); 1903 } 1904 1905 module_init(veth_init); 1906 module_exit(veth_exit); 1907 1908 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1909 MODULE_LICENSE("GPL v2"); 1910 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1911