1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/xfrm.h> 21 #include <net/xdp.h> 22 #include <linux/veth.h> 23 #include <linux/module.h> 24 #include <linux/bpf.h> 25 #include <linux/filter.h> 26 #include <linux/ptr_ring.h> 27 #include <linux/bpf_trace.h> 28 #include <linux/net_tstamp.h> 29 30 #define DRV_NAME "veth" 31 #define DRV_VERSION "1.0" 32 33 #define VETH_XDP_FLAG BIT(0) 34 #define VETH_RING_SIZE 256 35 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 36 37 #define VETH_XDP_TX_BULK_SIZE 16 38 #define VETH_XDP_BATCH 16 39 40 struct veth_stats { 41 u64 rx_drops; 42 /* xdp */ 43 u64 xdp_packets; 44 u64 xdp_bytes; 45 u64 xdp_redirect; 46 u64 xdp_drops; 47 u64 xdp_tx; 48 u64 xdp_tx_err; 49 u64 peer_tq_xdp_xmit; 50 u64 peer_tq_xdp_xmit_err; 51 }; 52 53 struct veth_rq_stats { 54 struct veth_stats vs; 55 struct u64_stats_sync syncp; 56 }; 57 58 struct veth_rq { 59 struct napi_struct xdp_napi; 60 struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ 61 struct net_device *dev; 62 struct bpf_prog __rcu *xdp_prog; 63 struct xdp_mem_info xdp_mem; 64 struct veth_rq_stats stats; 65 bool rx_notify_masked; 66 struct ptr_ring xdp_ring; 67 struct xdp_rxq_info xdp_rxq; 68 }; 69 70 struct veth_priv { 71 struct net_device __rcu *peer; 72 atomic64_t dropped; 73 struct bpf_prog *_xdp_prog; 74 struct veth_rq *rq; 75 unsigned int requested_headroom; 76 }; 77 78 struct veth_xdp_tx_bq { 79 struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; 80 unsigned int count; 81 }; 82 83 /* 84 * ethtool interface 85 */ 86 87 struct veth_q_stat_desc { 88 char desc[ETH_GSTRING_LEN]; 89 size_t offset; 90 }; 91 92 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 93 94 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 95 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 96 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 97 { "drops", VETH_RQ_STAT(rx_drops) }, 98 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 99 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 100 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 101 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 102 }; 103 104 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 105 106 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 107 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 108 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 109 }; 110 111 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 112 113 static struct { 114 const char string[ETH_GSTRING_LEN]; 115 } ethtool_stats_keys[] = { 116 { "peer_ifindex" }, 117 }; 118 119 static int veth_get_link_ksettings(struct net_device *dev, 120 struct ethtool_link_ksettings *cmd) 121 { 122 cmd->base.speed = SPEED_10000; 123 cmd->base.duplex = DUPLEX_FULL; 124 cmd->base.port = PORT_TP; 125 cmd->base.autoneg = AUTONEG_DISABLE; 126 return 0; 127 } 128 129 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 130 { 131 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 132 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 133 } 134 135 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 136 { 137 u8 *p = buf; 138 int i, j; 139 140 switch(stringset) { 141 case ETH_SS_STATS: 142 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 143 p += sizeof(ethtool_stats_keys); 144 for (i = 0; i < dev->real_num_rx_queues; i++) 145 for (j = 0; j < VETH_RQ_STATS_LEN; j++) 146 ethtool_sprintf(&p, "rx_queue_%u_%.18s", 147 i, veth_rq_stats_desc[j].desc); 148 149 for (i = 0; i < dev->real_num_tx_queues; i++) 150 for (j = 0; j < VETH_TQ_STATS_LEN; j++) 151 ethtool_sprintf(&p, "tx_queue_%u_%.18s", 152 i, veth_tq_stats_desc[j].desc); 153 break; 154 } 155 } 156 157 static int veth_get_sset_count(struct net_device *dev, int sset) 158 { 159 switch (sset) { 160 case ETH_SS_STATS: 161 return ARRAY_SIZE(ethtool_stats_keys) + 162 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 163 VETH_TQ_STATS_LEN * dev->real_num_tx_queues; 164 default: 165 return -EOPNOTSUPP; 166 } 167 } 168 169 static void veth_get_ethtool_stats(struct net_device *dev, 170 struct ethtool_stats *stats, u64 *data) 171 { 172 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 173 struct net_device *peer = rtnl_dereference(priv->peer); 174 int i, j, idx; 175 176 data[0] = peer ? peer->ifindex : 0; 177 idx = 1; 178 for (i = 0; i < dev->real_num_rx_queues; i++) { 179 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 180 const void *stats_base = (void *)&rq_stats->vs; 181 unsigned int start; 182 size_t offset; 183 184 do { 185 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 186 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 187 offset = veth_rq_stats_desc[j].offset; 188 data[idx + j] = *(u64 *)(stats_base + offset); 189 } 190 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 191 idx += VETH_RQ_STATS_LEN; 192 } 193 194 if (!peer) 195 return; 196 197 rcv_priv = netdev_priv(peer); 198 for (i = 0; i < peer->real_num_rx_queues; i++) { 199 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 200 const void *base = (void *)&rq_stats->vs; 201 unsigned int start, tx_idx = idx; 202 size_t offset; 203 204 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 205 do { 206 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 207 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 208 offset = veth_tq_stats_desc[j].offset; 209 data[tx_idx + j] += *(u64 *)(base + offset); 210 } 211 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 212 } 213 } 214 215 static void veth_get_channels(struct net_device *dev, 216 struct ethtool_channels *channels) 217 { 218 channels->tx_count = dev->real_num_tx_queues; 219 channels->rx_count = dev->real_num_rx_queues; 220 channels->max_tx = dev->num_tx_queues; 221 channels->max_rx = dev->num_rx_queues; 222 } 223 224 static int veth_set_channels(struct net_device *dev, 225 struct ethtool_channels *ch); 226 227 static const struct ethtool_ops veth_ethtool_ops = { 228 .get_drvinfo = veth_get_drvinfo, 229 .get_link = ethtool_op_get_link, 230 .get_strings = veth_get_strings, 231 .get_sset_count = veth_get_sset_count, 232 .get_ethtool_stats = veth_get_ethtool_stats, 233 .get_link_ksettings = veth_get_link_ksettings, 234 .get_ts_info = ethtool_op_get_ts_info, 235 .get_channels = veth_get_channels, 236 .set_channels = veth_set_channels, 237 }; 238 239 /* general routines */ 240 241 static bool veth_is_xdp_frame(void *ptr) 242 { 243 return (unsigned long)ptr & VETH_XDP_FLAG; 244 } 245 246 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 247 { 248 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 249 } 250 251 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 252 { 253 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 254 } 255 256 static void veth_ptr_free(void *ptr) 257 { 258 if (veth_is_xdp_frame(ptr)) 259 xdp_return_frame(veth_ptr_to_xdp(ptr)); 260 else 261 kfree_skb(ptr); 262 } 263 264 static void __veth_xdp_flush(struct veth_rq *rq) 265 { 266 /* Write ptr_ring before reading rx_notify_masked */ 267 smp_mb(); 268 if (!READ_ONCE(rq->rx_notify_masked) && 269 napi_schedule_prep(&rq->xdp_napi)) { 270 WRITE_ONCE(rq->rx_notify_masked, true); 271 __napi_schedule(&rq->xdp_napi); 272 } 273 } 274 275 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 276 { 277 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 278 dev_kfree_skb_any(skb); 279 return NET_RX_DROP; 280 } 281 282 return NET_RX_SUCCESS; 283 } 284 285 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 286 struct veth_rq *rq, bool xdp) 287 { 288 return __dev_forward_skb(dev, skb) ?: xdp ? 289 veth_xdp_rx(rq, skb) : 290 __netif_rx(skb); 291 } 292 293 /* return true if the specified skb has chances of GRO aggregation 294 * Don't strive for accuracy, but try to avoid GRO overhead in the most 295 * common scenarios. 296 * When XDP is enabled, all traffic is considered eligible, as the xmit 297 * device has TSO off. 298 * When TSO is enabled on the xmit device, we are likely interested only 299 * in UDP aggregation, explicitly check for that if the skb is suspected 300 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - 301 * to belong to locally generated UDP traffic. 302 */ 303 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, 304 const struct net_device *rcv, 305 const struct sk_buff *skb) 306 { 307 return !(dev->features & NETIF_F_ALL_TSO) || 308 (skb->destructor == sock_wfree && 309 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); 310 } 311 312 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 313 { 314 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 315 struct netdev_queue *queue = NULL; 316 struct veth_rq *rq = NULL; 317 struct net_device *rcv; 318 int length = skb->len; 319 bool use_napi = false; 320 int rxq; 321 322 rcu_read_lock(); 323 rcv = rcu_dereference(priv->peer); 324 if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { 325 kfree_skb(skb); 326 goto drop; 327 } 328 329 rcv_priv = netdev_priv(rcv); 330 rxq = skb_get_queue_mapping(skb); 331 if (rxq < rcv->real_num_rx_queues) { 332 rq = &rcv_priv->rq[rxq]; 333 queue = netdev_get_tx_queue(dev, rxq); 334 335 /* The napi pointer is available when an XDP program is 336 * attached or when GRO is enabled 337 * Don't bother with napi/GRO if the skb can't be aggregated 338 */ 339 use_napi = rcu_access_pointer(rq->napi) && 340 veth_skb_is_eligible_for_gro(dev, rcv, skb); 341 } 342 343 skb_tx_timestamp(skb); 344 if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { 345 if (queue) 346 txq_trans_cond_update(queue); 347 if (!use_napi) 348 dev_lstats_add(dev, length); 349 } else { 350 drop: 351 atomic64_inc(&priv->dropped); 352 } 353 354 if (use_napi) 355 __veth_xdp_flush(rq); 356 357 rcu_read_unlock(); 358 359 return NETDEV_TX_OK; 360 } 361 362 static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) 363 { 364 struct veth_priv *priv = netdev_priv(dev); 365 366 dev_lstats_read(dev, packets, bytes); 367 return atomic64_read(&priv->dropped); 368 } 369 370 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 371 { 372 struct veth_priv *priv = netdev_priv(dev); 373 int i; 374 375 result->peer_tq_xdp_xmit_err = 0; 376 result->xdp_packets = 0; 377 result->xdp_tx_err = 0; 378 result->xdp_bytes = 0; 379 result->rx_drops = 0; 380 for (i = 0; i < dev->num_rx_queues; i++) { 381 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 382 struct veth_rq_stats *stats = &priv->rq[i].stats; 383 unsigned int start; 384 385 do { 386 start = u64_stats_fetch_begin_irq(&stats->syncp); 387 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 388 xdp_tx_err = stats->vs.xdp_tx_err; 389 packets = stats->vs.xdp_packets; 390 bytes = stats->vs.xdp_bytes; 391 drops = stats->vs.rx_drops; 392 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 393 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 394 result->xdp_tx_err += xdp_tx_err; 395 result->xdp_packets += packets; 396 result->xdp_bytes += bytes; 397 result->rx_drops += drops; 398 } 399 } 400 401 static void veth_get_stats64(struct net_device *dev, 402 struct rtnl_link_stats64 *tot) 403 { 404 struct veth_priv *priv = netdev_priv(dev); 405 struct net_device *peer; 406 struct veth_stats rx; 407 u64 packets, bytes; 408 409 tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); 410 tot->tx_bytes = bytes; 411 tot->tx_packets = packets; 412 413 veth_stats_rx(&rx, dev); 414 tot->tx_dropped += rx.xdp_tx_err; 415 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 416 tot->rx_bytes = rx.xdp_bytes; 417 tot->rx_packets = rx.xdp_packets; 418 419 rcu_read_lock(); 420 peer = rcu_dereference(priv->peer); 421 if (peer) { 422 veth_stats_tx(peer, &packets, &bytes); 423 tot->rx_bytes += bytes; 424 tot->rx_packets += packets; 425 426 veth_stats_rx(&rx, peer); 427 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 428 tot->rx_dropped += rx.xdp_tx_err; 429 tot->tx_bytes += rx.xdp_bytes; 430 tot->tx_packets += rx.xdp_packets; 431 } 432 rcu_read_unlock(); 433 } 434 435 /* fake multicast ability */ 436 static void veth_set_multicast_list(struct net_device *dev) 437 { 438 } 439 440 static int veth_select_rxq(struct net_device *dev) 441 { 442 return smp_processor_id() % dev->real_num_rx_queues; 443 } 444 445 static struct net_device *veth_peer_dev(struct net_device *dev) 446 { 447 struct veth_priv *priv = netdev_priv(dev); 448 449 /* Callers must be under RCU read side. */ 450 return rcu_dereference(priv->peer); 451 } 452 453 static int veth_xdp_xmit(struct net_device *dev, int n, 454 struct xdp_frame **frames, 455 u32 flags, bool ndo_xmit) 456 { 457 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 458 int i, ret = -ENXIO, nxmit = 0; 459 struct net_device *rcv; 460 unsigned int max_len; 461 struct veth_rq *rq; 462 463 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 464 return -EINVAL; 465 466 rcu_read_lock(); 467 rcv = rcu_dereference(priv->peer); 468 if (unlikely(!rcv)) 469 goto out; 470 471 rcv_priv = netdev_priv(rcv); 472 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 473 /* The napi pointer is set if NAPI is enabled, which ensures that 474 * xdp_ring is initialized on receive side and the peer device is up. 475 */ 476 if (!rcu_access_pointer(rq->napi)) 477 goto out; 478 479 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 480 481 spin_lock(&rq->xdp_ring.producer_lock); 482 for (i = 0; i < n; i++) { 483 struct xdp_frame *frame = frames[i]; 484 void *ptr = veth_xdp_to_ptr(frame); 485 486 if (unlikely(xdp_get_frame_len(frame) > max_len || 487 __ptr_ring_produce(&rq->xdp_ring, ptr))) 488 break; 489 nxmit++; 490 } 491 spin_unlock(&rq->xdp_ring.producer_lock); 492 493 if (flags & XDP_XMIT_FLUSH) 494 __veth_xdp_flush(rq); 495 496 ret = nxmit; 497 if (ndo_xmit) { 498 u64_stats_update_begin(&rq->stats.syncp); 499 rq->stats.vs.peer_tq_xdp_xmit += nxmit; 500 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; 501 u64_stats_update_end(&rq->stats.syncp); 502 } 503 504 out: 505 rcu_read_unlock(); 506 507 return ret; 508 } 509 510 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 511 struct xdp_frame **frames, u32 flags) 512 { 513 int err; 514 515 err = veth_xdp_xmit(dev, n, frames, flags, true); 516 if (err < 0) { 517 struct veth_priv *priv = netdev_priv(dev); 518 519 atomic64_add(n, &priv->dropped); 520 } 521 522 return err; 523 } 524 525 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 526 { 527 int sent, i, err = 0, drops; 528 529 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 530 if (sent < 0) { 531 err = sent; 532 sent = 0; 533 } 534 535 for (i = sent; unlikely(i < bq->count); i++) 536 xdp_return_frame(bq->q[i]); 537 538 drops = bq->count - sent; 539 trace_xdp_bulk_tx(rq->dev, sent, drops, err); 540 541 u64_stats_update_begin(&rq->stats.syncp); 542 rq->stats.vs.xdp_tx += sent; 543 rq->stats.vs.xdp_tx_err += drops; 544 u64_stats_update_end(&rq->stats.syncp); 545 546 bq->count = 0; 547 } 548 549 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 550 { 551 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 552 struct net_device *rcv; 553 struct veth_rq *rcv_rq; 554 555 rcu_read_lock(); 556 veth_xdp_flush_bq(rq, bq); 557 rcv = rcu_dereference(priv->peer); 558 if (unlikely(!rcv)) 559 goto out; 560 561 rcv_priv = netdev_priv(rcv); 562 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 563 /* xdp_ring is initialized on receive side? */ 564 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 565 goto out; 566 567 __veth_xdp_flush(rcv_rq); 568 out: 569 rcu_read_unlock(); 570 } 571 572 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 573 struct veth_xdp_tx_bq *bq) 574 { 575 struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); 576 577 if (unlikely(!frame)) 578 return -EOVERFLOW; 579 580 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 581 veth_xdp_flush_bq(rq, bq); 582 583 bq->q[bq->count++] = frame; 584 585 return 0; 586 } 587 588 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, 589 struct xdp_frame *frame, 590 struct veth_xdp_tx_bq *bq, 591 struct veth_stats *stats) 592 { 593 struct xdp_frame orig_frame; 594 struct bpf_prog *xdp_prog; 595 596 rcu_read_lock(); 597 xdp_prog = rcu_dereference(rq->xdp_prog); 598 if (likely(xdp_prog)) { 599 struct xdp_buff xdp; 600 u32 act; 601 602 xdp_convert_frame_to_buff(frame, &xdp); 603 xdp.rxq = &rq->xdp_rxq; 604 605 act = bpf_prog_run_xdp(xdp_prog, &xdp); 606 607 switch (act) { 608 case XDP_PASS: 609 if (xdp_update_frame_from_buff(&xdp, frame)) 610 goto err_xdp; 611 break; 612 case XDP_TX: 613 orig_frame = *frame; 614 xdp.rxq->mem = frame->mem; 615 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 616 trace_xdp_exception(rq->dev, xdp_prog, act); 617 frame = &orig_frame; 618 stats->rx_drops++; 619 goto err_xdp; 620 } 621 stats->xdp_tx++; 622 rcu_read_unlock(); 623 goto xdp_xmit; 624 case XDP_REDIRECT: 625 orig_frame = *frame; 626 xdp.rxq->mem = frame->mem; 627 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 628 frame = &orig_frame; 629 stats->rx_drops++; 630 goto err_xdp; 631 } 632 stats->xdp_redirect++; 633 rcu_read_unlock(); 634 goto xdp_xmit; 635 default: 636 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 637 fallthrough; 638 case XDP_ABORTED: 639 trace_xdp_exception(rq->dev, xdp_prog, act); 640 fallthrough; 641 case XDP_DROP: 642 stats->xdp_drops++; 643 goto err_xdp; 644 } 645 } 646 rcu_read_unlock(); 647 648 return frame; 649 err_xdp: 650 rcu_read_unlock(); 651 xdp_return_frame(frame); 652 xdp_xmit: 653 return NULL; 654 } 655 656 /* frames array contains VETH_XDP_BATCH at most */ 657 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, 658 int n_xdpf, struct veth_xdp_tx_bq *bq, 659 struct veth_stats *stats) 660 { 661 void *skbs[VETH_XDP_BATCH]; 662 int i; 663 664 if (xdp_alloc_skb_bulk(skbs, n_xdpf, 665 GFP_ATOMIC | __GFP_ZERO) < 0) { 666 for (i = 0; i < n_xdpf; i++) 667 xdp_return_frame(frames[i]); 668 stats->rx_drops += n_xdpf; 669 670 return; 671 } 672 673 for (i = 0; i < n_xdpf; i++) { 674 struct sk_buff *skb = skbs[i]; 675 676 skb = __xdp_build_skb_from_frame(frames[i], skb, 677 rq->dev); 678 if (!skb) { 679 xdp_return_frame(frames[i]); 680 stats->rx_drops++; 681 continue; 682 } 683 napi_gro_receive(&rq->xdp_napi, skb); 684 } 685 } 686 687 static void veth_xdp_get(struct xdp_buff *xdp) 688 { 689 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); 690 int i; 691 692 get_page(virt_to_page(xdp->data)); 693 if (likely(!xdp_buff_has_frags(xdp))) 694 return; 695 696 for (i = 0; i < sinfo->nr_frags; i++) 697 __skb_frag_ref(&sinfo->frags[i]); 698 } 699 700 static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, 701 struct xdp_buff *xdp, 702 struct sk_buff **pskb) 703 { 704 struct sk_buff *skb = *pskb; 705 u32 frame_sz; 706 707 if (skb_shared(skb) || skb_head_is_locked(skb) || 708 skb_shinfo(skb)->nr_frags) { 709 u32 size, len, max_head_size, off; 710 struct sk_buff *nskb; 711 struct page *page; 712 int i, head_off; 713 714 /* We need a private copy of the skb and data buffers since 715 * the ebpf program can modify it. We segment the original skb 716 * into order-0 pages without linearize it. 717 * 718 * Make sure we have enough space for linear and paged area 719 */ 720 max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - 721 VETH_XDP_HEADROOM); 722 if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size) 723 goto drop; 724 725 /* Allocate skb head */ 726 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 727 if (!page) 728 goto drop; 729 730 nskb = build_skb(page_address(page), PAGE_SIZE); 731 if (!nskb) { 732 put_page(page); 733 goto drop; 734 } 735 736 skb_reserve(nskb, VETH_XDP_HEADROOM); 737 size = min_t(u32, skb->len, max_head_size); 738 if (skb_copy_bits(skb, 0, nskb->data, size)) { 739 consume_skb(nskb); 740 goto drop; 741 } 742 skb_put(nskb, size); 743 744 skb_copy_header(nskb, skb); 745 head_off = skb_headroom(nskb) - skb_headroom(skb); 746 skb_headers_offset_update(nskb, head_off); 747 748 /* Allocate paged area of new skb */ 749 off = size; 750 len = skb->len - off; 751 752 for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { 753 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 754 if (!page) { 755 consume_skb(nskb); 756 goto drop; 757 } 758 759 size = min_t(u32, len, PAGE_SIZE); 760 skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE); 761 if (skb_copy_bits(skb, off, page_address(page), 762 size)) { 763 consume_skb(nskb); 764 goto drop; 765 } 766 767 len -= size; 768 off += size; 769 } 770 771 consume_skb(skb); 772 skb = nskb; 773 } else if (skb_headroom(skb) < XDP_PACKET_HEADROOM && 774 pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) { 775 goto drop; 776 } 777 778 /* SKB "head" area always have tailroom for skb_shared_info */ 779 frame_sz = skb_end_pointer(skb) - skb->head; 780 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 781 xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); 782 xdp_prepare_buff(xdp, skb->head, skb_headroom(skb), 783 skb_headlen(skb), true); 784 785 if (skb_is_nonlinear(skb)) { 786 skb_shinfo(skb)->xdp_frags_size = skb->data_len; 787 xdp_buff_set_frags_flag(xdp); 788 } else { 789 xdp_buff_clear_frags_flag(xdp); 790 } 791 *pskb = skb; 792 793 return 0; 794 drop: 795 consume_skb(skb); 796 *pskb = NULL; 797 798 return -ENOMEM; 799 } 800 801 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 802 struct sk_buff *skb, 803 struct veth_xdp_tx_bq *bq, 804 struct veth_stats *stats) 805 { 806 void *orig_data, *orig_data_end; 807 struct bpf_prog *xdp_prog; 808 struct xdp_buff xdp; 809 u32 act, metalen; 810 int off; 811 812 skb_prepare_for_gro(skb); 813 814 rcu_read_lock(); 815 xdp_prog = rcu_dereference(rq->xdp_prog); 816 if (unlikely(!xdp_prog)) { 817 rcu_read_unlock(); 818 goto out; 819 } 820 821 __skb_push(skb, skb->data - skb_mac_header(skb)); 822 if (veth_convert_skb_to_xdp_buff(rq, &xdp, &skb)) 823 goto drop; 824 825 orig_data = xdp.data; 826 orig_data_end = xdp.data_end; 827 828 act = bpf_prog_run_xdp(xdp_prog, &xdp); 829 830 switch (act) { 831 case XDP_PASS: 832 break; 833 case XDP_TX: 834 veth_xdp_get(&xdp); 835 consume_skb(skb); 836 xdp.rxq->mem = rq->xdp_mem; 837 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 838 trace_xdp_exception(rq->dev, xdp_prog, act); 839 stats->rx_drops++; 840 goto err_xdp; 841 } 842 stats->xdp_tx++; 843 rcu_read_unlock(); 844 goto xdp_xmit; 845 case XDP_REDIRECT: 846 veth_xdp_get(&xdp); 847 consume_skb(skb); 848 xdp.rxq->mem = rq->xdp_mem; 849 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 850 stats->rx_drops++; 851 goto err_xdp; 852 } 853 stats->xdp_redirect++; 854 rcu_read_unlock(); 855 goto xdp_xmit; 856 default: 857 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 858 fallthrough; 859 case XDP_ABORTED: 860 trace_xdp_exception(rq->dev, xdp_prog, act); 861 fallthrough; 862 case XDP_DROP: 863 stats->xdp_drops++; 864 goto xdp_drop; 865 } 866 rcu_read_unlock(); 867 868 /* check if bpf_xdp_adjust_head was used */ 869 off = orig_data - xdp.data; 870 if (off > 0) 871 __skb_push(skb, off); 872 else if (off < 0) 873 __skb_pull(skb, -off); 874 875 skb_reset_mac_header(skb); 876 877 /* check if bpf_xdp_adjust_tail was used */ 878 off = xdp.data_end - orig_data_end; 879 if (off != 0) 880 __skb_put(skb, off); /* positive on grow, negative on shrink */ 881 882 /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers 883 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. 884 */ 885 if (xdp_buff_has_frags(&xdp)) 886 skb->data_len = skb_shinfo(skb)->xdp_frags_size; 887 else 888 skb->data_len = 0; 889 890 skb->protocol = eth_type_trans(skb, rq->dev); 891 892 metalen = xdp.data - xdp.data_meta; 893 if (metalen) 894 skb_metadata_set(skb, metalen); 895 out: 896 return skb; 897 drop: 898 stats->rx_drops++; 899 xdp_drop: 900 rcu_read_unlock(); 901 kfree_skb(skb); 902 return NULL; 903 err_xdp: 904 rcu_read_unlock(); 905 xdp_return_buff(&xdp); 906 xdp_xmit: 907 return NULL; 908 } 909 910 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 911 struct veth_xdp_tx_bq *bq, 912 struct veth_stats *stats) 913 { 914 int i, done = 0, n_xdpf = 0; 915 void *xdpf[VETH_XDP_BATCH]; 916 917 for (i = 0; i < budget; i++) { 918 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 919 920 if (!ptr) 921 break; 922 923 if (veth_is_xdp_frame(ptr)) { 924 /* ndo_xdp_xmit */ 925 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 926 927 stats->xdp_bytes += xdp_get_frame_len(frame); 928 frame = veth_xdp_rcv_one(rq, frame, bq, stats); 929 if (frame) { 930 /* XDP_PASS */ 931 xdpf[n_xdpf++] = frame; 932 if (n_xdpf == VETH_XDP_BATCH) { 933 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, 934 bq, stats); 935 n_xdpf = 0; 936 } 937 } 938 } else { 939 /* ndo_start_xmit */ 940 struct sk_buff *skb = ptr; 941 942 stats->xdp_bytes += skb->len; 943 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 944 if (skb) { 945 if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) 946 netif_receive_skb(skb); 947 else 948 napi_gro_receive(&rq->xdp_napi, skb); 949 } 950 } 951 done++; 952 } 953 954 if (n_xdpf) 955 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); 956 957 u64_stats_update_begin(&rq->stats.syncp); 958 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 959 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 960 rq->stats.vs.xdp_drops += stats->xdp_drops; 961 rq->stats.vs.rx_drops += stats->rx_drops; 962 rq->stats.vs.xdp_packets += done; 963 u64_stats_update_end(&rq->stats.syncp); 964 965 return done; 966 } 967 968 static int veth_poll(struct napi_struct *napi, int budget) 969 { 970 struct veth_rq *rq = 971 container_of(napi, struct veth_rq, xdp_napi); 972 struct veth_stats stats = {}; 973 struct veth_xdp_tx_bq bq; 974 int done; 975 976 bq.count = 0; 977 978 xdp_set_return_frame_no_direct(); 979 done = veth_xdp_rcv(rq, budget, &bq, &stats); 980 981 if (done < budget && napi_complete_done(napi, done)) { 982 /* Write rx_notify_masked before reading ptr_ring */ 983 smp_store_mb(rq->rx_notify_masked, false); 984 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 985 if (napi_schedule_prep(&rq->xdp_napi)) { 986 WRITE_ONCE(rq->rx_notify_masked, true); 987 __napi_schedule(&rq->xdp_napi); 988 } 989 } 990 } 991 992 if (stats.xdp_tx > 0) 993 veth_xdp_flush(rq, &bq); 994 if (stats.xdp_redirect > 0) 995 xdp_do_flush(); 996 xdp_clear_return_frame_no_direct(); 997 998 return done; 999 } 1000 1001 static int __veth_napi_enable_range(struct net_device *dev, int start, int end) 1002 { 1003 struct veth_priv *priv = netdev_priv(dev); 1004 int err, i; 1005 1006 for (i = start; i < end; i++) { 1007 struct veth_rq *rq = &priv->rq[i]; 1008 1009 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 1010 if (err) 1011 goto err_xdp_ring; 1012 } 1013 1014 for (i = start; i < end; i++) { 1015 struct veth_rq *rq = &priv->rq[i]; 1016 1017 napi_enable(&rq->xdp_napi); 1018 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1019 } 1020 1021 return 0; 1022 1023 err_xdp_ring: 1024 for (i--; i >= start; i--) 1025 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 1026 1027 return err; 1028 } 1029 1030 static int __veth_napi_enable(struct net_device *dev) 1031 { 1032 return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1033 } 1034 1035 static void veth_napi_del_range(struct net_device *dev, int start, int end) 1036 { 1037 struct veth_priv *priv = netdev_priv(dev); 1038 int i; 1039 1040 for (i = start; i < end; i++) { 1041 struct veth_rq *rq = &priv->rq[i]; 1042 1043 rcu_assign_pointer(priv->rq[i].napi, NULL); 1044 napi_disable(&rq->xdp_napi); 1045 __netif_napi_del(&rq->xdp_napi); 1046 } 1047 synchronize_net(); 1048 1049 for (i = start; i < end; i++) { 1050 struct veth_rq *rq = &priv->rq[i]; 1051 1052 rq->rx_notify_masked = false; 1053 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 1054 } 1055 } 1056 1057 static void veth_napi_del(struct net_device *dev) 1058 { 1059 veth_napi_del_range(dev, 0, dev->real_num_rx_queues); 1060 } 1061 1062 static bool veth_gro_requested(const struct net_device *dev) 1063 { 1064 return !!(dev->wanted_features & NETIF_F_GRO); 1065 } 1066 1067 static int veth_enable_xdp_range(struct net_device *dev, int start, int end, 1068 bool napi_already_on) 1069 { 1070 struct veth_priv *priv = netdev_priv(dev); 1071 int err, i; 1072 1073 for (i = start; i < end; i++) { 1074 struct veth_rq *rq = &priv->rq[i]; 1075 1076 if (!napi_already_on) 1077 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 1078 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 1079 if (err < 0) 1080 goto err_rxq_reg; 1081 1082 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 1083 MEM_TYPE_PAGE_SHARED, 1084 NULL); 1085 if (err < 0) 1086 goto err_reg_mem; 1087 1088 /* Save original mem info as it can be overwritten */ 1089 rq->xdp_mem = rq->xdp_rxq.mem; 1090 } 1091 return 0; 1092 1093 err_reg_mem: 1094 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 1095 err_rxq_reg: 1096 for (i--; i >= start; i--) { 1097 struct veth_rq *rq = &priv->rq[i]; 1098 1099 xdp_rxq_info_unreg(&rq->xdp_rxq); 1100 if (!napi_already_on) 1101 netif_napi_del(&rq->xdp_napi); 1102 } 1103 1104 return err; 1105 } 1106 1107 static void veth_disable_xdp_range(struct net_device *dev, int start, int end, 1108 bool delete_napi) 1109 { 1110 struct veth_priv *priv = netdev_priv(dev); 1111 int i; 1112 1113 for (i = start; i < end; i++) { 1114 struct veth_rq *rq = &priv->rq[i]; 1115 1116 rq->xdp_rxq.mem = rq->xdp_mem; 1117 xdp_rxq_info_unreg(&rq->xdp_rxq); 1118 1119 if (delete_napi) 1120 netif_napi_del(&rq->xdp_napi); 1121 } 1122 } 1123 1124 static int veth_enable_xdp(struct net_device *dev) 1125 { 1126 bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); 1127 struct veth_priv *priv = netdev_priv(dev); 1128 int err, i; 1129 1130 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 1131 err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); 1132 if (err) 1133 return err; 1134 1135 if (!napi_already_on) { 1136 err = __veth_napi_enable(dev); 1137 if (err) { 1138 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); 1139 return err; 1140 } 1141 1142 if (!veth_gro_requested(dev)) { 1143 /* user-space did not require GRO, but adding XDP 1144 * is supposed to get GRO working 1145 */ 1146 dev->features |= NETIF_F_GRO; 1147 netdev_features_change(dev); 1148 } 1149 } 1150 } 1151 1152 for (i = 0; i < dev->real_num_rx_queues; i++) { 1153 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 1154 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1155 } 1156 1157 return 0; 1158 } 1159 1160 static void veth_disable_xdp(struct net_device *dev) 1161 { 1162 struct veth_priv *priv = netdev_priv(dev); 1163 int i; 1164 1165 for (i = 0; i < dev->real_num_rx_queues; i++) 1166 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 1167 1168 if (!netif_running(dev) || !veth_gro_requested(dev)) { 1169 veth_napi_del(dev); 1170 1171 /* if user-space did not require GRO, since adding XDP 1172 * enabled it, clear it now 1173 */ 1174 if (!veth_gro_requested(dev) && netif_running(dev)) { 1175 dev->features &= ~NETIF_F_GRO; 1176 netdev_features_change(dev); 1177 } 1178 } 1179 1180 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); 1181 } 1182 1183 static int veth_napi_enable_range(struct net_device *dev, int start, int end) 1184 { 1185 struct veth_priv *priv = netdev_priv(dev); 1186 int err, i; 1187 1188 for (i = start; i < end; i++) { 1189 struct veth_rq *rq = &priv->rq[i]; 1190 1191 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 1192 } 1193 1194 err = __veth_napi_enable_range(dev, start, end); 1195 if (err) { 1196 for (i = start; i < end; i++) { 1197 struct veth_rq *rq = &priv->rq[i]; 1198 1199 netif_napi_del(&rq->xdp_napi); 1200 } 1201 return err; 1202 } 1203 return err; 1204 } 1205 1206 static int veth_napi_enable(struct net_device *dev) 1207 { 1208 return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1209 } 1210 1211 static void veth_disable_range_safe(struct net_device *dev, int start, int end) 1212 { 1213 struct veth_priv *priv = netdev_priv(dev); 1214 1215 if (start >= end) 1216 return; 1217 1218 if (priv->_xdp_prog) { 1219 veth_napi_del_range(dev, start, end); 1220 veth_disable_xdp_range(dev, start, end, false); 1221 } else if (veth_gro_requested(dev)) { 1222 veth_napi_del_range(dev, start, end); 1223 } 1224 } 1225 1226 static int veth_enable_range_safe(struct net_device *dev, int start, int end) 1227 { 1228 struct veth_priv *priv = netdev_priv(dev); 1229 int err; 1230 1231 if (start >= end) 1232 return 0; 1233 1234 if (priv->_xdp_prog) { 1235 /* these channels are freshly initialized, napi is not on there even 1236 * when GRO is requeste 1237 */ 1238 err = veth_enable_xdp_range(dev, start, end, false); 1239 if (err) 1240 return err; 1241 1242 err = __veth_napi_enable_range(dev, start, end); 1243 if (err) { 1244 /* on error always delete the newly added napis */ 1245 veth_disable_xdp_range(dev, start, end, true); 1246 return err; 1247 } 1248 } else if (veth_gro_requested(dev)) { 1249 return veth_napi_enable_range(dev, start, end); 1250 } 1251 return 0; 1252 } 1253 1254 static int veth_set_channels(struct net_device *dev, 1255 struct ethtool_channels *ch) 1256 { 1257 struct veth_priv *priv = netdev_priv(dev); 1258 unsigned int old_rx_count, new_rx_count; 1259 struct veth_priv *peer_priv; 1260 struct net_device *peer; 1261 int err; 1262 1263 /* sanity check. Upper bounds are already enforced by the caller */ 1264 if (!ch->rx_count || !ch->tx_count) 1265 return -EINVAL; 1266 1267 /* avoid braking XDP, if that is enabled */ 1268 peer = rtnl_dereference(priv->peer); 1269 peer_priv = peer ? netdev_priv(peer) : NULL; 1270 if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) 1271 return -EINVAL; 1272 1273 if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) 1274 return -EINVAL; 1275 1276 old_rx_count = dev->real_num_rx_queues; 1277 new_rx_count = ch->rx_count; 1278 if (netif_running(dev)) { 1279 /* turn device off */ 1280 netif_carrier_off(dev); 1281 if (peer) 1282 netif_carrier_off(peer); 1283 1284 /* try to allocate new resurces, as needed*/ 1285 err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); 1286 if (err) 1287 goto out; 1288 } 1289 1290 err = netif_set_real_num_rx_queues(dev, ch->rx_count); 1291 if (err) 1292 goto revert; 1293 1294 err = netif_set_real_num_tx_queues(dev, ch->tx_count); 1295 if (err) { 1296 int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); 1297 1298 /* this error condition could happen only if rx and tx change 1299 * in opposite directions (e.g. tx nr raises, rx nr decreases) 1300 * and we can't do anything to fully restore the original 1301 * status 1302 */ 1303 if (err2) 1304 pr_warn("Can't restore rx queues config %d -> %d %d", 1305 new_rx_count, old_rx_count, err2); 1306 else 1307 goto revert; 1308 } 1309 1310 out: 1311 if (netif_running(dev)) { 1312 /* note that we need to swap the arguments WRT the enable part 1313 * to identify the range we have to disable 1314 */ 1315 veth_disable_range_safe(dev, new_rx_count, old_rx_count); 1316 netif_carrier_on(dev); 1317 if (peer) 1318 netif_carrier_on(peer); 1319 } 1320 return err; 1321 1322 revert: 1323 new_rx_count = old_rx_count; 1324 old_rx_count = ch->rx_count; 1325 goto out; 1326 } 1327 1328 static int veth_open(struct net_device *dev) 1329 { 1330 struct veth_priv *priv = netdev_priv(dev); 1331 struct net_device *peer = rtnl_dereference(priv->peer); 1332 int err; 1333 1334 if (!peer) 1335 return -ENOTCONN; 1336 1337 if (priv->_xdp_prog) { 1338 err = veth_enable_xdp(dev); 1339 if (err) 1340 return err; 1341 } else if (veth_gro_requested(dev)) { 1342 err = veth_napi_enable(dev); 1343 if (err) 1344 return err; 1345 } 1346 1347 if (peer->flags & IFF_UP) { 1348 netif_carrier_on(dev); 1349 netif_carrier_on(peer); 1350 } 1351 1352 return 0; 1353 } 1354 1355 static int veth_close(struct net_device *dev) 1356 { 1357 struct veth_priv *priv = netdev_priv(dev); 1358 struct net_device *peer = rtnl_dereference(priv->peer); 1359 1360 netif_carrier_off(dev); 1361 if (peer) 1362 netif_carrier_off(peer); 1363 1364 if (priv->_xdp_prog) 1365 veth_disable_xdp(dev); 1366 else if (veth_gro_requested(dev)) 1367 veth_napi_del(dev); 1368 1369 return 0; 1370 } 1371 1372 static int is_valid_veth_mtu(int mtu) 1373 { 1374 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1375 } 1376 1377 static int veth_alloc_queues(struct net_device *dev) 1378 { 1379 struct veth_priv *priv = netdev_priv(dev); 1380 int i; 1381 1382 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT); 1383 if (!priv->rq) 1384 return -ENOMEM; 1385 1386 for (i = 0; i < dev->num_rx_queues; i++) { 1387 priv->rq[i].dev = dev; 1388 u64_stats_init(&priv->rq[i].stats.syncp); 1389 } 1390 1391 return 0; 1392 } 1393 1394 static void veth_free_queues(struct net_device *dev) 1395 { 1396 struct veth_priv *priv = netdev_priv(dev); 1397 1398 kfree(priv->rq); 1399 } 1400 1401 static int veth_dev_init(struct net_device *dev) 1402 { 1403 int err; 1404 1405 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 1406 if (!dev->lstats) 1407 return -ENOMEM; 1408 1409 err = veth_alloc_queues(dev); 1410 if (err) { 1411 free_percpu(dev->lstats); 1412 return err; 1413 } 1414 1415 return 0; 1416 } 1417 1418 static void veth_dev_free(struct net_device *dev) 1419 { 1420 veth_free_queues(dev); 1421 free_percpu(dev->lstats); 1422 } 1423 1424 #ifdef CONFIG_NET_POLL_CONTROLLER 1425 static void veth_poll_controller(struct net_device *dev) 1426 { 1427 /* veth only receives frames when its peer sends one 1428 * Since it has nothing to do with disabling irqs, we are guaranteed 1429 * never to have pending data when we poll for it so 1430 * there is nothing to do here. 1431 * 1432 * We need this though so netpoll recognizes us as an interface that 1433 * supports polling, which enables bridge devices in virt setups to 1434 * still use netconsole 1435 */ 1436 } 1437 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1438 1439 static int veth_get_iflink(const struct net_device *dev) 1440 { 1441 struct veth_priv *priv = netdev_priv(dev); 1442 struct net_device *peer; 1443 int iflink; 1444 1445 rcu_read_lock(); 1446 peer = rcu_dereference(priv->peer); 1447 iflink = peer ? peer->ifindex : 0; 1448 rcu_read_unlock(); 1449 1450 return iflink; 1451 } 1452 1453 static netdev_features_t veth_fix_features(struct net_device *dev, 1454 netdev_features_t features) 1455 { 1456 struct veth_priv *priv = netdev_priv(dev); 1457 struct net_device *peer; 1458 1459 peer = rtnl_dereference(priv->peer); 1460 if (peer) { 1461 struct veth_priv *peer_priv = netdev_priv(peer); 1462 1463 if (peer_priv->_xdp_prog) 1464 features &= ~NETIF_F_GSO_SOFTWARE; 1465 } 1466 if (priv->_xdp_prog) 1467 features |= NETIF_F_GRO; 1468 1469 return features; 1470 } 1471 1472 static int veth_set_features(struct net_device *dev, 1473 netdev_features_t features) 1474 { 1475 netdev_features_t changed = features ^ dev->features; 1476 struct veth_priv *priv = netdev_priv(dev); 1477 int err; 1478 1479 if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) 1480 return 0; 1481 1482 if (features & NETIF_F_GRO) { 1483 err = veth_napi_enable(dev); 1484 if (err) 1485 return err; 1486 } else { 1487 veth_napi_del(dev); 1488 } 1489 return 0; 1490 } 1491 1492 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1493 { 1494 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1495 struct net_device *peer; 1496 1497 if (new_hr < 0) 1498 new_hr = 0; 1499 1500 rcu_read_lock(); 1501 peer = rcu_dereference(priv->peer); 1502 if (unlikely(!peer)) 1503 goto out; 1504 1505 peer_priv = netdev_priv(peer); 1506 priv->requested_headroom = new_hr; 1507 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1508 dev->needed_headroom = new_hr; 1509 peer->needed_headroom = new_hr; 1510 1511 out: 1512 rcu_read_unlock(); 1513 } 1514 1515 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1516 struct netlink_ext_ack *extack) 1517 { 1518 struct veth_priv *priv = netdev_priv(dev); 1519 struct bpf_prog *old_prog; 1520 struct net_device *peer; 1521 unsigned int max_mtu; 1522 int err; 1523 1524 old_prog = priv->_xdp_prog; 1525 priv->_xdp_prog = prog; 1526 peer = rtnl_dereference(priv->peer); 1527 1528 if (prog) { 1529 if (!peer) { 1530 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1531 err = -ENOTCONN; 1532 goto err; 1533 } 1534 1535 max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) - 1536 peer->hard_header_len; 1537 /* Allow increasing the max_mtu if the program supports 1538 * XDP fragments. 1539 */ 1540 if (prog->aux->xdp_has_frags) 1541 max_mtu += PAGE_SIZE * MAX_SKB_FRAGS; 1542 1543 if (peer->mtu > max_mtu) { 1544 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1545 err = -ERANGE; 1546 goto err; 1547 } 1548 1549 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1550 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1551 err = -ENOSPC; 1552 goto err; 1553 } 1554 1555 if (dev->flags & IFF_UP) { 1556 err = veth_enable_xdp(dev); 1557 if (err) { 1558 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1559 goto err; 1560 } 1561 } 1562 1563 if (!old_prog) { 1564 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1565 peer->max_mtu = max_mtu; 1566 } 1567 } 1568 1569 if (old_prog) { 1570 if (!prog) { 1571 if (dev->flags & IFF_UP) 1572 veth_disable_xdp(dev); 1573 1574 if (peer) { 1575 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1576 peer->max_mtu = ETH_MAX_MTU; 1577 } 1578 } 1579 bpf_prog_put(old_prog); 1580 } 1581 1582 if ((!!old_prog ^ !!prog) && peer) 1583 netdev_update_features(peer); 1584 1585 return 0; 1586 err: 1587 priv->_xdp_prog = old_prog; 1588 1589 return err; 1590 } 1591 1592 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1593 { 1594 switch (xdp->command) { 1595 case XDP_SETUP_PROG: 1596 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1597 default: 1598 return -EINVAL; 1599 } 1600 } 1601 1602 static const struct net_device_ops veth_netdev_ops = { 1603 .ndo_init = veth_dev_init, 1604 .ndo_open = veth_open, 1605 .ndo_stop = veth_close, 1606 .ndo_start_xmit = veth_xmit, 1607 .ndo_get_stats64 = veth_get_stats64, 1608 .ndo_set_rx_mode = veth_set_multicast_list, 1609 .ndo_set_mac_address = eth_mac_addr, 1610 #ifdef CONFIG_NET_POLL_CONTROLLER 1611 .ndo_poll_controller = veth_poll_controller, 1612 #endif 1613 .ndo_get_iflink = veth_get_iflink, 1614 .ndo_fix_features = veth_fix_features, 1615 .ndo_set_features = veth_set_features, 1616 .ndo_features_check = passthru_features_check, 1617 .ndo_set_rx_headroom = veth_set_rx_headroom, 1618 .ndo_bpf = veth_xdp, 1619 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1620 .ndo_get_peer_dev = veth_peer_dev, 1621 }; 1622 1623 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1624 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1625 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1626 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1627 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1628 1629 static void veth_setup(struct net_device *dev) 1630 { 1631 ether_setup(dev); 1632 1633 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1634 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1635 dev->priv_flags |= IFF_NO_QUEUE; 1636 dev->priv_flags |= IFF_PHONY_HEADROOM; 1637 1638 dev->netdev_ops = &veth_netdev_ops; 1639 dev->ethtool_ops = &veth_ethtool_ops; 1640 dev->features |= NETIF_F_LLTX; 1641 dev->features |= VETH_FEATURES; 1642 dev->vlan_features = dev->features & 1643 ~(NETIF_F_HW_VLAN_CTAG_TX | 1644 NETIF_F_HW_VLAN_STAG_TX | 1645 NETIF_F_HW_VLAN_CTAG_RX | 1646 NETIF_F_HW_VLAN_STAG_RX); 1647 dev->needs_free_netdev = true; 1648 dev->priv_destructor = veth_dev_free; 1649 dev->max_mtu = ETH_MAX_MTU; 1650 1651 dev->hw_features = VETH_FEATURES; 1652 dev->hw_enc_features = VETH_FEATURES; 1653 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1654 netif_set_tso_max_size(dev, GSO_MAX_SIZE); 1655 } 1656 1657 /* 1658 * netlink interface 1659 */ 1660 1661 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1662 struct netlink_ext_ack *extack) 1663 { 1664 if (tb[IFLA_ADDRESS]) { 1665 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1666 return -EINVAL; 1667 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1668 return -EADDRNOTAVAIL; 1669 } 1670 if (tb[IFLA_MTU]) { 1671 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1672 return -EINVAL; 1673 } 1674 return 0; 1675 } 1676 1677 static struct rtnl_link_ops veth_link_ops; 1678 1679 static void veth_disable_gro(struct net_device *dev) 1680 { 1681 dev->features &= ~NETIF_F_GRO; 1682 dev->wanted_features &= ~NETIF_F_GRO; 1683 netdev_update_features(dev); 1684 } 1685 1686 static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) 1687 { 1688 int err; 1689 1690 if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { 1691 err = netif_set_real_num_tx_queues(dev, 1); 1692 if (err) 1693 return err; 1694 } 1695 if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { 1696 err = netif_set_real_num_rx_queues(dev, 1); 1697 if (err) 1698 return err; 1699 } 1700 return 0; 1701 } 1702 1703 static int veth_newlink(struct net *src_net, struct net_device *dev, 1704 struct nlattr *tb[], struct nlattr *data[], 1705 struct netlink_ext_ack *extack) 1706 { 1707 int err; 1708 struct net_device *peer; 1709 struct veth_priv *priv; 1710 char ifname[IFNAMSIZ]; 1711 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1712 unsigned char name_assign_type; 1713 struct ifinfomsg *ifmp; 1714 struct net *net; 1715 1716 /* 1717 * create and register peer first 1718 */ 1719 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1720 struct nlattr *nla_peer; 1721 1722 nla_peer = data[VETH_INFO_PEER]; 1723 ifmp = nla_data(nla_peer); 1724 err = rtnl_nla_parse_ifla(peer_tb, 1725 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1726 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1727 NULL); 1728 if (err < 0) 1729 return err; 1730 1731 err = veth_validate(peer_tb, NULL, extack); 1732 if (err < 0) 1733 return err; 1734 1735 tbp = peer_tb; 1736 } else { 1737 ifmp = NULL; 1738 tbp = tb; 1739 } 1740 1741 if (ifmp && tbp[IFLA_IFNAME]) { 1742 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1743 name_assign_type = NET_NAME_USER; 1744 } else { 1745 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1746 name_assign_type = NET_NAME_ENUM; 1747 } 1748 1749 net = rtnl_link_get_net(src_net, tbp); 1750 if (IS_ERR(net)) 1751 return PTR_ERR(net); 1752 1753 peer = rtnl_create_link(net, ifname, name_assign_type, 1754 &veth_link_ops, tbp, extack); 1755 if (IS_ERR(peer)) { 1756 put_net(net); 1757 return PTR_ERR(peer); 1758 } 1759 1760 if (!ifmp || !tbp[IFLA_ADDRESS]) 1761 eth_hw_addr_random(peer); 1762 1763 if (ifmp && (dev->ifindex != 0)) 1764 peer->ifindex = ifmp->ifi_index; 1765 1766 netif_inherit_tso_max(peer, dev); 1767 1768 err = register_netdevice(peer); 1769 put_net(net); 1770 net = NULL; 1771 if (err < 0) 1772 goto err_register_peer; 1773 1774 /* keep GRO disabled by default to be consistent with the established 1775 * veth behavior 1776 */ 1777 veth_disable_gro(peer); 1778 netif_carrier_off(peer); 1779 1780 err = rtnl_configure_link(peer, ifmp); 1781 if (err < 0) 1782 goto err_configure_peer; 1783 1784 /* 1785 * register dev last 1786 * 1787 * note, that since we've registered new device the dev's name 1788 * should be re-allocated 1789 */ 1790 1791 if (tb[IFLA_ADDRESS] == NULL) 1792 eth_hw_addr_random(dev); 1793 1794 if (tb[IFLA_IFNAME]) 1795 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1796 else 1797 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1798 1799 err = register_netdevice(dev); 1800 if (err < 0) 1801 goto err_register_dev; 1802 1803 netif_carrier_off(dev); 1804 1805 /* 1806 * tie the deviced together 1807 */ 1808 1809 priv = netdev_priv(dev); 1810 rcu_assign_pointer(priv->peer, peer); 1811 err = veth_init_queues(dev, tb); 1812 if (err) 1813 goto err_queues; 1814 1815 priv = netdev_priv(peer); 1816 rcu_assign_pointer(priv->peer, dev); 1817 err = veth_init_queues(peer, tb); 1818 if (err) 1819 goto err_queues; 1820 1821 veth_disable_gro(dev); 1822 return 0; 1823 1824 err_queues: 1825 unregister_netdevice(dev); 1826 err_register_dev: 1827 /* nothing to do */ 1828 err_configure_peer: 1829 unregister_netdevice(peer); 1830 return err; 1831 1832 err_register_peer: 1833 free_netdev(peer); 1834 return err; 1835 } 1836 1837 static void veth_dellink(struct net_device *dev, struct list_head *head) 1838 { 1839 struct veth_priv *priv; 1840 struct net_device *peer; 1841 1842 priv = netdev_priv(dev); 1843 peer = rtnl_dereference(priv->peer); 1844 1845 /* Note : dellink() is called from default_device_exit_batch(), 1846 * before a rcu_synchronize() point. The devices are guaranteed 1847 * not being freed before one RCU grace period. 1848 */ 1849 RCU_INIT_POINTER(priv->peer, NULL); 1850 unregister_netdevice_queue(dev, head); 1851 1852 if (peer) { 1853 priv = netdev_priv(peer); 1854 RCU_INIT_POINTER(priv->peer, NULL); 1855 unregister_netdevice_queue(peer, head); 1856 } 1857 } 1858 1859 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1860 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1861 }; 1862 1863 static struct net *veth_get_link_net(const struct net_device *dev) 1864 { 1865 struct veth_priv *priv = netdev_priv(dev); 1866 struct net_device *peer = rtnl_dereference(priv->peer); 1867 1868 return peer ? dev_net(peer) : dev_net(dev); 1869 } 1870 1871 static unsigned int veth_get_num_queues(void) 1872 { 1873 /* enforce the same queue limit as rtnl_create_link */ 1874 int queues = num_possible_cpus(); 1875 1876 if (queues > 4096) 1877 queues = 4096; 1878 return queues; 1879 } 1880 1881 static struct rtnl_link_ops veth_link_ops = { 1882 .kind = DRV_NAME, 1883 .priv_size = sizeof(struct veth_priv), 1884 .setup = veth_setup, 1885 .validate = veth_validate, 1886 .newlink = veth_newlink, 1887 .dellink = veth_dellink, 1888 .policy = veth_policy, 1889 .maxtype = VETH_INFO_MAX, 1890 .get_link_net = veth_get_link_net, 1891 .get_num_tx_queues = veth_get_num_queues, 1892 .get_num_rx_queues = veth_get_num_queues, 1893 }; 1894 1895 /* 1896 * init/fini 1897 */ 1898 1899 static __init int veth_init(void) 1900 { 1901 return rtnl_link_register(&veth_link_ops); 1902 } 1903 1904 static __exit void veth_exit(void) 1905 { 1906 rtnl_link_unregister(&veth_link_ops); 1907 } 1908 1909 module_init(veth_init); 1910 module_exit(veth_exit); 1911 1912 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1913 MODULE_LICENSE("GPL v2"); 1914 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1915