1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/xfrm.h> 21 #include <net/xdp.h> 22 #include <linux/veth.h> 23 #include <linux/module.h> 24 #include <linux/bpf.h> 25 #include <linux/filter.h> 26 #include <linux/ptr_ring.h> 27 #include <linux/bpf_trace.h> 28 #include <linux/net_tstamp.h> 29 #include <linux/skbuff_ref.h> 30 #include <net/page_pool/helpers.h> 31 32 #define DRV_NAME "veth" 33 #define DRV_VERSION "1.0" 34 35 #define VETH_XDP_FLAG BIT(0) 36 #define VETH_RING_SIZE 256 37 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 38 39 #define VETH_XDP_TX_BULK_SIZE 16 40 #define VETH_XDP_BATCH 16 41 42 struct veth_stats { 43 u64 rx_drops; 44 /* xdp */ 45 u64 xdp_packets; 46 u64 xdp_bytes; 47 u64 xdp_redirect; 48 u64 xdp_drops; 49 u64 xdp_tx; 50 u64 xdp_tx_err; 51 u64 peer_tq_xdp_xmit; 52 u64 peer_tq_xdp_xmit_err; 53 }; 54 55 struct veth_rq_stats { 56 struct veth_stats vs; 57 struct u64_stats_sync syncp; 58 }; 59 60 struct veth_rq { 61 struct napi_struct xdp_napi; 62 struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ 63 struct net_device *dev; 64 struct bpf_prog __rcu *xdp_prog; 65 struct xdp_mem_info xdp_mem; 66 struct veth_rq_stats stats; 67 bool rx_notify_masked; 68 struct ptr_ring xdp_ring; 69 struct xdp_rxq_info xdp_rxq; 70 struct page_pool *page_pool; 71 }; 72 73 struct veth_priv { 74 struct net_device __rcu *peer; 75 atomic64_t dropped; 76 struct bpf_prog *_xdp_prog; 77 struct veth_rq *rq; 78 unsigned int requested_headroom; 79 }; 80 81 struct veth_xdp_tx_bq { 82 struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; 83 unsigned int count; 84 }; 85 86 /* 87 * ethtool interface 88 */ 89 90 struct veth_q_stat_desc { 91 char desc[ETH_GSTRING_LEN]; 92 size_t offset; 93 }; 94 95 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 96 97 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 98 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 99 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 100 { "drops", VETH_RQ_STAT(rx_drops) }, 101 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 102 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 103 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 104 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 105 }; 106 107 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 108 109 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 110 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 111 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 112 }; 113 114 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 115 116 static struct { 117 const char string[ETH_GSTRING_LEN]; 118 } ethtool_stats_keys[] = { 119 { "peer_ifindex" }, 120 }; 121 122 struct veth_xdp_buff { 123 struct xdp_buff xdp; 124 struct sk_buff *skb; 125 }; 126 127 static int veth_get_link_ksettings(struct net_device *dev, 128 struct ethtool_link_ksettings *cmd) 129 { 130 cmd->base.speed = SPEED_10000; 131 cmd->base.duplex = DUPLEX_FULL; 132 cmd->base.port = PORT_TP; 133 cmd->base.autoneg = AUTONEG_DISABLE; 134 return 0; 135 } 136 137 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 138 { 139 strscpy(info->driver, DRV_NAME, sizeof(info->driver)); 140 strscpy(info->version, DRV_VERSION, sizeof(info->version)); 141 } 142 143 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 144 { 145 u8 *p = buf; 146 int i, j; 147 148 switch(stringset) { 149 case ETH_SS_STATS: 150 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 151 p += sizeof(ethtool_stats_keys); 152 for (i = 0; i < dev->real_num_rx_queues; i++) 153 for (j = 0; j < VETH_RQ_STATS_LEN; j++) 154 ethtool_sprintf(&p, "rx_queue_%u_%.18s", 155 i, veth_rq_stats_desc[j].desc); 156 157 for (i = 0; i < dev->real_num_tx_queues; i++) 158 for (j = 0; j < VETH_TQ_STATS_LEN; j++) 159 ethtool_sprintf(&p, "tx_queue_%u_%.18s", 160 i, veth_tq_stats_desc[j].desc); 161 162 page_pool_ethtool_stats_get_strings(p); 163 break; 164 } 165 } 166 167 static int veth_get_sset_count(struct net_device *dev, int sset) 168 { 169 switch (sset) { 170 case ETH_SS_STATS: 171 return ARRAY_SIZE(ethtool_stats_keys) + 172 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 173 VETH_TQ_STATS_LEN * dev->real_num_tx_queues + 174 page_pool_ethtool_stats_get_count(); 175 default: 176 return -EOPNOTSUPP; 177 } 178 } 179 180 static void veth_get_page_pool_stats(struct net_device *dev, u64 *data) 181 { 182 #ifdef CONFIG_PAGE_POOL_STATS 183 struct veth_priv *priv = netdev_priv(dev); 184 struct page_pool_stats pp_stats = {}; 185 int i; 186 187 for (i = 0; i < dev->real_num_rx_queues; i++) { 188 if (!priv->rq[i].page_pool) 189 continue; 190 page_pool_get_stats(priv->rq[i].page_pool, &pp_stats); 191 } 192 page_pool_ethtool_stats_get(data, &pp_stats); 193 #endif /* CONFIG_PAGE_POOL_STATS */ 194 } 195 196 static void veth_get_ethtool_stats(struct net_device *dev, 197 struct ethtool_stats *stats, u64 *data) 198 { 199 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 200 struct net_device *peer = rtnl_dereference(priv->peer); 201 int i, j, idx, pp_idx; 202 203 data[0] = peer ? peer->ifindex : 0; 204 idx = 1; 205 for (i = 0; i < dev->real_num_rx_queues; i++) { 206 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 207 const void *stats_base = (void *)&rq_stats->vs; 208 unsigned int start; 209 size_t offset; 210 211 do { 212 start = u64_stats_fetch_begin(&rq_stats->syncp); 213 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 214 offset = veth_rq_stats_desc[j].offset; 215 data[idx + j] = *(u64 *)(stats_base + offset); 216 } 217 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 218 idx += VETH_RQ_STATS_LEN; 219 } 220 pp_idx = idx; 221 222 if (!peer) 223 goto page_pool_stats; 224 225 rcv_priv = netdev_priv(peer); 226 for (i = 0; i < peer->real_num_rx_queues; i++) { 227 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 228 const void *base = (void *)&rq_stats->vs; 229 unsigned int start, tx_idx = idx; 230 size_t offset; 231 232 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 233 do { 234 start = u64_stats_fetch_begin(&rq_stats->syncp); 235 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 236 offset = veth_tq_stats_desc[j].offset; 237 data[tx_idx + j] += *(u64 *)(base + offset); 238 } 239 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 240 } 241 pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN; 242 243 page_pool_stats: 244 veth_get_page_pool_stats(dev, &data[pp_idx]); 245 } 246 247 static void veth_get_channels(struct net_device *dev, 248 struct ethtool_channels *channels) 249 { 250 channels->tx_count = dev->real_num_tx_queues; 251 channels->rx_count = dev->real_num_rx_queues; 252 channels->max_tx = dev->num_tx_queues; 253 channels->max_rx = dev->num_rx_queues; 254 } 255 256 static int veth_set_channels(struct net_device *dev, 257 struct ethtool_channels *ch); 258 259 static const struct ethtool_ops veth_ethtool_ops = { 260 .get_drvinfo = veth_get_drvinfo, 261 .get_link = ethtool_op_get_link, 262 .get_strings = veth_get_strings, 263 .get_sset_count = veth_get_sset_count, 264 .get_ethtool_stats = veth_get_ethtool_stats, 265 .get_link_ksettings = veth_get_link_ksettings, 266 .get_ts_info = ethtool_op_get_ts_info, 267 .get_channels = veth_get_channels, 268 .set_channels = veth_set_channels, 269 }; 270 271 /* general routines */ 272 273 static bool veth_is_xdp_frame(void *ptr) 274 { 275 return (unsigned long)ptr & VETH_XDP_FLAG; 276 } 277 278 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 279 { 280 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 281 } 282 283 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 284 { 285 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 286 } 287 288 static void veth_ptr_free(void *ptr) 289 { 290 if (veth_is_xdp_frame(ptr)) 291 xdp_return_frame(veth_ptr_to_xdp(ptr)); 292 else 293 kfree_skb(ptr); 294 } 295 296 static void __veth_xdp_flush(struct veth_rq *rq) 297 { 298 /* Write ptr_ring before reading rx_notify_masked */ 299 smp_mb(); 300 if (!READ_ONCE(rq->rx_notify_masked) && 301 napi_schedule_prep(&rq->xdp_napi)) { 302 WRITE_ONCE(rq->rx_notify_masked, true); 303 __napi_schedule(&rq->xdp_napi); 304 } 305 } 306 307 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 308 { 309 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 310 dev_kfree_skb_any(skb); 311 return NET_RX_DROP; 312 } 313 314 return NET_RX_SUCCESS; 315 } 316 317 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 318 struct veth_rq *rq, bool xdp) 319 { 320 return __dev_forward_skb(dev, skb) ?: xdp ? 321 veth_xdp_rx(rq, skb) : 322 __netif_rx(skb); 323 } 324 325 /* return true if the specified skb has chances of GRO aggregation 326 * Don't strive for accuracy, but try to avoid GRO overhead in the most 327 * common scenarios. 328 * When XDP is enabled, all traffic is considered eligible, as the xmit 329 * device has TSO off. 330 * When TSO is enabled on the xmit device, we are likely interested only 331 * in UDP aggregation, explicitly check for that if the skb is suspected 332 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - 333 * to belong to locally generated UDP traffic. 334 */ 335 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, 336 const struct net_device *rcv, 337 const struct sk_buff *skb) 338 { 339 return !(dev->features & NETIF_F_ALL_TSO) || 340 (skb->destructor == sock_wfree && 341 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); 342 } 343 344 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 345 { 346 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 347 struct veth_rq *rq = NULL; 348 int ret = NETDEV_TX_OK; 349 struct net_device *rcv; 350 int length = skb->len; 351 bool use_napi = false; 352 int rxq; 353 354 rcu_read_lock(); 355 rcv = rcu_dereference(priv->peer); 356 if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { 357 kfree_skb(skb); 358 goto drop; 359 } 360 361 rcv_priv = netdev_priv(rcv); 362 rxq = skb_get_queue_mapping(skb); 363 if (rxq < rcv->real_num_rx_queues) { 364 rq = &rcv_priv->rq[rxq]; 365 366 /* The napi pointer is available when an XDP program is 367 * attached or when GRO is enabled 368 * Don't bother with napi/GRO if the skb can't be aggregated 369 */ 370 use_napi = rcu_access_pointer(rq->napi) && 371 veth_skb_is_eligible_for_gro(dev, rcv, skb); 372 } 373 374 skb_tx_timestamp(skb); 375 if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { 376 if (!use_napi) 377 dev_sw_netstats_tx_add(dev, 1, length); 378 else 379 __veth_xdp_flush(rq); 380 } else { 381 drop: 382 atomic64_inc(&priv->dropped); 383 ret = NET_XMIT_DROP; 384 } 385 386 rcu_read_unlock(); 387 388 return ret; 389 } 390 391 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 392 { 393 struct veth_priv *priv = netdev_priv(dev); 394 int i; 395 396 result->peer_tq_xdp_xmit_err = 0; 397 result->xdp_packets = 0; 398 result->xdp_tx_err = 0; 399 result->xdp_bytes = 0; 400 result->rx_drops = 0; 401 for (i = 0; i < dev->num_rx_queues; i++) { 402 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 403 struct veth_rq_stats *stats = &priv->rq[i].stats; 404 unsigned int start; 405 406 do { 407 start = u64_stats_fetch_begin(&stats->syncp); 408 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 409 xdp_tx_err = stats->vs.xdp_tx_err; 410 packets = stats->vs.xdp_packets; 411 bytes = stats->vs.xdp_bytes; 412 drops = stats->vs.rx_drops; 413 } while (u64_stats_fetch_retry(&stats->syncp, start)); 414 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 415 result->xdp_tx_err += xdp_tx_err; 416 result->xdp_packets += packets; 417 result->xdp_bytes += bytes; 418 result->rx_drops += drops; 419 } 420 } 421 422 static void veth_get_stats64(struct net_device *dev, 423 struct rtnl_link_stats64 *tot) 424 { 425 struct veth_priv *priv = netdev_priv(dev); 426 struct net_device *peer; 427 struct veth_stats rx; 428 429 tot->tx_dropped = atomic64_read(&priv->dropped); 430 dev_fetch_sw_netstats(tot, dev->tstats); 431 432 veth_stats_rx(&rx, dev); 433 tot->tx_dropped += rx.xdp_tx_err; 434 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 435 tot->rx_bytes += rx.xdp_bytes; 436 tot->rx_packets += rx.xdp_packets; 437 438 rcu_read_lock(); 439 peer = rcu_dereference(priv->peer); 440 if (peer) { 441 struct rtnl_link_stats64 tot_peer = {}; 442 443 dev_fetch_sw_netstats(&tot_peer, peer->tstats); 444 tot->rx_bytes += tot_peer.tx_bytes; 445 tot->rx_packets += tot_peer.tx_packets; 446 447 veth_stats_rx(&rx, peer); 448 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 449 tot->rx_dropped += rx.xdp_tx_err; 450 tot->tx_bytes += rx.xdp_bytes; 451 tot->tx_packets += rx.xdp_packets; 452 } 453 rcu_read_unlock(); 454 } 455 456 /* fake multicast ability */ 457 static void veth_set_multicast_list(struct net_device *dev) 458 { 459 } 460 461 static int veth_select_rxq(struct net_device *dev) 462 { 463 return smp_processor_id() % dev->real_num_rx_queues; 464 } 465 466 static struct net_device *veth_peer_dev(struct net_device *dev) 467 { 468 struct veth_priv *priv = netdev_priv(dev); 469 470 /* Callers must be under RCU read side. */ 471 return rcu_dereference(priv->peer); 472 } 473 474 static int veth_xdp_xmit(struct net_device *dev, int n, 475 struct xdp_frame **frames, 476 u32 flags, bool ndo_xmit) 477 { 478 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 479 int i, ret = -ENXIO, nxmit = 0; 480 struct net_device *rcv; 481 unsigned int max_len; 482 struct veth_rq *rq; 483 484 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 485 return -EINVAL; 486 487 rcu_read_lock(); 488 rcv = rcu_dereference(priv->peer); 489 if (unlikely(!rcv)) 490 goto out; 491 492 rcv_priv = netdev_priv(rcv); 493 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 494 /* The napi pointer is set if NAPI is enabled, which ensures that 495 * xdp_ring is initialized on receive side and the peer device is up. 496 */ 497 if (!rcu_access_pointer(rq->napi)) 498 goto out; 499 500 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 501 502 spin_lock(&rq->xdp_ring.producer_lock); 503 for (i = 0; i < n; i++) { 504 struct xdp_frame *frame = frames[i]; 505 void *ptr = veth_xdp_to_ptr(frame); 506 507 if (unlikely(xdp_get_frame_len(frame) > max_len || 508 __ptr_ring_produce(&rq->xdp_ring, ptr))) 509 break; 510 nxmit++; 511 } 512 spin_unlock(&rq->xdp_ring.producer_lock); 513 514 if (flags & XDP_XMIT_FLUSH) 515 __veth_xdp_flush(rq); 516 517 ret = nxmit; 518 if (ndo_xmit) { 519 u64_stats_update_begin(&rq->stats.syncp); 520 rq->stats.vs.peer_tq_xdp_xmit += nxmit; 521 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; 522 u64_stats_update_end(&rq->stats.syncp); 523 } 524 525 out: 526 rcu_read_unlock(); 527 528 return ret; 529 } 530 531 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 532 struct xdp_frame **frames, u32 flags) 533 { 534 int err; 535 536 err = veth_xdp_xmit(dev, n, frames, flags, true); 537 if (err < 0) { 538 struct veth_priv *priv = netdev_priv(dev); 539 540 atomic64_add(n, &priv->dropped); 541 } 542 543 return err; 544 } 545 546 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 547 { 548 int sent, i, err = 0, drops; 549 550 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 551 if (sent < 0) { 552 err = sent; 553 sent = 0; 554 } 555 556 for (i = sent; unlikely(i < bq->count); i++) 557 xdp_return_frame(bq->q[i]); 558 559 drops = bq->count - sent; 560 trace_xdp_bulk_tx(rq->dev, sent, drops, err); 561 562 u64_stats_update_begin(&rq->stats.syncp); 563 rq->stats.vs.xdp_tx += sent; 564 rq->stats.vs.xdp_tx_err += drops; 565 u64_stats_update_end(&rq->stats.syncp); 566 567 bq->count = 0; 568 } 569 570 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 571 { 572 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 573 struct net_device *rcv; 574 struct veth_rq *rcv_rq; 575 576 rcu_read_lock(); 577 veth_xdp_flush_bq(rq, bq); 578 rcv = rcu_dereference(priv->peer); 579 if (unlikely(!rcv)) 580 goto out; 581 582 rcv_priv = netdev_priv(rcv); 583 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 584 /* xdp_ring is initialized on receive side? */ 585 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 586 goto out; 587 588 __veth_xdp_flush(rcv_rq); 589 out: 590 rcu_read_unlock(); 591 } 592 593 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 594 struct veth_xdp_tx_bq *bq) 595 { 596 struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); 597 598 if (unlikely(!frame)) 599 return -EOVERFLOW; 600 601 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 602 veth_xdp_flush_bq(rq, bq); 603 604 bq->q[bq->count++] = frame; 605 606 return 0; 607 } 608 609 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, 610 struct xdp_frame *frame, 611 struct veth_xdp_tx_bq *bq, 612 struct veth_stats *stats) 613 { 614 struct xdp_frame orig_frame; 615 struct bpf_prog *xdp_prog; 616 617 rcu_read_lock(); 618 xdp_prog = rcu_dereference(rq->xdp_prog); 619 if (likely(xdp_prog)) { 620 struct veth_xdp_buff vxbuf; 621 struct xdp_buff *xdp = &vxbuf.xdp; 622 u32 act; 623 624 xdp_convert_frame_to_buff(frame, xdp); 625 xdp->rxq = &rq->xdp_rxq; 626 vxbuf.skb = NULL; 627 628 act = bpf_prog_run_xdp(xdp_prog, xdp); 629 630 switch (act) { 631 case XDP_PASS: 632 if (xdp_update_frame_from_buff(xdp, frame)) 633 goto err_xdp; 634 break; 635 case XDP_TX: 636 orig_frame = *frame; 637 xdp->rxq->mem.type = frame->mem_type; 638 if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { 639 trace_xdp_exception(rq->dev, xdp_prog, act); 640 frame = &orig_frame; 641 stats->rx_drops++; 642 goto err_xdp; 643 } 644 stats->xdp_tx++; 645 rcu_read_unlock(); 646 goto xdp_xmit; 647 case XDP_REDIRECT: 648 orig_frame = *frame; 649 xdp->rxq->mem.type = frame->mem_type; 650 if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { 651 frame = &orig_frame; 652 stats->rx_drops++; 653 goto err_xdp; 654 } 655 stats->xdp_redirect++; 656 rcu_read_unlock(); 657 goto xdp_xmit; 658 default: 659 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 660 fallthrough; 661 case XDP_ABORTED: 662 trace_xdp_exception(rq->dev, xdp_prog, act); 663 fallthrough; 664 case XDP_DROP: 665 stats->xdp_drops++; 666 goto err_xdp; 667 } 668 } 669 rcu_read_unlock(); 670 671 return frame; 672 err_xdp: 673 rcu_read_unlock(); 674 xdp_return_frame(frame); 675 xdp_xmit: 676 return NULL; 677 } 678 679 /* frames array contains VETH_XDP_BATCH at most */ 680 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, 681 int n_xdpf, struct veth_xdp_tx_bq *bq, 682 struct veth_stats *stats) 683 { 684 void *skbs[VETH_XDP_BATCH]; 685 int i; 686 687 if (unlikely(!napi_skb_cache_get_bulk(skbs, n_xdpf))) { 688 for (i = 0; i < n_xdpf; i++) 689 xdp_return_frame(frames[i]); 690 stats->rx_drops += n_xdpf; 691 692 return; 693 } 694 695 for (i = 0; i < n_xdpf; i++) { 696 struct sk_buff *skb = skbs[i]; 697 698 skb = __xdp_build_skb_from_frame(frames[i], skb, 699 rq->dev); 700 if (!skb) { 701 xdp_return_frame(frames[i]); 702 stats->rx_drops++; 703 continue; 704 } 705 napi_gro_receive(&rq->xdp_napi, skb); 706 } 707 } 708 709 static void veth_xdp_get(struct xdp_buff *xdp) 710 { 711 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); 712 int i; 713 714 get_page(virt_to_page(xdp->data)); 715 if (likely(!xdp_buff_has_frags(xdp))) 716 return; 717 718 for (i = 0; i < sinfo->nr_frags; i++) 719 __skb_frag_ref(&sinfo->frags[i]); 720 } 721 722 static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, 723 struct xdp_buff *xdp, 724 struct sk_buff **pskb) 725 { 726 struct sk_buff *skb = *pskb; 727 u32 frame_sz; 728 729 if (skb_shared(skb) || skb_head_is_locked(skb) || 730 skb_shinfo(skb)->nr_frags || 731 skb_headroom(skb) < XDP_PACKET_HEADROOM) { 732 if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM)) 733 goto drop; 734 735 skb = *pskb; 736 } 737 738 /* SKB "head" area always have tailroom for skb_shared_info */ 739 frame_sz = skb_end_pointer(skb) - skb->head; 740 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 741 xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); 742 xdp_prepare_buff(xdp, skb->head, skb_headroom(skb), 743 skb_headlen(skb), true); 744 745 if (skb_is_nonlinear(skb)) { 746 skb_shinfo(skb)->xdp_frags_size = skb->data_len; 747 xdp_buff_set_frags_flag(xdp); 748 } else { 749 xdp_buff_clear_frags_flag(xdp); 750 } 751 *pskb = skb; 752 753 return 0; 754 drop: 755 consume_skb(skb); 756 *pskb = NULL; 757 758 return -ENOMEM; 759 } 760 761 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 762 struct sk_buff *skb, 763 struct veth_xdp_tx_bq *bq, 764 struct veth_stats *stats) 765 { 766 void *orig_data, *orig_data_end; 767 struct bpf_prog *xdp_prog; 768 struct veth_xdp_buff vxbuf; 769 struct xdp_buff *xdp = &vxbuf.xdp; 770 u32 act, metalen; 771 int off; 772 773 skb_prepare_for_gro(skb); 774 775 rcu_read_lock(); 776 xdp_prog = rcu_dereference(rq->xdp_prog); 777 if (unlikely(!xdp_prog)) { 778 rcu_read_unlock(); 779 goto out; 780 } 781 782 __skb_push(skb, skb->data - skb_mac_header(skb)); 783 if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb)) 784 goto drop; 785 vxbuf.skb = skb; 786 787 orig_data = xdp->data; 788 orig_data_end = xdp->data_end; 789 790 act = bpf_prog_run_xdp(xdp_prog, xdp); 791 792 switch (act) { 793 case XDP_PASS: 794 break; 795 case XDP_TX: 796 veth_xdp_get(xdp); 797 consume_skb(skb); 798 xdp->rxq->mem = rq->xdp_mem; 799 if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { 800 trace_xdp_exception(rq->dev, xdp_prog, act); 801 stats->rx_drops++; 802 goto err_xdp; 803 } 804 stats->xdp_tx++; 805 rcu_read_unlock(); 806 goto xdp_xmit; 807 case XDP_REDIRECT: 808 veth_xdp_get(xdp); 809 consume_skb(skb); 810 xdp->rxq->mem = rq->xdp_mem; 811 if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { 812 stats->rx_drops++; 813 goto err_xdp; 814 } 815 stats->xdp_redirect++; 816 rcu_read_unlock(); 817 goto xdp_xmit; 818 default: 819 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 820 fallthrough; 821 case XDP_ABORTED: 822 trace_xdp_exception(rq->dev, xdp_prog, act); 823 fallthrough; 824 case XDP_DROP: 825 stats->xdp_drops++; 826 goto xdp_drop; 827 } 828 rcu_read_unlock(); 829 830 /* check if bpf_xdp_adjust_head was used */ 831 off = orig_data - xdp->data; 832 if (off > 0) 833 __skb_push(skb, off); 834 else if (off < 0) 835 __skb_pull(skb, -off); 836 837 skb_reset_mac_header(skb); 838 839 /* check if bpf_xdp_adjust_tail was used */ 840 off = xdp->data_end - orig_data_end; 841 if (off != 0) 842 __skb_put(skb, off); /* positive on grow, negative on shrink */ 843 844 /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers 845 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. 846 */ 847 if (xdp_buff_has_frags(xdp)) 848 skb->data_len = skb_shinfo(skb)->xdp_frags_size; 849 else 850 skb->data_len = 0; 851 852 skb->protocol = eth_type_trans(skb, rq->dev); 853 854 metalen = xdp->data - xdp->data_meta; 855 if (metalen) 856 skb_metadata_set(skb, metalen); 857 out: 858 return skb; 859 drop: 860 stats->rx_drops++; 861 xdp_drop: 862 rcu_read_unlock(); 863 kfree_skb(skb); 864 return NULL; 865 err_xdp: 866 rcu_read_unlock(); 867 xdp_return_buff(xdp); 868 xdp_xmit: 869 return NULL; 870 } 871 872 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 873 struct veth_xdp_tx_bq *bq, 874 struct veth_stats *stats) 875 { 876 int i, done = 0, n_xdpf = 0; 877 void *xdpf[VETH_XDP_BATCH]; 878 879 for (i = 0; i < budget; i++) { 880 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 881 882 if (!ptr) 883 break; 884 885 if (veth_is_xdp_frame(ptr)) { 886 /* ndo_xdp_xmit */ 887 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 888 889 stats->xdp_bytes += xdp_get_frame_len(frame); 890 frame = veth_xdp_rcv_one(rq, frame, bq, stats); 891 if (frame) { 892 /* XDP_PASS */ 893 xdpf[n_xdpf++] = frame; 894 if (n_xdpf == VETH_XDP_BATCH) { 895 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, 896 bq, stats); 897 n_xdpf = 0; 898 } 899 } 900 } else { 901 /* ndo_start_xmit */ 902 struct sk_buff *skb = ptr; 903 904 stats->xdp_bytes += skb->len; 905 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 906 if (skb) { 907 if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) 908 netif_receive_skb(skb); 909 else 910 napi_gro_receive(&rq->xdp_napi, skb); 911 } 912 } 913 done++; 914 } 915 916 if (n_xdpf) 917 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); 918 919 u64_stats_update_begin(&rq->stats.syncp); 920 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 921 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 922 rq->stats.vs.xdp_drops += stats->xdp_drops; 923 rq->stats.vs.rx_drops += stats->rx_drops; 924 rq->stats.vs.xdp_packets += done; 925 u64_stats_update_end(&rq->stats.syncp); 926 927 return done; 928 } 929 930 static int veth_poll(struct napi_struct *napi, int budget) 931 { 932 struct veth_rq *rq = 933 container_of(napi, struct veth_rq, xdp_napi); 934 struct veth_stats stats = {}; 935 struct veth_xdp_tx_bq bq; 936 int done; 937 938 bq.count = 0; 939 940 xdp_set_return_frame_no_direct(); 941 done = veth_xdp_rcv(rq, budget, &bq, &stats); 942 943 if (stats.xdp_redirect > 0) 944 xdp_do_flush(); 945 946 if (done < budget && napi_complete_done(napi, done)) { 947 /* Write rx_notify_masked before reading ptr_ring */ 948 smp_store_mb(rq->rx_notify_masked, false); 949 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 950 if (napi_schedule_prep(&rq->xdp_napi)) { 951 WRITE_ONCE(rq->rx_notify_masked, true); 952 __napi_schedule(&rq->xdp_napi); 953 } 954 } 955 } 956 957 if (stats.xdp_tx > 0) 958 veth_xdp_flush(rq, &bq); 959 xdp_clear_return_frame_no_direct(); 960 961 return done; 962 } 963 964 static int veth_create_page_pool(struct veth_rq *rq) 965 { 966 struct page_pool_params pp_params = { 967 .order = 0, 968 .pool_size = VETH_RING_SIZE, 969 .nid = NUMA_NO_NODE, 970 .dev = &rq->dev->dev, 971 }; 972 973 rq->page_pool = page_pool_create(&pp_params); 974 if (IS_ERR(rq->page_pool)) { 975 int err = PTR_ERR(rq->page_pool); 976 977 rq->page_pool = NULL; 978 return err; 979 } 980 981 return 0; 982 } 983 984 static int __veth_napi_enable_range(struct net_device *dev, int start, int end) 985 { 986 struct veth_priv *priv = netdev_priv(dev); 987 int err, i; 988 989 for (i = start; i < end; i++) { 990 err = veth_create_page_pool(&priv->rq[i]); 991 if (err) 992 goto err_page_pool; 993 } 994 995 for (i = start; i < end; i++) { 996 struct veth_rq *rq = &priv->rq[i]; 997 998 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 999 if (err) 1000 goto err_xdp_ring; 1001 } 1002 1003 for (i = start; i < end; i++) { 1004 struct veth_rq *rq = &priv->rq[i]; 1005 1006 napi_enable(&rq->xdp_napi); 1007 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1008 } 1009 1010 return 0; 1011 1012 err_xdp_ring: 1013 for (i--; i >= start; i--) 1014 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 1015 i = end; 1016 err_page_pool: 1017 for (i--; i >= start; i--) { 1018 page_pool_destroy(priv->rq[i].page_pool); 1019 priv->rq[i].page_pool = NULL; 1020 } 1021 1022 return err; 1023 } 1024 1025 static int __veth_napi_enable(struct net_device *dev) 1026 { 1027 return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1028 } 1029 1030 static void veth_napi_del_range(struct net_device *dev, int start, int end) 1031 { 1032 struct veth_priv *priv = netdev_priv(dev); 1033 int i; 1034 1035 for (i = start; i < end; i++) { 1036 struct veth_rq *rq = &priv->rq[i]; 1037 1038 rcu_assign_pointer(priv->rq[i].napi, NULL); 1039 napi_disable(&rq->xdp_napi); 1040 __netif_napi_del(&rq->xdp_napi); 1041 } 1042 synchronize_net(); 1043 1044 for (i = start; i < end; i++) { 1045 struct veth_rq *rq = &priv->rq[i]; 1046 1047 rq->rx_notify_masked = false; 1048 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 1049 } 1050 1051 for (i = start; i < end; i++) { 1052 page_pool_destroy(priv->rq[i].page_pool); 1053 priv->rq[i].page_pool = NULL; 1054 } 1055 } 1056 1057 static void veth_napi_del(struct net_device *dev) 1058 { 1059 veth_napi_del_range(dev, 0, dev->real_num_rx_queues); 1060 } 1061 1062 static bool veth_gro_requested(const struct net_device *dev) 1063 { 1064 return !!(dev->wanted_features & NETIF_F_GRO); 1065 } 1066 1067 static int veth_enable_xdp_range(struct net_device *dev, int start, int end, 1068 bool napi_already_on) 1069 { 1070 struct veth_priv *priv = netdev_priv(dev); 1071 int err, i; 1072 1073 for (i = start; i < end; i++) { 1074 struct veth_rq *rq = &priv->rq[i]; 1075 1076 if (!napi_already_on) 1077 netif_napi_add(dev, &rq->xdp_napi, veth_poll); 1078 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 1079 if (err < 0) 1080 goto err_rxq_reg; 1081 1082 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 1083 MEM_TYPE_PAGE_SHARED, 1084 NULL); 1085 if (err < 0) 1086 goto err_reg_mem; 1087 1088 /* Save original mem info as it can be overwritten */ 1089 rq->xdp_mem = rq->xdp_rxq.mem; 1090 } 1091 return 0; 1092 1093 err_reg_mem: 1094 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 1095 err_rxq_reg: 1096 for (i--; i >= start; i--) { 1097 struct veth_rq *rq = &priv->rq[i]; 1098 1099 xdp_rxq_info_unreg(&rq->xdp_rxq); 1100 if (!napi_already_on) 1101 netif_napi_del(&rq->xdp_napi); 1102 } 1103 1104 return err; 1105 } 1106 1107 static void veth_disable_xdp_range(struct net_device *dev, int start, int end, 1108 bool delete_napi) 1109 { 1110 struct veth_priv *priv = netdev_priv(dev); 1111 int i; 1112 1113 for (i = start; i < end; i++) { 1114 struct veth_rq *rq = &priv->rq[i]; 1115 1116 rq->xdp_rxq.mem = rq->xdp_mem; 1117 xdp_rxq_info_unreg(&rq->xdp_rxq); 1118 1119 if (delete_napi) 1120 netif_napi_del(&rq->xdp_napi); 1121 } 1122 } 1123 1124 static int veth_enable_xdp(struct net_device *dev) 1125 { 1126 bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); 1127 struct veth_priv *priv = netdev_priv(dev); 1128 int err, i; 1129 1130 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 1131 err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); 1132 if (err) 1133 return err; 1134 1135 if (!napi_already_on) { 1136 err = __veth_napi_enable(dev); 1137 if (err) { 1138 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); 1139 return err; 1140 } 1141 } 1142 } 1143 1144 for (i = 0; i < dev->real_num_rx_queues; i++) { 1145 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 1146 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1147 } 1148 1149 return 0; 1150 } 1151 1152 static void veth_disable_xdp(struct net_device *dev) 1153 { 1154 struct veth_priv *priv = netdev_priv(dev); 1155 int i; 1156 1157 for (i = 0; i < dev->real_num_rx_queues; i++) 1158 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 1159 1160 if (!netif_running(dev) || !veth_gro_requested(dev)) 1161 veth_napi_del(dev); 1162 1163 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); 1164 } 1165 1166 static int veth_napi_enable_range(struct net_device *dev, int start, int end) 1167 { 1168 struct veth_priv *priv = netdev_priv(dev); 1169 int err, i; 1170 1171 for (i = start; i < end; i++) { 1172 struct veth_rq *rq = &priv->rq[i]; 1173 1174 netif_napi_add(dev, &rq->xdp_napi, veth_poll); 1175 } 1176 1177 err = __veth_napi_enable_range(dev, start, end); 1178 if (err) { 1179 for (i = start; i < end; i++) { 1180 struct veth_rq *rq = &priv->rq[i]; 1181 1182 netif_napi_del(&rq->xdp_napi); 1183 } 1184 return err; 1185 } 1186 return err; 1187 } 1188 1189 static int veth_napi_enable(struct net_device *dev) 1190 { 1191 return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1192 } 1193 1194 static void veth_disable_range_safe(struct net_device *dev, int start, int end) 1195 { 1196 struct veth_priv *priv = netdev_priv(dev); 1197 1198 if (start >= end) 1199 return; 1200 1201 if (priv->_xdp_prog) { 1202 veth_napi_del_range(dev, start, end); 1203 veth_disable_xdp_range(dev, start, end, false); 1204 } else if (veth_gro_requested(dev)) { 1205 veth_napi_del_range(dev, start, end); 1206 } 1207 } 1208 1209 static int veth_enable_range_safe(struct net_device *dev, int start, int end) 1210 { 1211 struct veth_priv *priv = netdev_priv(dev); 1212 int err; 1213 1214 if (start >= end) 1215 return 0; 1216 1217 if (priv->_xdp_prog) { 1218 /* these channels are freshly initialized, napi is not on there even 1219 * when GRO is requeste 1220 */ 1221 err = veth_enable_xdp_range(dev, start, end, false); 1222 if (err) 1223 return err; 1224 1225 err = __veth_napi_enable_range(dev, start, end); 1226 if (err) { 1227 /* on error always delete the newly added napis */ 1228 veth_disable_xdp_range(dev, start, end, true); 1229 return err; 1230 } 1231 } else if (veth_gro_requested(dev)) { 1232 return veth_napi_enable_range(dev, start, end); 1233 } 1234 return 0; 1235 } 1236 1237 static void veth_set_xdp_features(struct net_device *dev) 1238 { 1239 struct veth_priv *priv = netdev_priv(dev); 1240 struct net_device *peer; 1241 1242 peer = rtnl_dereference(priv->peer); 1243 if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { 1244 struct veth_priv *priv_peer = netdev_priv(peer); 1245 xdp_features_t val = NETDEV_XDP_ACT_BASIC | 1246 NETDEV_XDP_ACT_REDIRECT | 1247 NETDEV_XDP_ACT_RX_SG; 1248 1249 if (priv_peer->_xdp_prog || veth_gro_requested(peer)) 1250 val |= NETDEV_XDP_ACT_NDO_XMIT | 1251 NETDEV_XDP_ACT_NDO_XMIT_SG; 1252 xdp_set_features_flag(dev, val); 1253 } else { 1254 xdp_clear_features_flag(dev); 1255 } 1256 } 1257 1258 static int veth_set_channels(struct net_device *dev, 1259 struct ethtool_channels *ch) 1260 { 1261 struct veth_priv *priv = netdev_priv(dev); 1262 unsigned int old_rx_count, new_rx_count; 1263 struct veth_priv *peer_priv; 1264 struct net_device *peer; 1265 int err; 1266 1267 /* sanity check. Upper bounds are already enforced by the caller */ 1268 if (!ch->rx_count || !ch->tx_count) 1269 return -EINVAL; 1270 1271 /* avoid braking XDP, if that is enabled */ 1272 peer = rtnl_dereference(priv->peer); 1273 peer_priv = peer ? netdev_priv(peer) : NULL; 1274 if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) 1275 return -EINVAL; 1276 1277 if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) 1278 return -EINVAL; 1279 1280 old_rx_count = dev->real_num_rx_queues; 1281 new_rx_count = ch->rx_count; 1282 if (netif_running(dev)) { 1283 /* turn device off */ 1284 netif_carrier_off(dev); 1285 if (peer) 1286 netif_carrier_off(peer); 1287 1288 /* try to allocate new resurces, as needed*/ 1289 err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); 1290 if (err) 1291 goto out; 1292 } 1293 1294 err = netif_set_real_num_rx_queues(dev, ch->rx_count); 1295 if (err) 1296 goto revert; 1297 1298 err = netif_set_real_num_tx_queues(dev, ch->tx_count); 1299 if (err) { 1300 int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); 1301 1302 /* this error condition could happen only if rx and tx change 1303 * in opposite directions (e.g. tx nr raises, rx nr decreases) 1304 * and we can't do anything to fully restore the original 1305 * status 1306 */ 1307 if (err2) 1308 pr_warn("Can't restore rx queues config %d -> %d %d", 1309 new_rx_count, old_rx_count, err2); 1310 else 1311 goto revert; 1312 } 1313 1314 out: 1315 if (netif_running(dev)) { 1316 /* note that we need to swap the arguments WRT the enable part 1317 * to identify the range we have to disable 1318 */ 1319 veth_disable_range_safe(dev, new_rx_count, old_rx_count); 1320 netif_carrier_on(dev); 1321 if (peer) 1322 netif_carrier_on(peer); 1323 } 1324 1325 /* update XDP supported features */ 1326 veth_set_xdp_features(dev); 1327 if (peer) 1328 veth_set_xdp_features(peer); 1329 1330 return err; 1331 1332 revert: 1333 new_rx_count = old_rx_count; 1334 old_rx_count = ch->rx_count; 1335 goto out; 1336 } 1337 1338 static int veth_open(struct net_device *dev) 1339 { 1340 struct veth_priv *priv = netdev_priv(dev); 1341 struct net_device *peer = rtnl_dereference(priv->peer); 1342 int err; 1343 1344 if (!peer) 1345 return -ENOTCONN; 1346 1347 if (priv->_xdp_prog) { 1348 err = veth_enable_xdp(dev); 1349 if (err) 1350 return err; 1351 } else if (veth_gro_requested(dev)) { 1352 err = veth_napi_enable(dev); 1353 if (err) 1354 return err; 1355 } 1356 1357 if (peer->flags & IFF_UP) { 1358 netif_carrier_on(dev); 1359 netif_carrier_on(peer); 1360 } 1361 1362 veth_set_xdp_features(dev); 1363 1364 return 0; 1365 } 1366 1367 static int veth_close(struct net_device *dev) 1368 { 1369 struct veth_priv *priv = netdev_priv(dev); 1370 struct net_device *peer = rtnl_dereference(priv->peer); 1371 1372 netif_carrier_off(dev); 1373 if (peer) 1374 netif_carrier_off(peer); 1375 1376 if (priv->_xdp_prog) 1377 veth_disable_xdp(dev); 1378 else if (veth_gro_requested(dev)) 1379 veth_napi_del(dev); 1380 1381 return 0; 1382 } 1383 1384 static int is_valid_veth_mtu(int mtu) 1385 { 1386 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1387 } 1388 1389 static int veth_alloc_queues(struct net_device *dev) 1390 { 1391 struct veth_priv *priv = netdev_priv(dev); 1392 int i; 1393 1394 priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq), 1395 GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); 1396 if (!priv->rq) 1397 return -ENOMEM; 1398 1399 for (i = 0; i < dev->num_rx_queues; i++) { 1400 priv->rq[i].dev = dev; 1401 u64_stats_init(&priv->rq[i].stats.syncp); 1402 } 1403 1404 return 0; 1405 } 1406 1407 static void veth_free_queues(struct net_device *dev) 1408 { 1409 struct veth_priv *priv = netdev_priv(dev); 1410 1411 kvfree(priv->rq); 1412 } 1413 1414 static int veth_dev_init(struct net_device *dev) 1415 { 1416 netdev_lockdep_set_classes(dev); 1417 return veth_alloc_queues(dev); 1418 } 1419 1420 static void veth_dev_free(struct net_device *dev) 1421 { 1422 veth_free_queues(dev); 1423 } 1424 1425 #ifdef CONFIG_NET_POLL_CONTROLLER 1426 static void veth_poll_controller(struct net_device *dev) 1427 { 1428 /* veth only receives frames when its peer sends one 1429 * Since it has nothing to do with disabling irqs, we are guaranteed 1430 * never to have pending data when we poll for it so 1431 * there is nothing to do here. 1432 * 1433 * We need this though so netpoll recognizes us as an interface that 1434 * supports polling, which enables bridge devices in virt setups to 1435 * still use netconsole 1436 */ 1437 } 1438 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1439 1440 static int veth_get_iflink(const struct net_device *dev) 1441 { 1442 struct veth_priv *priv = netdev_priv(dev); 1443 struct net_device *peer; 1444 int iflink; 1445 1446 rcu_read_lock(); 1447 peer = rcu_dereference(priv->peer); 1448 iflink = peer ? READ_ONCE(peer->ifindex) : 0; 1449 rcu_read_unlock(); 1450 1451 return iflink; 1452 } 1453 1454 static netdev_features_t veth_fix_features(struct net_device *dev, 1455 netdev_features_t features) 1456 { 1457 struct veth_priv *priv = netdev_priv(dev); 1458 struct net_device *peer; 1459 1460 peer = rtnl_dereference(priv->peer); 1461 if (peer) { 1462 struct veth_priv *peer_priv = netdev_priv(peer); 1463 1464 if (peer_priv->_xdp_prog) 1465 features &= ~NETIF_F_GSO_SOFTWARE; 1466 } 1467 1468 return features; 1469 } 1470 1471 static int veth_set_features(struct net_device *dev, 1472 netdev_features_t features) 1473 { 1474 netdev_features_t changed = features ^ dev->features; 1475 struct veth_priv *priv = netdev_priv(dev); 1476 struct net_device *peer; 1477 int err; 1478 1479 if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) 1480 return 0; 1481 1482 peer = rtnl_dereference(priv->peer); 1483 if (features & NETIF_F_GRO) { 1484 err = veth_napi_enable(dev); 1485 if (err) 1486 return err; 1487 1488 if (peer) 1489 xdp_features_set_redirect_target(peer, true); 1490 } else { 1491 if (peer) 1492 xdp_features_clear_redirect_target(peer); 1493 veth_napi_del(dev); 1494 } 1495 return 0; 1496 } 1497 1498 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1499 { 1500 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1501 struct net_device *peer; 1502 1503 if (new_hr < 0) 1504 new_hr = 0; 1505 1506 rcu_read_lock(); 1507 peer = rcu_dereference(priv->peer); 1508 if (unlikely(!peer)) 1509 goto out; 1510 1511 peer_priv = netdev_priv(peer); 1512 priv->requested_headroom = new_hr; 1513 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1514 dev->needed_headroom = new_hr; 1515 peer->needed_headroom = new_hr; 1516 1517 out: 1518 rcu_read_unlock(); 1519 } 1520 1521 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1522 struct netlink_ext_ack *extack) 1523 { 1524 struct veth_priv *priv = netdev_priv(dev); 1525 struct bpf_prog *old_prog; 1526 struct net_device *peer; 1527 unsigned int max_mtu; 1528 int err; 1529 1530 old_prog = priv->_xdp_prog; 1531 priv->_xdp_prog = prog; 1532 peer = rtnl_dereference(priv->peer); 1533 1534 if (prog) { 1535 if (!peer) { 1536 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1537 err = -ENOTCONN; 1538 goto err; 1539 } 1540 1541 max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) - 1542 peer->hard_header_len; 1543 /* Allow increasing the max_mtu if the program supports 1544 * XDP fragments. 1545 */ 1546 if (prog->aux->xdp_has_frags) 1547 max_mtu += PAGE_SIZE * MAX_SKB_FRAGS; 1548 1549 if (peer->mtu > max_mtu) { 1550 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1551 err = -ERANGE; 1552 goto err; 1553 } 1554 1555 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1556 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1557 err = -ENOSPC; 1558 goto err; 1559 } 1560 1561 if (dev->flags & IFF_UP) { 1562 err = veth_enable_xdp(dev); 1563 if (err) { 1564 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1565 goto err; 1566 } 1567 } 1568 1569 if (!old_prog) { 1570 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1571 peer->max_mtu = max_mtu; 1572 } 1573 1574 xdp_features_set_redirect_target(peer, true); 1575 } 1576 1577 if (old_prog) { 1578 if (!prog) { 1579 if (peer && !veth_gro_requested(dev)) 1580 xdp_features_clear_redirect_target(peer); 1581 1582 if (dev->flags & IFF_UP) 1583 veth_disable_xdp(dev); 1584 1585 if (peer) { 1586 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1587 peer->max_mtu = ETH_MAX_MTU; 1588 } 1589 } 1590 bpf_prog_put(old_prog); 1591 } 1592 1593 if ((!!old_prog ^ !!prog) && peer) 1594 netdev_update_features(peer); 1595 1596 return 0; 1597 err: 1598 priv->_xdp_prog = old_prog; 1599 1600 return err; 1601 } 1602 1603 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1604 { 1605 switch (xdp->command) { 1606 case XDP_SETUP_PROG: 1607 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1608 default: 1609 return -EINVAL; 1610 } 1611 } 1612 1613 static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) 1614 { 1615 struct veth_xdp_buff *_ctx = (void *)ctx; 1616 1617 if (!_ctx->skb) 1618 return -ENODATA; 1619 1620 *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp; 1621 return 0; 1622 } 1623 1624 static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, 1625 enum xdp_rss_hash_type *rss_type) 1626 { 1627 struct veth_xdp_buff *_ctx = (void *)ctx; 1628 struct sk_buff *skb = _ctx->skb; 1629 1630 if (!skb) 1631 return -ENODATA; 1632 1633 *hash = skb_get_hash(skb); 1634 *rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE; 1635 1636 return 0; 1637 } 1638 1639 static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto, 1640 u16 *vlan_tci) 1641 { 1642 const struct veth_xdp_buff *_ctx = (void *)ctx; 1643 const struct sk_buff *skb = _ctx->skb; 1644 int err; 1645 1646 if (!skb) 1647 return -ENODATA; 1648 1649 err = __vlan_hwaccel_get_tag(skb, vlan_tci); 1650 if (err) 1651 return err; 1652 1653 *vlan_proto = skb->vlan_proto; 1654 return err; 1655 } 1656 1657 static const struct net_device_ops veth_netdev_ops = { 1658 .ndo_init = veth_dev_init, 1659 .ndo_open = veth_open, 1660 .ndo_stop = veth_close, 1661 .ndo_start_xmit = veth_xmit, 1662 .ndo_get_stats64 = veth_get_stats64, 1663 .ndo_set_rx_mode = veth_set_multicast_list, 1664 .ndo_set_mac_address = eth_mac_addr, 1665 #ifdef CONFIG_NET_POLL_CONTROLLER 1666 .ndo_poll_controller = veth_poll_controller, 1667 #endif 1668 .ndo_get_iflink = veth_get_iflink, 1669 .ndo_fix_features = veth_fix_features, 1670 .ndo_set_features = veth_set_features, 1671 .ndo_features_check = passthru_features_check, 1672 .ndo_set_rx_headroom = veth_set_rx_headroom, 1673 .ndo_bpf = veth_xdp, 1674 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1675 .ndo_get_peer_dev = veth_peer_dev, 1676 }; 1677 1678 static const struct xdp_metadata_ops veth_xdp_metadata_ops = { 1679 .xmo_rx_timestamp = veth_xdp_rx_timestamp, 1680 .xmo_rx_hash = veth_xdp_rx_hash, 1681 .xmo_rx_vlan_tag = veth_xdp_rx_vlan_tag, 1682 }; 1683 1684 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1685 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1686 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1687 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1688 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1689 1690 static void veth_setup(struct net_device *dev) 1691 { 1692 ether_setup(dev); 1693 1694 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1695 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1696 dev->priv_flags |= IFF_NO_QUEUE; 1697 dev->priv_flags |= IFF_PHONY_HEADROOM; 1698 dev->priv_flags |= IFF_DISABLE_NETPOLL; 1699 dev->lltx = true; 1700 1701 dev->netdev_ops = &veth_netdev_ops; 1702 dev->xdp_metadata_ops = &veth_xdp_metadata_ops; 1703 dev->ethtool_ops = &veth_ethtool_ops; 1704 dev->features |= VETH_FEATURES; 1705 dev->vlan_features = dev->features & 1706 ~(NETIF_F_HW_VLAN_CTAG_TX | 1707 NETIF_F_HW_VLAN_STAG_TX | 1708 NETIF_F_HW_VLAN_CTAG_RX | 1709 NETIF_F_HW_VLAN_STAG_RX); 1710 dev->needs_free_netdev = true; 1711 dev->priv_destructor = veth_dev_free; 1712 dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; 1713 dev->max_mtu = ETH_MAX_MTU; 1714 1715 dev->hw_features = VETH_FEATURES; 1716 dev->hw_enc_features = VETH_FEATURES; 1717 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1718 netif_set_tso_max_size(dev, GSO_MAX_SIZE); 1719 } 1720 1721 /* 1722 * netlink interface 1723 */ 1724 1725 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1726 struct netlink_ext_ack *extack) 1727 { 1728 if (tb[IFLA_ADDRESS]) { 1729 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1730 return -EINVAL; 1731 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1732 return -EADDRNOTAVAIL; 1733 } 1734 if (tb[IFLA_MTU]) { 1735 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1736 return -EINVAL; 1737 } 1738 return 0; 1739 } 1740 1741 static struct rtnl_link_ops veth_link_ops; 1742 1743 static void veth_disable_gro(struct net_device *dev) 1744 { 1745 dev->features &= ~NETIF_F_GRO; 1746 dev->wanted_features &= ~NETIF_F_GRO; 1747 netdev_update_features(dev); 1748 } 1749 1750 static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) 1751 { 1752 int err; 1753 1754 if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { 1755 err = netif_set_real_num_tx_queues(dev, 1); 1756 if (err) 1757 return err; 1758 } 1759 if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { 1760 err = netif_set_real_num_rx_queues(dev, 1); 1761 if (err) 1762 return err; 1763 } 1764 return 0; 1765 } 1766 1767 static int veth_newlink(struct net_device *dev, 1768 struct rtnl_newlink_params *params, 1769 struct netlink_ext_ack *extack) 1770 { 1771 struct net *peer_net = rtnl_newlink_peer_net(params); 1772 struct nlattr **data = params->data; 1773 struct nlattr **tb = params->tb; 1774 int err; 1775 struct net_device *peer; 1776 struct veth_priv *priv; 1777 char ifname[IFNAMSIZ]; 1778 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1779 unsigned char name_assign_type; 1780 struct ifinfomsg *ifmp; 1781 1782 /* 1783 * create and register peer first 1784 */ 1785 if (data && data[VETH_INFO_PEER]) { 1786 struct nlattr *nla_peer = data[VETH_INFO_PEER]; 1787 1788 ifmp = nla_data(nla_peer); 1789 rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); 1790 tbp = peer_tb; 1791 } else { 1792 ifmp = NULL; 1793 tbp = tb; 1794 } 1795 1796 if (ifmp && tbp[IFLA_IFNAME]) { 1797 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1798 name_assign_type = NET_NAME_USER; 1799 } else { 1800 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1801 name_assign_type = NET_NAME_ENUM; 1802 } 1803 1804 peer = rtnl_create_link(peer_net, ifname, name_assign_type, 1805 &veth_link_ops, tbp, extack); 1806 if (IS_ERR(peer)) 1807 return PTR_ERR(peer); 1808 1809 if (!ifmp || !tbp[IFLA_ADDRESS]) 1810 eth_hw_addr_random(peer); 1811 1812 if (ifmp && (dev->ifindex != 0)) 1813 peer->ifindex = ifmp->ifi_index; 1814 1815 netif_inherit_tso_max(peer, dev); 1816 1817 err = register_netdevice(peer); 1818 if (err < 0) 1819 goto err_register_peer; 1820 1821 /* keep GRO disabled by default to be consistent with the established 1822 * veth behavior 1823 */ 1824 veth_disable_gro(peer); 1825 netif_carrier_off(peer); 1826 1827 err = rtnl_configure_link(peer, ifmp, 0, NULL); 1828 if (err < 0) 1829 goto err_configure_peer; 1830 1831 /* 1832 * register dev last 1833 * 1834 * note, that since we've registered new device the dev's name 1835 * should be re-allocated 1836 */ 1837 1838 if (tb[IFLA_ADDRESS] == NULL) 1839 eth_hw_addr_random(dev); 1840 1841 if (tb[IFLA_IFNAME]) 1842 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1843 else 1844 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1845 1846 err = register_netdevice(dev); 1847 if (err < 0) 1848 goto err_register_dev; 1849 1850 netif_carrier_off(dev); 1851 1852 /* 1853 * tie the deviced together 1854 */ 1855 1856 priv = netdev_priv(dev); 1857 rcu_assign_pointer(priv->peer, peer); 1858 err = veth_init_queues(dev, tb); 1859 if (err) 1860 goto err_queues; 1861 1862 priv = netdev_priv(peer); 1863 rcu_assign_pointer(priv->peer, dev); 1864 err = veth_init_queues(peer, tb); 1865 if (err) 1866 goto err_queues; 1867 1868 veth_disable_gro(dev); 1869 /* update XDP supported features */ 1870 veth_set_xdp_features(dev); 1871 veth_set_xdp_features(peer); 1872 1873 return 0; 1874 1875 err_queues: 1876 unregister_netdevice(dev); 1877 err_register_dev: 1878 /* nothing to do */ 1879 err_configure_peer: 1880 unregister_netdevice(peer); 1881 return err; 1882 1883 err_register_peer: 1884 free_netdev(peer); 1885 return err; 1886 } 1887 1888 static void veth_dellink(struct net_device *dev, struct list_head *head) 1889 { 1890 struct veth_priv *priv; 1891 struct net_device *peer; 1892 1893 priv = netdev_priv(dev); 1894 peer = rtnl_dereference(priv->peer); 1895 1896 /* Note : dellink() is called from default_device_exit_batch(), 1897 * before a rcu_synchronize() point. The devices are guaranteed 1898 * not being freed before one RCU grace period. 1899 */ 1900 RCU_INIT_POINTER(priv->peer, NULL); 1901 unregister_netdevice_queue(dev, head); 1902 1903 if (peer) { 1904 priv = netdev_priv(peer); 1905 RCU_INIT_POINTER(priv->peer, NULL); 1906 unregister_netdevice_queue(peer, head); 1907 } 1908 } 1909 1910 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1911 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1912 }; 1913 1914 static struct net *veth_get_link_net(const struct net_device *dev) 1915 { 1916 struct veth_priv *priv = netdev_priv(dev); 1917 struct net_device *peer = rtnl_dereference(priv->peer); 1918 1919 return peer ? dev_net(peer) : dev_net(dev); 1920 } 1921 1922 static unsigned int veth_get_num_queues(void) 1923 { 1924 /* enforce the same queue limit as rtnl_create_link */ 1925 int queues = num_possible_cpus(); 1926 1927 if (queues > 4096) 1928 queues = 4096; 1929 return queues; 1930 } 1931 1932 static struct rtnl_link_ops veth_link_ops = { 1933 .kind = DRV_NAME, 1934 .priv_size = sizeof(struct veth_priv), 1935 .setup = veth_setup, 1936 .validate = veth_validate, 1937 .newlink = veth_newlink, 1938 .dellink = veth_dellink, 1939 .policy = veth_policy, 1940 .peer_type = VETH_INFO_PEER, 1941 .maxtype = VETH_INFO_MAX, 1942 .get_link_net = veth_get_link_net, 1943 .get_num_tx_queues = veth_get_num_queues, 1944 .get_num_rx_queues = veth_get_num_queues, 1945 }; 1946 1947 /* 1948 * init/fini 1949 */ 1950 1951 static __init int veth_init(void) 1952 { 1953 return rtnl_link_register(&veth_link_ops); 1954 } 1955 1956 static __exit void veth_exit(void) 1957 { 1958 rtnl_link_unregister(&veth_link_ops); 1959 } 1960 1961 module_init(veth_init); 1962 module_exit(veth_exit); 1963 1964 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1965 MODULE_LICENSE("GPL v2"); 1966 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1967