// SPDX-License-Identifier: GPL-2.0-only
/*
 *  drivers/net/veth.c
 *
 *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
 *
 * Author: Pavel Emelianov <xemul@openvz.org>
 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
 *
 */

#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>

#include <net/rtnetlink.h>
#include <net/dst.h>
#include <net/netdev_lock.h>
#include <net/xfrm.h>
#include <net/xdp.h>
#include <linux/veth.h>
#include <linux/module.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>
#include <linux/skbuff_ref.h>
#include <net/page_pool/helpers.h>

#define DRV_NAME	"veth"
#define DRV_VERSION	"1.0"

/* Low pointer bit used to tag ptr_ring entries that are xdp_frames
 * (untagged entries are sk_buffs), see veth_is_xdp_frame().
 */
#define VETH_XDP_FLAG		BIT(0)
/* Size of each per-queue ptr_ring and page_pool */
#define VETH_RING_SIZE		256
#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)

/* Capacity of the XDP_TX bulk queue (struct veth_xdp_tx_bq) */
#define VETH_XDP_TX_BULK_SIZE	16
/* Max number of XDP_PASS frames converted to skbs in one bulk */
#define VETH_XDP_BATCH		16

/* Per-rx-queue counters; also used as a scratch accumulator by veth_poll() */
struct veth_stats {
	u64	rx_drops;
	/* xdp */
	u64	xdp_packets;
	u64	xdp_bytes;
	u64	xdp_redirect;
	u64	xdp_drops;
	u64	xdp_tx;
	u64	xdp_tx_err;
	/* updated by veth_xdp_xmit() on the receiving queue when called
	 * through ndo_xdp_xmit; reported as the peer's tx queue stats
	 */
	u64	peer_tq_xdp_xmit;
	u64	peer_tq_xdp_xmit_err;
};

/* veth_stats guarded by a u64_stats seqcount for consistent 64-bit reads */
struct veth_rq_stats {
	struct veth_stats	vs;
	struct u64_stats_sync	syncp;
};

/* Per-rx-queue state */
struct veth_rq {
	struct napi_struct	xdp_napi;
	struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
	struct net_device	*dev;
	struct bpf_prog __rcu	*xdp_prog;
	/* copy of xdp_rxq.mem taken at registration time, restored before
	 * unregistering because xdp_rxq.mem can be overwritten at run time
	 */
	struct xdp_mem_info	xdp_mem;
	struct veth_rq_stats	stats;
	/* true while a NAPI run has been scheduled for this queue and has
	 * not yet completed; see __veth_xdp_flush() and veth_poll()
	 */
	bool			rx_notify_masked;
	/* holds sk_buffs and VETH_XDP_FLAG-tagged xdp_frames for veth_poll() */
	struct ptr_ring		xdp_ring;
	struct xdp_rxq_info	xdp_rxq;
	struct page_pool	*page_pool;
};

/* Per-device private data */
struct veth_priv {
	struct net_device __rcu	*peer;
	/* packets dropped on the transmit path */
	atomic64_t		dropped;
	/* program published to every rq->xdp_prog by veth_enable_xdp() */
	struct bpf_prog		*_xdp_prog;
	/* array of rx queues, indexed by queue number */
	struct veth_rq		*rq;
	unsigned int		requested_headroom;
};

82 struct veth_xdp_tx_bq { 83 struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; 84 unsigned int count; 85 }; 86 87 /* 88 * ethtool interface 89 */ 90 91 struct veth_q_stat_desc { 92 char desc[ETH_GSTRING_LEN]; 93 size_t offset; 94 }; 95 96 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 97 98 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 99 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 100 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 101 { "drops", VETH_RQ_STAT(rx_drops) }, 102 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 103 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 104 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 105 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 106 }; 107 108 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 109 110 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 111 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 112 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 113 }; 114 115 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 116 117 static struct { 118 const char string[ETH_GSTRING_LEN]; 119 } ethtool_stats_keys[] = { 120 { "peer_ifindex" }, 121 }; 122 123 struct veth_xdp_buff { 124 struct xdp_buff xdp; 125 struct sk_buff *skb; 126 }; 127 128 static int veth_get_link_ksettings(struct net_device *dev, 129 struct ethtool_link_ksettings *cmd) 130 { 131 cmd->base.speed = SPEED_10000; 132 cmd->base.duplex = DUPLEX_FULL; 133 cmd->base.port = PORT_TP; 134 cmd->base.autoneg = AUTONEG_DISABLE; 135 return 0; 136 } 137 138 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 139 { 140 strscpy(info->driver, DRV_NAME, sizeof(info->driver)); 141 strscpy(info->version, DRV_VERSION, sizeof(info->version)); 142 } 143 144 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 145 { 146 u8 *p = buf; 147 int i, j; 148 149 switch(stringset) { 150 case ETH_SS_STATS: 151 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 152 p += sizeof(ethtool_stats_keys); 
153 for (i = 0; i < dev->real_num_rx_queues; i++) 154 for (j = 0; j < VETH_RQ_STATS_LEN; j++) 155 ethtool_sprintf(&p, "rx_queue_%u_%.18s", 156 i, veth_rq_stats_desc[j].desc); 157 158 for (i = 0; i < dev->real_num_tx_queues; i++) 159 for (j = 0; j < VETH_TQ_STATS_LEN; j++) 160 ethtool_sprintf(&p, "tx_queue_%u_%.18s", 161 i, veth_tq_stats_desc[j].desc); 162 163 page_pool_ethtool_stats_get_strings(p); 164 break; 165 } 166 } 167 168 static int veth_get_sset_count(struct net_device *dev, int sset) 169 { 170 switch (sset) { 171 case ETH_SS_STATS: 172 return ARRAY_SIZE(ethtool_stats_keys) + 173 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 174 VETH_TQ_STATS_LEN * dev->real_num_tx_queues + 175 page_pool_ethtool_stats_get_count(); 176 default: 177 return -EOPNOTSUPP; 178 } 179 } 180 181 static void veth_get_page_pool_stats(struct net_device *dev, u64 *data) 182 { 183 #ifdef CONFIG_PAGE_POOL_STATS 184 struct veth_priv *priv = netdev_priv(dev); 185 struct page_pool_stats pp_stats = {}; 186 int i; 187 188 for (i = 0; i < dev->real_num_rx_queues; i++) { 189 if (!priv->rq[i].page_pool) 190 continue; 191 page_pool_get_stats(priv->rq[i].page_pool, &pp_stats); 192 } 193 page_pool_ethtool_stats_get(data, &pp_stats); 194 #endif /* CONFIG_PAGE_POOL_STATS */ 195 } 196 197 static void veth_get_ethtool_stats(struct net_device *dev, 198 struct ethtool_stats *stats, u64 *data) 199 { 200 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 201 struct net_device *peer = rtnl_dereference(priv->peer); 202 int i, j, idx, pp_idx; 203 204 data[0] = peer ? 
peer->ifindex : 0; 205 idx = 1; 206 for (i = 0; i < dev->real_num_rx_queues; i++) { 207 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 208 const void *stats_base = (void *)&rq_stats->vs; 209 unsigned int start; 210 size_t offset; 211 212 do { 213 start = u64_stats_fetch_begin(&rq_stats->syncp); 214 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 215 offset = veth_rq_stats_desc[j].offset; 216 data[idx + j] = *(u64 *)(stats_base + offset); 217 } 218 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 219 idx += VETH_RQ_STATS_LEN; 220 } 221 pp_idx = idx; 222 223 if (!peer) 224 goto page_pool_stats; 225 226 rcv_priv = netdev_priv(peer); 227 for (i = 0; i < peer->real_num_rx_queues; i++) { 228 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 229 const void *base = (void *)&rq_stats->vs; 230 unsigned int start, tx_idx = idx; 231 size_t offset; 232 233 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 234 do { 235 start = u64_stats_fetch_begin(&rq_stats->syncp); 236 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 237 offset = veth_tq_stats_desc[j].offset; 238 data[tx_idx + j] += *(u64 *)(base + offset); 239 } 240 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 241 } 242 pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN; 243 244 page_pool_stats: 245 veth_get_page_pool_stats(dev, &data[pp_idx]); 246 } 247 248 static void veth_get_channels(struct net_device *dev, 249 struct ethtool_channels *channels) 250 { 251 channels->tx_count = dev->real_num_tx_queues; 252 channels->rx_count = dev->real_num_rx_queues; 253 channels->max_tx = dev->num_tx_queues; 254 channels->max_rx = dev->num_rx_queues; 255 } 256 257 static int veth_set_channels(struct net_device *dev, 258 struct ethtool_channels *ch); 259 260 static const struct ethtool_ops veth_ethtool_ops = { 261 .get_drvinfo = veth_get_drvinfo, 262 .get_link = ethtool_op_get_link, 263 .get_strings = veth_get_strings, 264 .get_sset_count = veth_get_sset_count, 265 .get_ethtool_stats = 
veth_get_ethtool_stats, 266 .get_link_ksettings = veth_get_link_ksettings, 267 .get_ts_info = ethtool_op_get_ts_info, 268 .get_channels = veth_get_channels, 269 .set_channels = veth_set_channels, 270 }; 271 272 /* general routines */ 273 274 static bool veth_is_xdp_frame(void *ptr) 275 { 276 return (unsigned long)ptr & VETH_XDP_FLAG; 277 } 278 279 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 280 { 281 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 282 } 283 284 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 285 { 286 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 287 } 288 289 static void veth_ptr_free(void *ptr) 290 { 291 if (veth_is_xdp_frame(ptr)) 292 xdp_return_frame(veth_ptr_to_xdp(ptr)); 293 else 294 kfree_skb(ptr); 295 } 296 297 static void __veth_xdp_flush(struct veth_rq *rq) 298 { 299 /* Write ptr_ring before reading rx_notify_masked */ 300 smp_mb(); 301 if (!READ_ONCE(rq->rx_notify_masked) && 302 napi_schedule_prep(&rq->xdp_napi)) { 303 WRITE_ONCE(rq->rx_notify_masked, true); 304 __napi_schedule(&rq->xdp_napi); 305 } 306 } 307 308 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 309 { 310 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 311 dev_kfree_skb_any(skb); 312 return NET_RX_DROP; 313 } 314 315 return NET_RX_SUCCESS; 316 } 317 318 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 319 struct veth_rq *rq, bool xdp) 320 { 321 return __dev_forward_skb(dev, skb) ?: xdp ? 322 veth_xdp_rx(rq, skb) : 323 __netif_rx(skb); 324 } 325 326 /* return true if the specified skb has chances of GRO aggregation 327 * Don't strive for accuracy, but try to avoid GRO overhead in the most 328 * common scenarios. 329 * When XDP is enabled, all traffic is considered eligible, as the xmit 330 * device has TSO off. 
331 * When TSO is enabled on the xmit device, we are likely interested only 332 * in UDP aggregation, explicitly check for that if the skb is suspected 333 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - 334 * to belong to locally generated UDP traffic. 335 */ 336 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, 337 const struct net_device *rcv, 338 const struct sk_buff *skb) 339 { 340 return !(dev->features & NETIF_F_ALL_TSO) || 341 (skb->destructor == sock_wfree && 342 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); 343 } 344 345 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 346 { 347 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 348 struct veth_rq *rq = NULL; 349 int ret = NETDEV_TX_OK; 350 struct net_device *rcv; 351 int length = skb->len; 352 bool use_napi = false; 353 int rxq; 354 355 rcu_read_lock(); 356 rcv = rcu_dereference(priv->peer); 357 if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { 358 kfree_skb(skb); 359 goto drop; 360 } 361 362 rcv_priv = netdev_priv(rcv); 363 rxq = skb_get_queue_mapping(skb); 364 if (rxq < rcv->real_num_rx_queues) { 365 rq = &rcv_priv->rq[rxq]; 366 367 /* The napi pointer is available when an XDP program is 368 * attached or when GRO is enabled 369 * Don't bother with napi/GRO if the skb can't be aggregated 370 */ 371 use_napi = rcu_access_pointer(rq->napi) && 372 veth_skb_is_eligible_for_gro(dev, rcv, skb); 373 } 374 375 skb_tx_timestamp(skb); 376 if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { 377 if (!use_napi) 378 dev_sw_netstats_tx_add(dev, 1, length); 379 else 380 __veth_xdp_flush(rq); 381 } else { 382 drop: 383 atomic64_inc(&priv->dropped); 384 ret = NET_XMIT_DROP; 385 } 386 387 rcu_read_unlock(); 388 389 return ret; 390 } 391 392 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 393 { 394 struct veth_priv *priv = netdev_priv(dev); 395 int i; 396 397 
result->peer_tq_xdp_xmit_err = 0; 398 result->xdp_packets = 0; 399 result->xdp_tx_err = 0; 400 result->xdp_bytes = 0; 401 result->rx_drops = 0; 402 for (i = 0; i < dev->num_rx_queues; i++) { 403 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 404 struct veth_rq_stats *stats = &priv->rq[i].stats; 405 unsigned int start; 406 407 do { 408 start = u64_stats_fetch_begin(&stats->syncp); 409 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 410 xdp_tx_err = stats->vs.xdp_tx_err; 411 packets = stats->vs.xdp_packets; 412 bytes = stats->vs.xdp_bytes; 413 drops = stats->vs.rx_drops; 414 } while (u64_stats_fetch_retry(&stats->syncp, start)); 415 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 416 result->xdp_tx_err += xdp_tx_err; 417 result->xdp_packets += packets; 418 result->xdp_bytes += bytes; 419 result->rx_drops += drops; 420 } 421 } 422 423 static void veth_get_stats64(struct net_device *dev, 424 struct rtnl_link_stats64 *tot) 425 { 426 struct veth_priv *priv = netdev_priv(dev); 427 struct net_device *peer; 428 struct veth_stats rx; 429 430 tot->tx_dropped = atomic64_read(&priv->dropped); 431 dev_fetch_sw_netstats(tot, dev->tstats); 432 433 veth_stats_rx(&rx, dev); 434 tot->tx_dropped += rx.xdp_tx_err; 435 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 436 tot->rx_bytes += rx.xdp_bytes; 437 tot->rx_packets += rx.xdp_packets; 438 439 rcu_read_lock(); 440 peer = rcu_dereference(priv->peer); 441 if (peer) { 442 struct rtnl_link_stats64 tot_peer = {}; 443 444 dev_fetch_sw_netstats(&tot_peer, peer->tstats); 445 tot->rx_bytes += tot_peer.tx_bytes; 446 tot->rx_packets += tot_peer.tx_packets; 447 448 veth_stats_rx(&rx, peer); 449 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 450 tot->rx_dropped += rx.xdp_tx_err; 451 tot->tx_bytes += rx.xdp_bytes; 452 tot->tx_packets += rx.xdp_packets; 453 } 454 rcu_read_unlock(); 455 } 456 457 /* fake multicast ability */ 458 static void veth_set_multicast_list(struct net_device *dev) 459 { 460 } 461 462 static 
int veth_select_rxq(struct net_device *dev) 463 { 464 return smp_processor_id() % dev->real_num_rx_queues; 465 } 466 467 static struct net_device *veth_peer_dev(struct net_device *dev) 468 { 469 struct veth_priv *priv = netdev_priv(dev); 470 471 /* Callers must be under RCU read side. */ 472 return rcu_dereference(priv->peer); 473 } 474 475 static int veth_xdp_xmit(struct net_device *dev, int n, 476 struct xdp_frame **frames, 477 u32 flags, bool ndo_xmit) 478 { 479 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 480 int i, ret = -ENXIO, nxmit = 0; 481 struct net_device *rcv; 482 unsigned int max_len; 483 struct veth_rq *rq; 484 485 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 486 return -EINVAL; 487 488 rcu_read_lock(); 489 rcv = rcu_dereference(priv->peer); 490 if (unlikely(!rcv)) 491 goto out; 492 493 rcv_priv = netdev_priv(rcv); 494 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 495 /* The napi pointer is set if NAPI is enabled, which ensures that 496 * xdp_ring is initialized on receive side and the peer device is up. 
497 */ 498 if (!rcu_access_pointer(rq->napi)) 499 goto out; 500 501 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 502 503 spin_lock(&rq->xdp_ring.producer_lock); 504 for (i = 0; i < n; i++) { 505 struct xdp_frame *frame = frames[i]; 506 void *ptr = veth_xdp_to_ptr(frame); 507 508 if (unlikely(xdp_get_frame_len(frame) > max_len || 509 __ptr_ring_produce(&rq->xdp_ring, ptr))) 510 break; 511 nxmit++; 512 } 513 spin_unlock(&rq->xdp_ring.producer_lock); 514 515 if (flags & XDP_XMIT_FLUSH) 516 __veth_xdp_flush(rq); 517 518 ret = nxmit; 519 if (ndo_xmit) { 520 u64_stats_update_begin(&rq->stats.syncp); 521 rq->stats.vs.peer_tq_xdp_xmit += nxmit; 522 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; 523 u64_stats_update_end(&rq->stats.syncp); 524 } 525 526 out: 527 rcu_read_unlock(); 528 529 return ret; 530 } 531 532 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 533 struct xdp_frame **frames, u32 flags) 534 { 535 int err; 536 537 err = veth_xdp_xmit(dev, n, frames, flags, true); 538 if (err < 0) { 539 struct veth_priv *priv = netdev_priv(dev); 540 541 atomic64_add(n, &priv->dropped); 542 } 543 544 return err; 545 } 546 547 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 548 { 549 int sent, i, err = 0, drops; 550 551 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 552 if (sent < 0) { 553 err = sent; 554 sent = 0; 555 } 556 557 for (i = sent; unlikely(i < bq->count); i++) 558 xdp_return_frame(bq->q[i]); 559 560 drops = bq->count - sent; 561 trace_xdp_bulk_tx(rq->dev, sent, drops, err); 562 563 u64_stats_update_begin(&rq->stats.syncp); 564 rq->stats.vs.xdp_tx += sent; 565 rq->stats.vs.xdp_tx_err += drops; 566 u64_stats_update_end(&rq->stats.syncp); 567 568 bq->count = 0; 569 } 570 571 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 572 { 573 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 574 struct net_device *rcv; 575 struct veth_rq *rcv_rq; 576 577 rcu_read_lock(); 578 
veth_xdp_flush_bq(rq, bq); 579 rcv = rcu_dereference(priv->peer); 580 if (unlikely(!rcv)) 581 goto out; 582 583 rcv_priv = netdev_priv(rcv); 584 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 585 /* xdp_ring is initialized on receive side? */ 586 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 587 goto out; 588 589 __veth_xdp_flush(rcv_rq); 590 out: 591 rcu_read_unlock(); 592 } 593 594 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 595 struct veth_xdp_tx_bq *bq) 596 { 597 struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); 598 599 if (unlikely(!frame)) 600 return -EOVERFLOW; 601 602 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 603 veth_xdp_flush_bq(rq, bq); 604 605 bq->q[bq->count++] = frame; 606 607 return 0; 608 } 609 610 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, 611 struct xdp_frame *frame, 612 struct veth_xdp_tx_bq *bq, 613 struct veth_stats *stats) 614 { 615 struct xdp_frame orig_frame; 616 struct bpf_prog *xdp_prog; 617 618 rcu_read_lock(); 619 xdp_prog = rcu_dereference(rq->xdp_prog); 620 if (likely(xdp_prog)) { 621 struct veth_xdp_buff vxbuf; 622 struct xdp_buff *xdp = &vxbuf.xdp; 623 u32 act; 624 625 xdp_convert_frame_to_buff(frame, xdp); 626 xdp->rxq = &rq->xdp_rxq; 627 vxbuf.skb = NULL; 628 629 act = bpf_prog_run_xdp(xdp_prog, xdp); 630 631 switch (act) { 632 case XDP_PASS: 633 if (xdp_update_frame_from_buff(xdp, frame)) 634 goto err_xdp; 635 break; 636 case XDP_TX: 637 orig_frame = *frame; 638 xdp->rxq->mem.type = frame->mem_type; 639 if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { 640 trace_xdp_exception(rq->dev, xdp_prog, act); 641 frame = &orig_frame; 642 stats->rx_drops++; 643 goto err_xdp; 644 } 645 stats->xdp_tx++; 646 rcu_read_unlock(); 647 goto xdp_xmit; 648 case XDP_REDIRECT: 649 orig_frame = *frame; 650 xdp->rxq->mem.type = frame->mem_type; 651 if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { 652 frame = &orig_frame; 653 stats->rx_drops++; 654 goto err_xdp; 655 } 656 stats->xdp_redirect++; 657 
rcu_read_unlock(); 658 goto xdp_xmit; 659 default: 660 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 661 fallthrough; 662 case XDP_ABORTED: 663 trace_xdp_exception(rq->dev, xdp_prog, act); 664 fallthrough; 665 case XDP_DROP: 666 stats->xdp_drops++; 667 goto err_xdp; 668 } 669 } 670 rcu_read_unlock(); 671 672 return frame; 673 err_xdp: 674 rcu_read_unlock(); 675 xdp_return_frame(frame); 676 xdp_xmit: 677 return NULL; 678 } 679 680 /* frames array contains VETH_XDP_BATCH at most */ 681 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, 682 int n_xdpf, struct veth_xdp_tx_bq *bq, 683 struct veth_stats *stats) 684 { 685 void *skbs[VETH_XDP_BATCH]; 686 int i; 687 688 if (unlikely(!napi_skb_cache_get_bulk(skbs, n_xdpf))) { 689 for (i = 0; i < n_xdpf; i++) 690 xdp_return_frame(frames[i]); 691 stats->rx_drops += n_xdpf; 692 693 return; 694 } 695 696 for (i = 0; i < n_xdpf; i++) { 697 struct sk_buff *skb = skbs[i]; 698 699 skb = __xdp_build_skb_from_frame(frames[i], skb, 700 rq->dev); 701 if (!skb) { 702 xdp_return_frame(frames[i]); 703 stats->rx_drops++; 704 continue; 705 } 706 napi_gro_receive(&rq->xdp_napi, skb); 707 } 708 } 709 710 static void veth_xdp_get(struct xdp_buff *xdp) 711 { 712 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); 713 int i; 714 715 get_page(virt_to_page(xdp->data)); 716 if (likely(!xdp_buff_has_frags(xdp))) 717 return; 718 719 for (i = 0; i < sinfo->nr_frags; i++) 720 __skb_frag_ref(&sinfo->frags[i]); 721 } 722 723 static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, 724 struct xdp_buff *xdp, 725 struct sk_buff **pskb) 726 { 727 struct sk_buff *skb = *pskb; 728 u32 frame_sz; 729 730 if (skb_shared(skb) || skb_head_is_locked(skb) || 731 skb_shinfo(skb)->nr_frags || 732 skb_headroom(skb) < XDP_PACKET_HEADROOM) { 733 if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM)) 734 goto drop; 735 736 skb = *pskb; 737 } 738 739 /* SKB "head" area always have tailroom for skb_shared_info */ 740 
frame_sz = skb_end_pointer(skb) - skb->head; 741 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 742 xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); 743 xdp_prepare_buff(xdp, skb->head, skb_headroom(skb), 744 skb_headlen(skb), true); 745 746 if (skb_is_nonlinear(skb)) { 747 skb_shinfo(skb)->xdp_frags_size = skb->data_len; 748 xdp_buff_set_frags_flag(xdp); 749 } else { 750 xdp_buff_clear_frags_flag(xdp); 751 } 752 *pskb = skb; 753 754 return 0; 755 drop: 756 consume_skb(skb); 757 *pskb = NULL; 758 759 return -ENOMEM; 760 } 761 762 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 763 struct sk_buff *skb, 764 struct veth_xdp_tx_bq *bq, 765 struct veth_stats *stats) 766 { 767 void *orig_data, *orig_data_end; 768 struct bpf_prog *xdp_prog; 769 struct veth_xdp_buff vxbuf; 770 struct xdp_buff *xdp = &vxbuf.xdp; 771 u32 act, metalen; 772 int off; 773 774 skb_prepare_for_gro(skb); 775 776 rcu_read_lock(); 777 xdp_prog = rcu_dereference(rq->xdp_prog); 778 if (unlikely(!xdp_prog)) { 779 rcu_read_unlock(); 780 goto out; 781 } 782 783 __skb_push(skb, skb->data - skb_mac_header(skb)); 784 if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb)) 785 goto drop; 786 vxbuf.skb = skb; 787 788 orig_data = xdp->data; 789 orig_data_end = xdp->data_end; 790 791 act = bpf_prog_run_xdp(xdp_prog, xdp); 792 793 switch (act) { 794 case XDP_PASS: 795 break; 796 case XDP_TX: 797 veth_xdp_get(xdp); 798 consume_skb(skb); 799 xdp->rxq->mem = rq->xdp_mem; 800 if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { 801 trace_xdp_exception(rq->dev, xdp_prog, act); 802 stats->rx_drops++; 803 goto err_xdp; 804 } 805 stats->xdp_tx++; 806 rcu_read_unlock(); 807 goto xdp_xmit; 808 case XDP_REDIRECT: 809 veth_xdp_get(xdp); 810 consume_skb(skb); 811 xdp->rxq->mem = rq->xdp_mem; 812 if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { 813 stats->rx_drops++; 814 goto err_xdp; 815 } 816 stats->xdp_redirect++; 817 rcu_read_unlock(); 818 goto xdp_xmit; 819 default: 820 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, 
act); 821 fallthrough; 822 case XDP_ABORTED: 823 trace_xdp_exception(rq->dev, xdp_prog, act); 824 fallthrough; 825 case XDP_DROP: 826 stats->xdp_drops++; 827 goto xdp_drop; 828 } 829 rcu_read_unlock(); 830 831 /* check if bpf_xdp_adjust_head was used */ 832 off = orig_data - xdp->data; 833 if (off > 0) 834 __skb_push(skb, off); 835 else if (off < 0) 836 __skb_pull(skb, -off); 837 838 skb_reset_mac_header(skb); 839 840 /* check if bpf_xdp_adjust_tail was used */ 841 off = xdp->data_end - orig_data_end; 842 if (off != 0) 843 __skb_put(skb, off); /* positive on grow, negative on shrink */ 844 845 /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers 846 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. 847 */ 848 if (xdp_buff_has_frags(xdp)) 849 skb->data_len = skb_shinfo(skb)->xdp_frags_size; 850 else 851 skb->data_len = 0; 852 853 skb->protocol = eth_type_trans(skb, rq->dev); 854 855 metalen = xdp->data - xdp->data_meta; 856 if (metalen) 857 skb_metadata_set(skb, metalen); 858 out: 859 return skb; 860 drop: 861 stats->rx_drops++; 862 xdp_drop: 863 rcu_read_unlock(); 864 kfree_skb(skb); 865 return NULL; 866 err_xdp: 867 rcu_read_unlock(); 868 xdp_return_buff(xdp); 869 xdp_xmit: 870 return NULL; 871 } 872 873 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 874 struct veth_xdp_tx_bq *bq, 875 struct veth_stats *stats) 876 { 877 int i, done = 0, n_xdpf = 0; 878 void *xdpf[VETH_XDP_BATCH]; 879 880 for (i = 0; i < budget; i++) { 881 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 882 883 if (!ptr) 884 break; 885 886 if (veth_is_xdp_frame(ptr)) { 887 /* ndo_xdp_xmit */ 888 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 889 890 stats->xdp_bytes += xdp_get_frame_len(frame); 891 frame = veth_xdp_rcv_one(rq, frame, bq, stats); 892 if (frame) { 893 /* XDP_PASS */ 894 xdpf[n_xdpf++] = frame; 895 if (n_xdpf == VETH_XDP_BATCH) { 896 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, 897 bq, stats); 898 n_xdpf = 0; 899 } 900 } 901 } else { 902 /* ndo_start_xmit 
*/ 903 struct sk_buff *skb = ptr; 904 905 stats->xdp_bytes += skb->len; 906 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 907 if (skb) { 908 if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) 909 netif_receive_skb(skb); 910 else 911 napi_gro_receive(&rq->xdp_napi, skb); 912 } 913 } 914 done++; 915 } 916 917 if (n_xdpf) 918 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); 919 920 u64_stats_update_begin(&rq->stats.syncp); 921 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 922 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 923 rq->stats.vs.xdp_drops += stats->xdp_drops; 924 rq->stats.vs.rx_drops += stats->rx_drops; 925 rq->stats.vs.xdp_packets += done; 926 u64_stats_update_end(&rq->stats.syncp); 927 928 return done; 929 } 930 931 static int veth_poll(struct napi_struct *napi, int budget) 932 { 933 struct veth_rq *rq = 934 container_of(napi, struct veth_rq, xdp_napi); 935 struct veth_stats stats = {}; 936 struct veth_xdp_tx_bq bq; 937 int done; 938 939 bq.count = 0; 940 941 xdp_set_return_frame_no_direct(); 942 done = veth_xdp_rcv(rq, budget, &bq, &stats); 943 944 if (stats.xdp_redirect > 0) 945 xdp_do_flush(); 946 947 if (done < budget && napi_complete_done(napi, done)) { 948 /* Write rx_notify_masked before reading ptr_ring */ 949 smp_store_mb(rq->rx_notify_masked, false); 950 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 951 if (napi_schedule_prep(&rq->xdp_napi)) { 952 WRITE_ONCE(rq->rx_notify_masked, true); 953 __napi_schedule(&rq->xdp_napi); 954 } 955 } 956 } 957 958 if (stats.xdp_tx > 0) 959 veth_xdp_flush(rq, &bq); 960 xdp_clear_return_frame_no_direct(); 961 962 return done; 963 } 964 965 static int veth_create_page_pool(struct veth_rq *rq) 966 { 967 struct page_pool_params pp_params = { 968 .order = 0, 969 .pool_size = VETH_RING_SIZE, 970 .nid = NUMA_NO_NODE, 971 .dev = &rq->dev->dev, 972 }; 973 974 rq->page_pool = page_pool_create(&pp_params); 975 if (IS_ERR(rq->page_pool)) { 976 int err = PTR_ERR(rq->page_pool); 977 978 rq->page_pool = NULL; 979 return 
err; 980 } 981 982 return 0; 983 } 984 985 static int __veth_napi_enable_range(struct net_device *dev, int start, int end) 986 { 987 struct veth_priv *priv = netdev_priv(dev); 988 int err, i; 989 990 for (i = start; i < end; i++) { 991 err = veth_create_page_pool(&priv->rq[i]); 992 if (err) 993 goto err_page_pool; 994 } 995 996 for (i = start; i < end; i++) { 997 struct veth_rq *rq = &priv->rq[i]; 998 999 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 1000 if (err) 1001 goto err_xdp_ring; 1002 } 1003 1004 for (i = start; i < end; i++) { 1005 struct veth_rq *rq = &priv->rq[i]; 1006 1007 napi_enable(&rq->xdp_napi); 1008 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1009 } 1010 1011 return 0; 1012 1013 err_xdp_ring: 1014 for (i--; i >= start; i--) 1015 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 1016 i = end; 1017 err_page_pool: 1018 for (i--; i >= start; i--) { 1019 page_pool_destroy(priv->rq[i].page_pool); 1020 priv->rq[i].page_pool = NULL; 1021 } 1022 1023 return err; 1024 } 1025 1026 static int __veth_napi_enable(struct net_device *dev) 1027 { 1028 return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1029 } 1030 1031 static void veth_napi_del_range(struct net_device *dev, int start, int end) 1032 { 1033 struct veth_priv *priv = netdev_priv(dev); 1034 int i; 1035 1036 for (i = start; i < end; i++) { 1037 struct veth_rq *rq = &priv->rq[i]; 1038 1039 rcu_assign_pointer(priv->rq[i].napi, NULL); 1040 napi_disable(&rq->xdp_napi); 1041 __netif_napi_del(&rq->xdp_napi); 1042 } 1043 synchronize_net(); 1044 1045 for (i = start; i < end; i++) { 1046 struct veth_rq *rq = &priv->rq[i]; 1047 1048 rq->rx_notify_masked = false; 1049 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 1050 } 1051 1052 for (i = start; i < end; i++) { 1053 page_pool_destroy(priv->rq[i].page_pool); 1054 priv->rq[i].page_pool = NULL; 1055 } 1056 } 1057 1058 static void veth_napi_del(struct net_device *dev) 1059 { 1060 veth_napi_del_range(dev, 0, 
dev->real_num_rx_queues); 1061 } 1062 1063 static bool veth_gro_requested(const struct net_device *dev) 1064 { 1065 return !!(dev->wanted_features & NETIF_F_GRO); 1066 } 1067 1068 static int veth_enable_xdp_range(struct net_device *dev, int start, int end, 1069 bool napi_already_on) 1070 { 1071 struct veth_priv *priv = netdev_priv(dev); 1072 int err, i; 1073 1074 for (i = start; i < end; i++) { 1075 struct veth_rq *rq = &priv->rq[i]; 1076 1077 if (!napi_already_on) 1078 netif_napi_add(dev, &rq->xdp_napi, veth_poll); 1079 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 1080 if (err < 0) 1081 goto err_rxq_reg; 1082 1083 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 1084 MEM_TYPE_PAGE_SHARED, 1085 NULL); 1086 if (err < 0) 1087 goto err_reg_mem; 1088 1089 /* Save original mem info as it can be overwritten */ 1090 rq->xdp_mem = rq->xdp_rxq.mem; 1091 } 1092 return 0; 1093 1094 err_reg_mem: 1095 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 1096 err_rxq_reg: 1097 for (i--; i >= start; i--) { 1098 struct veth_rq *rq = &priv->rq[i]; 1099 1100 xdp_rxq_info_unreg(&rq->xdp_rxq); 1101 if (!napi_already_on) 1102 netif_napi_del(&rq->xdp_napi); 1103 } 1104 1105 return err; 1106 } 1107 1108 static void veth_disable_xdp_range(struct net_device *dev, int start, int end, 1109 bool delete_napi) 1110 { 1111 struct veth_priv *priv = netdev_priv(dev); 1112 int i; 1113 1114 for (i = start; i < end; i++) { 1115 struct veth_rq *rq = &priv->rq[i]; 1116 1117 rq->xdp_rxq.mem = rq->xdp_mem; 1118 xdp_rxq_info_unreg(&rq->xdp_rxq); 1119 1120 if (delete_napi) 1121 netif_napi_del(&rq->xdp_napi); 1122 } 1123 } 1124 1125 static int veth_enable_xdp(struct net_device *dev) 1126 { 1127 bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); 1128 struct veth_priv *priv = netdev_priv(dev); 1129 int err, i; 1130 1131 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 1132 err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); 1133 if (err) 1134 return 
err; 1135 1136 if (!napi_already_on) { 1137 err = __veth_napi_enable(dev); 1138 if (err) { 1139 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); 1140 return err; 1141 } 1142 } 1143 } 1144 1145 for (i = 0; i < dev->real_num_rx_queues; i++) { 1146 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 1147 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1148 } 1149 1150 return 0; 1151 } 1152 1153 static void veth_disable_xdp(struct net_device *dev) 1154 { 1155 struct veth_priv *priv = netdev_priv(dev); 1156 int i; 1157 1158 for (i = 0; i < dev->real_num_rx_queues; i++) 1159 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 1160 1161 if (!netif_running(dev) || !veth_gro_requested(dev)) 1162 veth_napi_del(dev); 1163 1164 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); 1165 } 1166 1167 static int veth_napi_enable_range(struct net_device *dev, int start, int end) 1168 { 1169 struct veth_priv *priv = netdev_priv(dev); 1170 int err, i; 1171 1172 for (i = start; i < end; i++) { 1173 struct veth_rq *rq = &priv->rq[i]; 1174 1175 netif_napi_add(dev, &rq->xdp_napi, veth_poll); 1176 } 1177 1178 err = __veth_napi_enable_range(dev, start, end); 1179 if (err) { 1180 for (i = start; i < end; i++) { 1181 struct veth_rq *rq = &priv->rq[i]; 1182 1183 netif_napi_del(&rq->xdp_napi); 1184 } 1185 return err; 1186 } 1187 return err; 1188 } 1189 1190 static int veth_napi_enable(struct net_device *dev) 1191 { 1192 return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1193 } 1194 1195 static void veth_disable_range_safe(struct net_device *dev, int start, int end) 1196 { 1197 struct veth_priv *priv = netdev_priv(dev); 1198 1199 if (start >= end) 1200 return; 1201 1202 if (priv->_xdp_prog) { 1203 veth_napi_del_range(dev, start, end); 1204 veth_disable_xdp_range(dev, start, end, false); 1205 } else if (veth_gro_requested(dev)) { 1206 veth_napi_del_range(dev, start, end); 1207 } 1208 } 1209 1210 static int veth_enable_range_safe(struct 
net_device *dev, int start, int end) 1211 { 1212 struct veth_priv *priv = netdev_priv(dev); 1213 int err; 1214 1215 if (start >= end) 1216 return 0; 1217 1218 if (priv->_xdp_prog) { 1219 /* these channels are freshly initialized, napi is not on there even 1220 * when GRO is requeste 1221 */ 1222 err = veth_enable_xdp_range(dev, start, end, false); 1223 if (err) 1224 return err; 1225 1226 err = __veth_napi_enable_range(dev, start, end); 1227 if (err) { 1228 /* on error always delete the newly added napis */ 1229 veth_disable_xdp_range(dev, start, end, true); 1230 return err; 1231 } 1232 } else if (veth_gro_requested(dev)) { 1233 return veth_napi_enable_range(dev, start, end); 1234 } 1235 return 0; 1236 } 1237 1238 static void veth_set_xdp_features(struct net_device *dev) 1239 { 1240 struct veth_priv *priv = netdev_priv(dev); 1241 struct net_device *peer; 1242 1243 peer = rtnl_dereference(priv->peer); 1244 if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { 1245 struct veth_priv *priv_peer = netdev_priv(peer); 1246 xdp_features_t val = NETDEV_XDP_ACT_BASIC | 1247 NETDEV_XDP_ACT_REDIRECT | 1248 NETDEV_XDP_ACT_RX_SG; 1249 1250 if (priv_peer->_xdp_prog || veth_gro_requested(peer)) 1251 val |= NETDEV_XDP_ACT_NDO_XMIT | 1252 NETDEV_XDP_ACT_NDO_XMIT_SG; 1253 xdp_set_features_flag(dev, val); 1254 } else { 1255 xdp_clear_features_flag(dev); 1256 } 1257 } 1258 1259 static int veth_set_channels(struct net_device *dev, 1260 struct ethtool_channels *ch) 1261 { 1262 struct veth_priv *priv = netdev_priv(dev); 1263 unsigned int old_rx_count, new_rx_count; 1264 struct veth_priv *peer_priv; 1265 struct net_device *peer; 1266 int err; 1267 1268 /* sanity check. Upper bounds are already enforced by the caller */ 1269 if (!ch->rx_count || !ch->tx_count) 1270 return -EINVAL; 1271 1272 /* avoid braking XDP, if that is enabled */ 1273 peer = rtnl_dereference(priv->peer); 1274 peer_priv = peer ? 
netdev_priv(peer) : NULL; 1275 if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) 1276 return -EINVAL; 1277 1278 if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) 1279 return -EINVAL; 1280 1281 old_rx_count = dev->real_num_rx_queues; 1282 new_rx_count = ch->rx_count; 1283 if (netif_running(dev)) { 1284 /* turn device off */ 1285 netif_carrier_off(dev); 1286 if (peer) 1287 netif_carrier_off(peer); 1288 1289 /* try to allocate new resurces, as needed*/ 1290 err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); 1291 if (err) 1292 goto out; 1293 } 1294 1295 err = netif_set_real_num_rx_queues(dev, ch->rx_count); 1296 if (err) 1297 goto revert; 1298 1299 err = netif_set_real_num_tx_queues(dev, ch->tx_count); 1300 if (err) { 1301 int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); 1302 1303 /* this error condition could happen only if rx and tx change 1304 * in opposite directions (e.g. tx nr raises, rx nr decreases) 1305 * and we can't do anything to fully restore the original 1306 * status 1307 */ 1308 if (err2) 1309 pr_warn("Can't restore rx queues config %d -> %d %d", 1310 new_rx_count, old_rx_count, err2); 1311 else 1312 goto revert; 1313 } 1314 1315 out: 1316 if (netif_running(dev)) { 1317 /* note that we need to swap the arguments WRT the enable part 1318 * to identify the range we have to disable 1319 */ 1320 veth_disable_range_safe(dev, new_rx_count, old_rx_count); 1321 netif_carrier_on(dev); 1322 if (peer) 1323 netif_carrier_on(peer); 1324 } 1325 1326 /* update XDP supported features */ 1327 veth_set_xdp_features(dev); 1328 if (peer) 1329 veth_set_xdp_features(peer); 1330 1331 return err; 1332 1333 revert: 1334 new_rx_count = old_rx_count; 1335 old_rx_count = ch->rx_count; 1336 goto out; 1337 } 1338 1339 static int veth_open(struct net_device *dev) 1340 { 1341 struct veth_priv *priv = netdev_priv(dev); 1342 struct net_device *peer = rtnl_dereference(priv->peer); 1343 int err; 1344 1345 
if (!peer) 1346 return -ENOTCONN; 1347 1348 if (priv->_xdp_prog) { 1349 err = veth_enable_xdp(dev); 1350 if (err) 1351 return err; 1352 } else if (veth_gro_requested(dev)) { 1353 err = veth_napi_enable(dev); 1354 if (err) 1355 return err; 1356 } 1357 1358 if (peer->flags & IFF_UP) { 1359 netif_carrier_on(dev); 1360 netif_carrier_on(peer); 1361 } 1362 1363 veth_set_xdp_features(dev); 1364 1365 return 0; 1366 } 1367 1368 static int veth_close(struct net_device *dev) 1369 { 1370 struct veth_priv *priv = netdev_priv(dev); 1371 struct net_device *peer = rtnl_dereference(priv->peer); 1372 1373 netif_carrier_off(dev); 1374 if (peer) 1375 netif_carrier_off(peer); 1376 1377 if (priv->_xdp_prog) 1378 veth_disable_xdp(dev); 1379 else if (veth_gro_requested(dev)) 1380 veth_napi_del(dev); 1381 1382 return 0; 1383 } 1384 1385 static int is_valid_veth_mtu(int mtu) 1386 { 1387 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1388 } 1389 1390 static int veth_alloc_queues(struct net_device *dev) 1391 { 1392 struct veth_priv *priv = netdev_priv(dev); 1393 int i; 1394 1395 priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq), 1396 GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); 1397 if (!priv->rq) 1398 return -ENOMEM; 1399 1400 for (i = 0; i < dev->num_rx_queues; i++) { 1401 priv->rq[i].dev = dev; 1402 u64_stats_init(&priv->rq[i].stats.syncp); 1403 } 1404 1405 return 0; 1406 } 1407 1408 static void veth_free_queues(struct net_device *dev) 1409 { 1410 struct veth_priv *priv = netdev_priv(dev); 1411 1412 kvfree(priv->rq); 1413 } 1414 1415 static int veth_dev_init(struct net_device *dev) 1416 { 1417 netdev_lockdep_set_classes(dev); 1418 return veth_alloc_queues(dev); 1419 } 1420 1421 static void veth_dev_free(struct net_device *dev) 1422 { 1423 veth_free_queues(dev); 1424 } 1425 1426 #ifdef CONFIG_NET_POLL_CONTROLLER 1427 static void veth_poll_controller(struct net_device *dev) 1428 { 1429 /* veth only receives frames when its peer sends one 1430 * Since it has nothing to do with 
disabling irqs, we are guaranteed 1431 * never to have pending data when we poll for it so 1432 * there is nothing to do here. 1433 * 1434 * We need this though so netpoll recognizes us as an interface that 1435 * supports polling, which enables bridge devices in virt setups to 1436 * still use netconsole 1437 */ 1438 } 1439 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1440 1441 static int veth_get_iflink(const struct net_device *dev) 1442 { 1443 struct veth_priv *priv = netdev_priv(dev); 1444 struct net_device *peer; 1445 int iflink; 1446 1447 rcu_read_lock(); 1448 peer = rcu_dereference(priv->peer); 1449 iflink = peer ? READ_ONCE(peer->ifindex) : 0; 1450 rcu_read_unlock(); 1451 1452 return iflink; 1453 } 1454 1455 static netdev_features_t veth_fix_features(struct net_device *dev, 1456 netdev_features_t features) 1457 { 1458 struct veth_priv *priv = netdev_priv(dev); 1459 struct net_device *peer; 1460 1461 peer = rtnl_dereference(priv->peer); 1462 if (peer) { 1463 struct veth_priv *peer_priv = netdev_priv(peer); 1464 1465 if (peer_priv->_xdp_prog) 1466 features &= ~NETIF_F_GSO_SOFTWARE; 1467 } 1468 1469 return features; 1470 } 1471 1472 static int veth_set_features(struct net_device *dev, 1473 netdev_features_t features) 1474 { 1475 netdev_features_t changed = features ^ dev->features; 1476 struct veth_priv *priv = netdev_priv(dev); 1477 struct net_device *peer; 1478 int err; 1479 1480 if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) 1481 return 0; 1482 1483 peer = rtnl_dereference(priv->peer); 1484 if (features & NETIF_F_GRO) { 1485 err = veth_napi_enable(dev); 1486 if (err) 1487 return err; 1488 1489 if (peer) 1490 xdp_features_set_redirect_target(peer, true); 1491 } else { 1492 if (peer) 1493 xdp_features_clear_redirect_target(peer); 1494 veth_napi_del(dev); 1495 } 1496 return 0; 1497 } 1498 1499 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1500 { 1501 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1502 
struct net_device *peer; 1503 1504 if (new_hr < 0) 1505 new_hr = 0; 1506 1507 rcu_read_lock(); 1508 peer = rcu_dereference(priv->peer); 1509 if (unlikely(!peer)) 1510 goto out; 1511 1512 peer_priv = netdev_priv(peer); 1513 priv->requested_headroom = new_hr; 1514 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1515 dev->needed_headroom = new_hr; 1516 peer->needed_headroom = new_hr; 1517 1518 out: 1519 rcu_read_unlock(); 1520 } 1521 1522 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1523 struct netlink_ext_ack *extack) 1524 { 1525 struct veth_priv *priv = netdev_priv(dev); 1526 struct bpf_prog *old_prog; 1527 struct net_device *peer; 1528 unsigned int max_mtu; 1529 int err; 1530 1531 old_prog = priv->_xdp_prog; 1532 priv->_xdp_prog = prog; 1533 peer = rtnl_dereference(priv->peer); 1534 1535 if (prog) { 1536 if (!peer) { 1537 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1538 err = -ENOTCONN; 1539 goto err; 1540 } 1541 1542 max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) - 1543 peer->hard_header_len; 1544 /* Allow increasing the max_mtu if the program supports 1545 * XDP fragments. 
1546 */ 1547 if (prog->aux->xdp_has_frags) 1548 max_mtu += PAGE_SIZE * MAX_SKB_FRAGS; 1549 1550 if (peer->mtu > max_mtu) { 1551 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1552 err = -ERANGE; 1553 goto err; 1554 } 1555 1556 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1557 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1558 err = -ENOSPC; 1559 goto err; 1560 } 1561 1562 if (dev->flags & IFF_UP) { 1563 err = veth_enable_xdp(dev); 1564 if (err) { 1565 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1566 goto err; 1567 } 1568 } 1569 1570 if (!old_prog) { 1571 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1572 peer->max_mtu = max_mtu; 1573 } 1574 1575 xdp_features_set_redirect_target(peer, true); 1576 } 1577 1578 if (old_prog) { 1579 if (!prog) { 1580 if (peer && !veth_gro_requested(dev)) 1581 xdp_features_clear_redirect_target(peer); 1582 1583 if (dev->flags & IFF_UP) 1584 veth_disable_xdp(dev); 1585 1586 if (peer) { 1587 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1588 peer->max_mtu = ETH_MAX_MTU; 1589 } 1590 } 1591 bpf_prog_put(old_prog); 1592 } 1593 1594 if ((!!old_prog ^ !!prog) && peer) 1595 netdev_update_features(peer); 1596 1597 return 0; 1598 err: 1599 priv->_xdp_prog = old_prog; 1600 1601 return err; 1602 } 1603 1604 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1605 { 1606 switch (xdp->command) { 1607 case XDP_SETUP_PROG: 1608 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1609 default: 1610 return -EINVAL; 1611 } 1612 } 1613 1614 static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) 1615 { 1616 struct veth_xdp_buff *_ctx = (void *)ctx; 1617 1618 if (!_ctx->skb) 1619 return -ENODATA; 1620 1621 *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp; 1622 return 0; 1623 } 1624 1625 static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, 1626 enum xdp_rss_hash_type *rss_type) 1627 { 1628 struct veth_xdp_buff *_ctx = (void *)ctx; 1629 
struct sk_buff *skb = _ctx->skb; 1630 1631 if (!skb) 1632 return -ENODATA; 1633 1634 *hash = skb_get_hash(skb); 1635 *rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE; 1636 1637 return 0; 1638 } 1639 1640 static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto, 1641 u16 *vlan_tci) 1642 { 1643 const struct veth_xdp_buff *_ctx = (void *)ctx; 1644 const struct sk_buff *skb = _ctx->skb; 1645 int err; 1646 1647 if (!skb) 1648 return -ENODATA; 1649 1650 err = __vlan_hwaccel_get_tag(skb, vlan_tci); 1651 if (err) 1652 return err; 1653 1654 *vlan_proto = skb->vlan_proto; 1655 return err; 1656 } 1657 1658 static const struct net_device_ops veth_netdev_ops = { 1659 .ndo_init = veth_dev_init, 1660 .ndo_open = veth_open, 1661 .ndo_stop = veth_close, 1662 .ndo_start_xmit = veth_xmit, 1663 .ndo_get_stats64 = veth_get_stats64, 1664 .ndo_set_rx_mode = veth_set_multicast_list, 1665 .ndo_set_mac_address = eth_mac_addr, 1666 #ifdef CONFIG_NET_POLL_CONTROLLER 1667 .ndo_poll_controller = veth_poll_controller, 1668 #endif 1669 .ndo_get_iflink = veth_get_iflink, 1670 .ndo_fix_features = veth_fix_features, 1671 .ndo_set_features = veth_set_features, 1672 .ndo_features_check = passthru_features_check, 1673 .ndo_set_rx_headroom = veth_set_rx_headroom, 1674 .ndo_bpf = veth_xdp, 1675 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1676 .ndo_get_peer_dev = veth_peer_dev, 1677 }; 1678 1679 static const struct xdp_metadata_ops veth_xdp_metadata_ops = { 1680 .xmo_rx_timestamp = veth_xdp_rx_timestamp, 1681 .xmo_rx_hash = veth_xdp_rx_hash, 1682 .xmo_rx_vlan_tag = veth_xdp_rx_vlan_tag, 1683 }; 1684 1685 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1686 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1687 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1688 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1689 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1690 1691 static void veth_setup(struct net_device *dev) 1692 { 1693 
ether_setup(dev); 1694 1695 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1696 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1697 dev->priv_flags |= IFF_NO_QUEUE; 1698 dev->priv_flags |= IFF_PHONY_HEADROOM; 1699 dev->priv_flags |= IFF_DISABLE_NETPOLL; 1700 dev->lltx = true; 1701 1702 dev->netdev_ops = &veth_netdev_ops; 1703 dev->xdp_metadata_ops = &veth_xdp_metadata_ops; 1704 dev->ethtool_ops = &veth_ethtool_ops; 1705 dev->features |= VETH_FEATURES; 1706 dev->vlan_features = dev->features & 1707 ~(NETIF_F_HW_VLAN_CTAG_TX | 1708 NETIF_F_HW_VLAN_STAG_TX | 1709 NETIF_F_HW_VLAN_CTAG_RX | 1710 NETIF_F_HW_VLAN_STAG_RX); 1711 dev->needs_free_netdev = true; 1712 dev->priv_destructor = veth_dev_free; 1713 dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; 1714 dev->max_mtu = ETH_MAX_MTU; 1715 1716 dev->hw_features = VETH_FEATURES; 1717 dev->hw_enc_features = VETH_FEATURES; 1718 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1719 netif_set_tso_max_size(dev, GSO_MAX_SIZE); 1720 } 1721 1722 /* 1723 * netlink interface 1724 */ 1725 1726 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1727 struct netlink_ext_ack *extack) 1728 { 1729 if (tb[IFLA_ADDRESS]) { 1730 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1731 return -EINVAL; 1732 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1733 return -EADDRNOTAVAIL; 1734 } 1735 if (tb[IFLA_MTU]) { 1736 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1737 return -EINVAL; 1738 } 1739 return 0; 1740 } 1741 1742 static struct rtnl_link_ops veth_link_ops; 1743 1744 static void veth_disable_gro(struct net_device *dev) 1745 { 1746 dev->features &= ~NETIF_F_GRO; 1747 dev->wanted_features &= ~NETIF_F_GRO; 1748 netdev_update_features(dev); 1749 } 1750 1751 static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) 1752 { 1753 int err; 1754 1755 if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { 1756 err = netif_set_real_num_tx_queues(dev, 1); 1757 if (err) 1758 return err; 1759 } 1760 if 
(!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { 1761 err = netif_set_real_num_rx_queues(dev, 1); 1762 if (err) 1763 return err; 1764 } 1765 return 0; 1766 } 1767 1768 static int veth_newlink(struct net_device *dev, 1769 struct rtnl_newlink_params *params, 1770 struct netlink_ext_ack *extack) 1771 { 1772 struct net *peer_net = rtnl_newlink_peer_net(params); 1773 struct nlattr **data = params->data; 1774 struct nlattr **tb = params->tb; 1775 int err; 1776 struct net_device *peer; 1777 struct veth_priv *priv; 1778 char ifname[IFNAMSIZ]; 1779 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1780 unsigned char name_assign_type; 1781 struct ifinfomsg *ifmp; 1782 1783 /* 1784 * create and register peer first 1785 */ 1786 if (data && data[VETH_INFO_PEER]) { 1787 struct nlattr *nla_peer = data[VETH_INFO_PEER]; 1788 1789 ifmp = nla_data(nla_peer); 1790 rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); 1791 tbp = peer_tb; 1792 } else { 1793 ifmp = NULL; 1794 tbp = tb; 1795 } 1796 1797 if (ifmp && tbp[IFLA_IFNAME]) { 1798 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1799 name_assign_type = NET_NAME_USER; 1800 } else { 1801 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1802 name_assign_type = NET_NAME_ENUM; 1803 } 1804 1805 peer = rtnl_create_link(peer_net, ifname, name_assign_type, 1806 &veth_link_ops, tbp, extack); 1807 if (IS_ERR(peer)) 1808 return PTR_ERR(peer); 1809 1810 if (!ifmp || !tbp[IFLA_ADDRESS]) 1811 eth_hw_addr_random(peer); 1812 1813 if (ifmp && (dev->ifindex != 0)) 1814 peer->ifindex = ifmp->ifi_index; 1815 1816 netif_inherit_tso_max(peer, dev); 1817 1818 err = register_netdevice(peer); 1819 if (err < 0) 1820 goto err_register_peer; 1821 1822 /* keep GRO disabled by default to be consistent with the established 1823 * veth behavior 1824 */ 1825 veth_disable_gro(peer); 1826 netif_carrier_off(peer); 1827 1828 err = rtnl_configure_link(peer, ifmp, 0, NULL); 1829 if (err < 0) 1830 goto err_configure_peer; 1831 1832 /* 1833 * register dev last 1834 * 1835 * 
note, that since we've registered new device the dev's name 1836 * should be re-allocated 1837 */ 1838 1839 if (tb[IFLA_ADDRESS] == NULL) 1840 eth_hw_addr_random(dev); 1841 1842 if (tb[IFLA_IFNAME]) 1843 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1844 else 1845 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1846 1847 err = register_netdevice(dev); 1848 if (err < 0) 1849 goto err_register_dev; 1850 1851 netif_carrier_off(dev); 1852 1853 /* 1854 * tie the deviced together 1855 */ 1856 1857 priv = netdev_priv(dev); 1858 rcu_assign_pointer(priv->peer, peer); 1859 err = veth_init_queues(dev, tb); 1860 if (err) 1861 goto err_queues; 1862 1863 priv = netdev_priv(peer); 1864 rcu_assign_pointer(priv->peer, dev); 1865 err = veth_init_queues(peer, tb); 1866 if (err) 1867 goto err_queues; 1868 1869 veth_disable_gro(dev); 1870 /* update XDP supported features */ 1871 veth_set_xdp_features(dev); 1872 veth_set_xdp_features(peer); 1873 1874 return 0; 1875 1876 err_queues: 1877 unregister_netdevice(dev); 1878 err_register_dev: 1879 /* nothing to do */ 1880 err_configure_peer: 1881 unregister_netdevice(peer); 1882 return err; 1883 1884 err_register_peer: 1885 free_netdev(peer); 1886 return err; 1887 } 1888 1889 static void veth_dellink(struct net_device *dev, struct list_head *head) 1890 { 1891 struct veth_priv *priv; 1892 struct net_device *peer; 1893 1894 priv = netdev_priv(dev); 1895 peer = rtnl_dereference(priv->peer); 1896 1897 /* Note : dellink() is called from default_device_exit_batch(), 1898 * before a rcu_synchronize() point. The devices are guaranteed 1899 * not being freed before one RCU grace period. 
1900 */ 1901 RCU_INIT_POINTER(priv->peer, NULL); 1902 unregister_netdevice_queue(dev, head); 1903 1904 if (peer) { 1905 priv = netdev_priv(peer); 1906 RCU_INIT_POINTER(priv->peer, NULL); 1907 unregister_netdevice_queue(peer, head); 1908 } 1909 } 1910 1911 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1912 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1913 }; 1914 1915 static struct net *veth_get_link_net(const struct net_device *dev) 1916 { 1917 struct veth_priv *priv = netdev_priv(dev); 1918 struct net_device *peer = rtnl_dereference(priv->peer); 1919 1920 return peer ? dev_net(peer) : dev_net(dev); 1921 } 1922 1923 static unsigned int veth_get_num_queues(void) 1924 { 1925 /* enforce the same queue limit as rtnl_create_link */ 1926 int queues = num_possible_cpus(); 1927 1928 if (queues > 4096) 1929 queues = 4096; 1930 return queues; 1931 } 1932 1933 static struct rtnl_link_ops veth_link_ops = { 1934 .kind = DRV_NAME, 1935 .priv_size = sizeof(struct veth_priv), 1936 .setup = veth_setup, 1937 .validate = veth_validate, 1938 .newlink = veth_newlink, 1939 .dellink = veth_dellink, 1940 .policy = veth_policy, 1941 .peer_type = VETH_INFO_PEER, 1942 .maxtype = VETH_INFO_MAX, 1943 .get_link_net = veth_get_link_net, 1944 .get_num_tx_queues = veth_get_num_queues, 1945 .get_num_rx_queues = veth_get_num_queues, 1946 }; 1947 1948 /* 1949 * init/fini 1950 */ 1951 1952 static __init int veth_init(void) 1953 { 1954 return rtnl_link_register(&veth_link_ops); 1955 } 1956 1957 static __exit void veth_exit(void) 1958 { 1959 rtnl_link_unregister(&veth_link_ops); 1960 } 1961 1962 module_init(veth_init); 1963 module_exit(veth_exit); 1964 1965 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1966 MODULE_LICENSE("GPL v2"); 1967 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1968