1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/netdev_lock.h> 21 #include <net/xfrm.h> 22 #include <net/xdp.h> 23 #include <linux/veth.h> 24 #include <linux/module.h> 25 #include <linux/bpf.h> 26 #include <linux/filter.h> 27 #include <linux/ptr_ring.h> 28 #include <linux/bpf_trace.h> 29 #include <linux/net_tstamp.h> 30 #include <linux/skbuff_ref.h> 31 #include <net/page_pool/helpers.h> 32 33 #define DRV_NAME "veth" 34 #define DRV_VERSION "1.0" 35 36 #define VETH_XDP_FLAG BIT(0) 37 #define VETH_RING_SIZE 256 38 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 39 40 #define VETH_XDP_TX_BULK_SIZE 16 41 #define VETH_XDP_BATCH 16 42 43 struct veth_stats { 44 u64 rx_drops; 45 /* xdp */ 46 u64 xdp_packets; 47 u64 xdp_bytes; 48 u64 xdp_redirect; 49 u64 xdp_drops; 50 u64 xdp_tx; 51 u64 xdp_tx_err; 52 u64 peer_tq_xdp_xmit; 53 u64 peer_tq_xdp_xmit_err; 54 }; 55 56 struct veth_rq_stats { 57 struct veth_stats vs; 58 struct u64_stats_sync syncp; 59 }; 60 61 struct veth_rq { 62 struct napi_struct xdp_napi; 63 struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ 64 struct net_device *dev; 65 struct bpf_prog __rcu *xdp_prog; 66 struct xdp_mem_info xdp_mem; 67 struct veth_rq_stats stats; 68 bool rx_notify_masked; 69 struct ptr_ring xdp_ring; 70 struct xdp_rxq_info xdp_rxq; 71 struct page_pool *page_pool; 72 }; 73 74 struct veth_priv { 75 struct net_device __rcu *peer; 76 atomic64_t dropped; 77 struct bpf_prog *_xdp_prog; 78 struct veth_rq *rq; 79 unsigned int requested_headroom; 80 }; 81 
82 struct veth_xdp_tx_bq { 83 struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; 84 unsigned int count; 85 }; 86 87 /* 88 * ethtool interface 89 */ 90 91 struct veth_q_stat_desc { 92 char desc[ETH_GSTRING_LEN]; 93 size_t offset; 94 }; 95 96 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 97 98 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 99 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 100 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 101 { "drops", VETH_RQ_STAT(rx_drops) }, 102 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 103 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 104 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 105 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 106 }; 107 108 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 109 110 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 111 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 112 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 113 }; 114 115 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 116 117 static struct { 118 const char string[ETH_GSTRING_LEN]; 119 } ethtool_stats_keys[] = { 120 { "peer_ifindex" }, 121 }; 122 123 struct veth_xdp_buff { 124 struct xdp_buff xdp; 125 struct sk_buff *skb; 126 }; 127 128 static int veth_get_link_ksettings(struct net_device *dev, 129 struct ethtool_link_ksettings *cmd) 130 { 131 cmd->base.speed = SPEED_10000; 132 cmd->base.duplex = DUPLEX_FULL; 133 cmd->base.port = PORT_TP; 134 cmd->base.autoneg = AUTONEG_DISABLE; 135 return 0; 136 } 137 138 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 139 { 140 strscpy(info->driver, DRV_NAME, sizeof(info->driver)); 141 strscpy(info->version, DRV_VERSION, sizeof(info->version)); 142 } 143 144 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 145 { 146 u8 *p = buf; 147 int i, j; 148 149 switch(stringset) { 150 case ETH_SS_STATS: 151 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 152 p += sizeof(ethtool_stats_keys); 
153 for (i = 0; i < dev->real_num_rx_queues; i++) 154 for (j = 0; j < VETH_RQ_STATS_LEN; j++) 155 ethtool_sprintf(&p, "rx_queue_%u_%.18s", 156 i, veth_rq_stats_desc[j].desc); 157 158 for (i = 0; i < dev->real_num_tx_queues; i++) 159 for (j = 0; j < VETH_TQ_STATS_LEN; j++) 160 ethtool_sprintf(&p, "tx_queue_%u_%.18s", 161 i, veth_tq_stats_desc[j].desc); 162 163 page_pool_ethtool_stats_get_strings(p); 164 break; 165 } 166 } 167 168 static int veth_get_sset_count(struct net_device *dev, int sset) 169 { 170 switch (sset) { 171 case ETH_SS_STATS: 172 return ARRAY_SIZE(ethtool_stats_keys) + 173 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 174 VETH_TQ_STATS_LEN * dev->real_num_tx_queues + 175 page_pool_ethtool_stats_get_count(); 176 default: 177 return -EOPNOTSUPP; 178 } 179 } 180 181 static void veth_get_page_pool_stats(struct net_device *dev, u64 *data) 182 { 183 #ifdef CONFIG_PAGE_POOL_STATS 184 struct veth_priv *priv = netdev_priv(dev); 185 struct page_pool_stats pp_stats = {}; 186 int i; 187 188 for (i = 0; i < dev->real_num_rx_queues; i++) { 189 if (!priv->rq[i].page_pool) 190 continue; 191 page_pool_get_stats(priv->rq[i].page_pool, &pp_stats); 192 } 193 page_pool_ethtool_stats_get(data, &pp_stats); 194 #endif /* CONFIG_PAGE_POOL_STATS */ 195 } 196 197 static void veth_get_ethtool_stats(struct net_device *dev, 198 struct ethtool_stats *stats, u64 *data) 199 { 200 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 201 struct net_device *peer = rtnl_dereference(priv->peer); 202 int i, j, idx, pp_idx; 203 204 data[0] = peer ? 
peer->ifindex : 0; 205 idx = 1; 206 for (i = 0; i < dev->real_num_rx_queues; i++) { 207 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 208 const void *stats_base = (void *)&rq_stats->vs; 209 unsigned int start; 210 size_t offset; 211 212 do { 213 start = u64_stats_fetch_begin(&rq_stats->syncp); 214 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 215 offset = veth_rq_stats_desc[j].offset; 216 data[idx + j] = *(u64 *)(stats_base + offset); 217 } 218 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 219 idx += VETH_RQ_STATS_LEN; 220 } 221 pp_idx = idx; 222 223 if (!peer) 224 goto page_pool_stats; 225 226 rcv_priv = netdev_priv(peer); 227 for (i = 0; i < peer->real_num_rx_queues; i++) { 228 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 229 const void *base = (void *)&rq_stats->vs; 230 unsigned int start, tx_idx = idx; 231 u64 buf[VETH_TQ_STATS_LEN]; 232 size_t offset; 233 234 do { 235 start = u64_stats_fetch_begin(&rq_stats->syncp); 236 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 237 offset = veth_tq_stats_desc[j].offset; 238 buf[j] = *(u64 *)(base + offset); 239 } 240 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 241 242 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 243 for (j = 0; j < VETH_TQ_STATS_LEN; j++) 244 data[tx_idx + j] += buf[j]; 245 } 246 pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN; 247 248 page_pool_stats: 249 veth_get_page_pool_stats(dev, &data[pp_idx]); 250 } 251 252 static void veth_get_channels(struct net_device *dev, 253 struct ethtool_channels *channels) 254 { 255 channels->tx_count = dev->real_num_tx_queues; 256 channels->rx_count = dev->real_num_rx_queues; 257 channels->max_tx = dev->num_tx_queues; 258 channels->max_rx = dev->num_rx_queues; 259 } 260 261 static int veth_set_channels(struct net_device *dev, 262 struct ethtool_channels *ch); 263 264 static const struct ethtool_ops veth_ethtool_ops = { 265 .get_drvinfo = veth_get_drvinfo, 266 .get_link = ethtool_op_get_link, 267 
.get_strings = veth_get_strings, 268 .get_sset_count = veth_get_sset_count, 269 .get_ethtool_stats = veth_get_ethtool_stats, 270 .get_link_ksettings = veth_get_link_ksettings, 271 .get_ts_info = ethtool_op_get_ts_info, 272 .get_channels = veth_get_channels, 273 .set_channels = veth_set_channels, 274 }; 275 276 /* general routines */ 277 278 static bool veth_is_xdp_frame(void *ptr) 279 { 280 return (unsigned long)ptr & VETH_XDP_FLAG; 281 } 282 283 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 284 { 285 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 286 } 287 288 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 289 { 290 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 291 } 292 293 static void veth_ptr_free(void *ptr) 294 { 295 if (veth_is_xdp_frame(ptr)) 296 xdp_return_frame(veth_ptr_to_xdp(ptr)); 297 else 298 kfree_skb(ptr); 299 } 300 301 static void __veth_xdp_flush(struct veth_rq *rq) 302 { 303 /* Write ptr_ring before reading rx_notify_masked */ 304 smp_mb(); 305 if (!READ_ONCE(rq->rx_notify_masked) && 306 napi_schedule_prep(&rq->xdp_napi)) { 307 WRITE_ONCE(rq->rx_notify_masked, true); 308 __napi_schedule(&rq->xdp_napi); 309 } 310 } 311 312 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 313 { 314 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) 315 return NETDEV_TX_BUSY; /* signal qdisc layer */ 316 317 return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */ 318 } 319 320 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 321 struct veth_rq *rq, bool xdp) 322 { 323 return __dev_forward_skb(dev, skb) ?: xdp ? 324 veth_xdp_rx(rq, skb) : 325 __netif_rx(skb); 326 } 327 328 /* return true if the specified skb has chances of GRO aggregation 329 * Don't strive for accuracy, but try to avoid GRO overhead in the most 330 * common scenarios. 331 * When XDP is enabled, all traffic is considered eligible, as the xmit 332 * device has TSO off. 
333 * When TSO is enabled on the xmit device, we are likely interested only 334 * in UDP aggregation, explicitly check for that if the skb is suspected 335 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - 336 * to belong to locally generated UDP traffic. 337 */ 338 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, 339 const struct net_device *rcv, 340 const struct sk_buff *skb) 341 { 342 return !(dev->features & NETIF_F_ALL_TSO) || 343 (skb->destructor == sock_wfree && 344 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); 345 } 346 347 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 348 { 349 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 350 struct veth_rq *rq = NULL; 351 struct netdev_queue *txq; 352 struct net_device *rcv; 353 int length = skb->len; 354 bool use_napi = false; 355 int ret, rxq; 356 357 rcu_read_lock(); 358 rcv = rcu_dereference(priv->peer); 359 if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { 360 kfree_skb(skb); 361 goto drop; 362 } 363 364 rcv_priv = netdev_priv(rcv); 365 rxq = skb_get_queue_mapping(skb); 366 if (rxq < rcv->real_num_rx_queues) { 367 rq = &rcv_priv->rq[rxq]; 368 369 /* The napi pointer is available when an XDP program is 370 * attached or when GRO is enabled 371 * Don't bother with napi/GRO if the skb can't be aggregated 372 */ 373 use_napi = rcu_access_pointer(rq->napi) && 374 veth_skb_is_eligible_for_gro(dev, rcv, skb); 375 } 376 377 skb_tx_timestamp(skb); 378 379 ret = veth_forward_skb(rcv, skb, rq, use_napi); 380 switch (ret) { 381 case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */ 382 if (!use_napi) 383 dev_sw_netstats_tx_add(dev, 1, length); 384 else 385 __veth_xdp_flush(rq); 386 break; 387 case NETDEV_TX_BUSY: 388 /* If a qdisc is attached to our virtual device, returning 389 * NETDEV_TX_BUSY is allowed. 
390 */ 391 txq = netdev_get_tx_queue(dev, rxq); 392 393 if (qdisc_txq_has_no_queue(txq)) { 394 dev_kfree_skb_any(skb); 395 goto drop; 396 } 397 /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */ 398 __skb_push(skb, ETH_HLEN); 399 netif_tx_stop_queue(txq); 400 /* Makes sure NAPI peer consumer runs. Consumer is responsible 401 * for starting txq again, until then ndo_start_xmit (this 402 * function) will not be invoked by the netstack again. 403 */ 404 __veth_xdp_flush(rq); 405 break; 406 case NET_RX_DROP: /* same as NET_XMIT_DROP */ 407 drop: 408 atomic64_inc(&priv->dropped); 409 ret = NET_XMIT_DROP; 410 break; 411 default: 412 net_crit_ratelimited("%s(%s): Invalid return code(%d)", 413 __func__, dev->name, ret); 414 } 415 rcu_read_unlock(); 416 417 return ret; 418 } 419 420 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 421 { 422 struct veth_priv *priv = netdev_priv(dev); 423 int i; 424 425 result->peer_tq_xdp_xmit_err = 0; 426 result->xdp_packets = 0; 427 result->xdp_tx_err = 0; 428 result->xdp_bytes = 0; 429 result->rx_drops = 0; 430 for (i = 0; i < dev->num_rx_queues; i++) { 431 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 432 struct veth_rq_stats *stats = &priv->rq[i].stats; 433 unsigned int start; 434 435 do { 436 start = u64_stats_fetch_begin(&stats->syncp); 437 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 438 xdp_tx_err = stats->vs.xdp_tx_err; 439 packets = stats->vs.xdp_packets; 440 bytes = stats->vs.xdp_bytes; 441 drops = stats->vs.rx_drops; 442 } while (u64_stats_fetch_retry(&stats->syncp, start)); 443 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 444 result->xdp_tx_err += xdp_tx_err; 445 result->xdp_packets += packets; 446 result->xdp_bytes += bytes; 447 result->rx_drops += drops; 448 } 449 } 450 451 static void veth_get_stats64(struct net_device *dev, 452 struct rtnl_link_stats64 *tot) 453 { 454 struct veth_priv *priv = netdev_priv(dev); 455 struct net_device *peer; 456 struct 
veth_stats rx; 457 458 tot->tx_dropped = atomic64_read(&priv->dropped); 459 dev_fetch_sw_netstats(tot, dev->tstats); 460 461 veth_stats_rx(&rx, dev); 462 tot->tx_dropped += rx.xdp_tx_err; 463 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 464 tot->rx_bytes += rx.xdp_bytes; 465 tot->rx_packets += rx.xdp_packets; 466 467 rcu_read_lock(); 468 peer = rcu_dereference(priv->peer); 469 if (peer) { 470 struct rtnl_link_stats64 tot_peer = {}; 471 472 dev_fetch_sw_netstats(&tot_peer, peer->tstats); 473 tot->rx_bytes += tot_peer.tx_bytes; 474 tot->rx_packets += tot_peer.tx_packets; 475 476 veth_stats_rx(&rx, peer); 477 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 478 tot->rx_dropped += rx.xdp_tx_err; 479 tot->tx_bytes += rx.xdp_bytes; 480 tot->tx_packets += rx.xdp_packets; 481 } 482 rcu_read_unlock(); 483 } 484 485 /* fake multicast ability */ 486 static void veth_set_multicast_list(struct net_device *dev) 487 { 488 } 489 490 static int veth_select_rxq(struct net_device *dev) 491 { 492 return smp_processor_id() % dev->real_num_rx_queues; 493 } 494 495 static struct net_device *veth_peer_dev(struct net_device *dev) 496 { 497 struct veth_priv *priv = netdev_priv(dev); 498 499 /* Callers must be under RCU read side. */ 500 return rcu_dereference(priv->peer); 501 } 502 503 static int veth_xdp_xmit(struct net_device *dev, int n, 504 struct xdp_frame **frames, 505 u32 flags, bool ndo_xmit) 506 { 507 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 508 int i, ret = -ENXIO, nxmit = 0; 509 struct net_device *rcv; 510 unsigned int max_len; 511 struct veth_rq *rq; 512 513 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 514 return -EINVAL; 515 516 rcu_read_lock(); 517 rcv = rcu_dereference(priv->peer); 518 if (unlikely(!rcv)) 519 goto out; 520 521 rcv_priv = netdev_priv(rcv); 522 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 523 /* The napi pointer is set if NAPI is enabled, which ensures that 524 * xdp_ring is initialized on receive side and the peer device is up. 
525 */ 526 if (!rcu_access_pointer(rq->napi)) 527 goto out; 528 529 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 530 531 spin_lock(&rq->xdp_ring.producer_lock); 532 for (i = 0; i < n; i++) { 533 struct xdp_frame *frame = frames[i]; 534 void *ptr = veth_xdp_to_ptr(frame); 535 536 if (unlikely(xdp_get_frame_len(frame) > max_len || 537 __ptr_ring_produce(&rq->xdp_ring, ptr))) 538 break; 539 nxmit++; 540 } 541 spin_unlock(&rq->xdp_ring.producer_lock); 542 543 if (flags & XDP_XMIT_FLUSH) 544 __veth_xdp_flush(rq); 545 546 ret = nxmit; 547 if (ndo_xmit) { 548 u64_stats_update_begin(&rq->stats.syncp); 549 rq->stats.vs.peer_tq_xdp_xmit += nxmit; 550 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; 551 u64_stats_update_end(&rq->stats.syncp); 552 } 553 554 out: 555 rcu_read_unlock(); 556 557 return ret; 558 } 559 560 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 561 struct xdp_frame **frames, u32 flags) 562 { 563 int err; 564 565 err = veth_xdp_xmit(dev, n, frames, flags, true); 566 if (err < 0) { 567 struct veth_priv *priv = netdev_priv(dev); 568 569 atomic64_add(n, &priv->dropped); 570 } 571 572 return err; 573 } 574 575 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 576 { 577 int sent, i, err = 0, drops; 578 579 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 580 if (sent < 0) { 581 err = sent; 582 sent = 0; 583 } 584 585 for (i = sent; unlikely(i < bq->count); i++) 586 xdp_return_frame(bq->q[i]); 587 588 drops = bq->count - sent; 589 trace_xdp_bulk_tx(rq->dev, sent, drops, err); 590 591 u64_stats_update_begin(&rq->stats.syncp); 592 rq->stats.vs.xdp_tx += sent; 593 rq->stats.vs.xdp_tx_err += drops; 594 u64_stats_update_end(&rq->stats.syncp); 595 596 bq->count = 0; 597 } 598 599 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 600 { 601 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 602 struct net_device *rcv; 603 struct veth_rq *rcv_rq; 604 605 rcu_read_lock(); 606 
veth_xdp_flush_bq(rq, bq); 607 rcv = rcu_dereference(priv->peer); 608 if (unlikely(!rcv)) 609 goto out; 610 611 rcv_priv = netdev_priv(rcv); 612 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 613 /* xdp_ring is initialized on receive side? */ 614 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 615 goto out; 616 617 __veth_xdp_flush(rcv_rq); 618 out: 619 rcu_read_unlock(); 620 } 621 622 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 623 struct veth_xdp_tx_bq *bq) 624 { 625 struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); 626 627 if (unlikely(!frame)) 628 return -EOVERFLOW; 629 630 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 631 veth_xdp_flush_bq(rq, bq); 632 633 bq->q[bq->count++] = frame; 634 635 return 0; 636 } 637 638 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, 639 struct xdp_frame *frame, 640 struct veth_xdp_tx_bq *bq, 641 struct veth_stats *stats) 642 { 643 struct xdp_frame orig_frame; 644 struct bpf_prog *xdp_prog; 645 646 rcu_read_lock(); 647 xdp_prog = rcu_dereference(rq->xdp_prog); 648 if (likely(xdp_prog)) { 649 struct veth_xdp_buff vxbuf; 650 struct xdp_buff *xdp = &vxbuf.xdp; 651 u32 act; 652 653 xdp_convert_frame_to_buff(frame, xdp); 654 xdp->rxq = &rq->xdp_rxq; 655 vxbuf.skb = NULL; 656 657 act = bpf_prog_run_xdp(xdp_prog, xdp); 658 659 switch (act) { 660 case XDP_PASS: 661 if (xdp_update_frame_from_buff(xdp, frame)) 662 goto err_xdp; 663 break; 664 case XDP_TX: 665 orig_frame = *frame; 666 xdp->rxq->mem.type = frame->mem_type; 667 if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { 668 trace_xdp_exception(rq->dev, xdp_prog, act); 669 frame = &orig_frame; 670 stats->rx_drops++; 671 goto err_xdp; 672 } 673 stats->xdp_tx++; 674 rcu_read_unlock(); 675 goto xdp_xmit; 676 case XDP_REDIRECT: 677 orig_frame = *frame; 678 xdp->rxq->mem.type = frame->mem_type; 679 if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { 680 frame = &orig_frame; 681 stats->rx_drops++; 682 goto err_xdp; 683 } 684 stats->xdp_redirect++; 685 
rcu_read_unlock(); 686 goto xdp_xmit; 687 default: 688 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 689 fallthrough; 690 case XDP_ABORTED: 691 trace_xdp_exception(rq->dev, xdp_prog, act); 692 fallthrough; 693 case XDP_DROP: 694 stats->xdp_drops++; 695 goto err_xdp; 696 } 697 } 698 rcu_read_unlock(); 699 700 return frame; 701 err_xdp: 702 rcu_read_unlock(); 703 xdp_return_frame(frame); 704 xdp_xmit: 705 return NULL; 706 } 707 708 /* frames array contains VETH_XDP_BATCH at most */ 709 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, 710 int n_xdpf, struct veth_xdp_tx_bq *bq, 711 struct veth_stats *stats) 712 { 713 void *skbs[VETH_XDP_BATCH]; 714 int i; 715 716 if (unlikely(!napi_skb_cache_get_bulk(skbs, n_xdpf))) { 717 for (i = 0; i < n_xdpf; i++) 718 xdp_return_frame(frames[i]); 719 stats->rx_drops += n_xdpf; 720 721 return; 722 } 723 724 for (i = 0; i < n_xdpf; i++) { 725 struct sk_buff *skb = skbs[i]; 726 727 skb = __xdp_build_skb_from_frame(frames[i], skb, 728 rq->dev); 729 if (!skb) { 730 xdp_return_frame(frames[i]); 731 stats->rx_drops++; 732 continue; 733 } 734 napi_gro_receive(&rq->xdp_napi, skb); 735 } 736 } 737 738 static void veth_xdp_get(struct xdp_buff *xdp) 739 { 740 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); 741 int i; 742 743 get_page(virt_to_page(xdp->data)); 744 if (likely(!xdp_buff_has_frags(xdp))) 745 return; 746 747 for (i = 0; i < sinfo->nr_frags; i++) 748 __skb_frag_ref(&sinfo->frags[i]); 749 } 750 751 static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, 752 struct xdp_buff *xdp, 753 struct sk_buff **pskb) 754 { 755 struct sk_buff *skb = *pskb; 756 u32 frame_sz; 757 758 if (skb_shared(skb) || skb_head_is_locked(skb) || 759 skb_shinfo(skb)->nr_frags || 760 skb_headroom(skb) < XDP_PACKET_HEADROOM) { 761 if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM)) 762 goto drop; 763 764 skb = *pskb; 765 } 766 767 /* SKB "head" area always have tailroom for skb_shared_info */ 768 
frame_sz = skb_end_pointer(skb) - skb->head; 769 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 770 xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); 771 xdp_prepare_buff(xdp, skb->head, skb_headroom(skb), 772 skb_headlen(skb), true); 773 774 if (skb_is_nonlinear(skb)) { 775 skb_shinfo(skb)->xdp_frags_size = skb->data_len; 776 xdp_buff_set_frags_flag(xdp); 777 } else { 778 xdp_buff_clear_frags_flag(xdp); 779 } 780 *pskb = skb; 781 782 return 0; 783 drop: 784 consume_skb(skb); 785 *pskb = NULL; 786 787 return -ENOMEM; 788 } 789 790 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 791 struct sk_buff *skb, 792 struct veth_xdp_tx_bq *bq, 793 struct veth_stats *stats) 794 { 795 void *orig_data, *orig_data_end; 796 struct bpf_prog *xdp_prog; 797 struct veth_xdp_buff vxbuf; 798 struct xdp_buff *xdp = &vxbuf.xdp; 799 u32 act, metalen; 800 int off; 801 802 skb_prepare_for_gro(skb); 803 804 rcu_read_lock(); 805 xdp_prog = rcu_dereference(rq->xdp_prog); 806 if (unlikely(!xdp_prog)) { 807 rcu_read_unlock(); 808 goto out; 809 } 810 811 __skb_push(skb, skb->data - skb_mac_header(skb)); 812 if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb)) 813 goto drop; 814 vxbuf.skb = skb; 815 816 orig_data = xdp->data; 817 orig_data_end = xdp->data_end; 818 819 act = bpf_prog_run_xdp(xdp_prog, xdp); 820 821 switch (act) { 822 case XDP_PASS: 823 break; 824 case XDP_TX: 825 veth_xdp_get(xdp); 826 consume_skb(skb); 827 xdp->rxq->mem = rq->xdp_mem; 828 if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { 829 trace_xdp_exception(rq->dev, xdp_prog, act); 830 stats->rx_drops++; 831 goto err_xdp; 832 } 833 stats->xdp_tx++; 834 rcu_read_unlock(); 835 goto xdp_xmit; 836 case XDP_REDIRECT: 837 veth_xdp_get(xdp); 838 consume_skb(skb); 839 xdp->rxq->mem = rq->xdp_mem; 840 if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { 841 stats->rx_drops++; 842 goto err_xdp; 843 } 844 stats->xdp_redirect++; 845 rcu_read_unlock(); 846 goto xdp_xmit; 847 default: 848 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, 
act); 849 fallthrough; 850 case XDP_ABORTED: 851 trace_xdp_exception(rq->dev, xdp_prog, act); 852 fallthrough; 853 case XDP_DROP: 854 stats->xdp_drops++; 855 goto xdp_drop; 856 } 857 rcu_read_unlock(); 858 859 /* check if bpf_xdp_adjust_head was used */ 860 off = orig_data - xdp->data; 861 if (off > 0) 862 __skb_push(skb, off); 863 else if (off < 0) 864 __skb_pull(skb, -off); 865 866 skb_reset_mac_header(skb); 867 868 /* check if bpf_xdp_adjust_tail was used */ 869 off = xdp->data_end - orig_data_end; 870 if (off != 0) 871 __skb_put(skb, off); /* positive on grow, negative on shrink */ 872 873 /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers 874 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. 875 */ 876 if (xdp_buff_has_frags(xdp)) 877 skb->data_len = skb_shinfo(skb)->xdp_frags_size; 878 else 879 skb->data_len = 0; 880 881 skb->protocol = eth_type_trans(skb, rq->dev); 882 883 metalen = xdp->data - xdp->data_meta; 884 if (metalen) 885 skb_metadata_set(skb, metalen); 886 out: 887 return skb; 888 drop: 889 stats->rx_drops++; 890 xdp_drop: 891 rcu_read_unlock(); 892 kfree_skb(skb); 893 return NULL; 894 err_xdp: 895 rcu_read_unlock(); 896 xdp_return_buff(xdp); 897 xdp_xmit: 898 return NULL; 899 } 900 901 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 902 struct veth_xdp_tx_bq *bq, 903 struct veth_stats *stats) 904 { 905 int i, done = 0, n_xdpf = 0; 906 void *xdpf[VETH_XDP_BATCH]; 907 908 for (i = 0; i < budget; i++) { 909 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 910 911 if (!ptr) 912 break; 913 914 if (veth_is_xdp_frame(ptr)) { 915 /* ndo_xdp_xmit */ 916 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 917 918 stats->xdp_bytes += xdp_get_frame_len(frame); 919 frame = veth_xdp_rcv_one(rq, frame, bq, stats); 920 if (frame) { 921 /* XDP_PASS */ 922 xdpf[n_xdpf++] = frame; 923 if (n_xdpf == VETH_XDP_BATCH) { 924 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, 925 bq, stats); 926 n_xdpf = 0; 927 } 928 } 929 } else { 930 /* ndo_start_xmit 
*/ 931 struct sk_buff *skb = ptr; 932 933 stats->xdp_bytes += skb->len; 934 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 935 if (skb) { 936 if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) 937 netif_receive_skb(skb); 938 else 939 napi_gro_receive(&rq->xdp_napi, skb); 940 } 941 } 942 done++; 943 } 944 945 if (n_xdpf) 946 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); 947 948 u64_stats_update_begin(&rq->stats.syncp); 949 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 950 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 951 rq->stats.vs.xdp_drops += stats->xdp_drops; 952 rq->stats.vs.rx_drops += stats->rx_drops; 953 rq->stats.vs.xdp_packets += done; 954 u64_stats_update_end(&rq->stats.syncp); 955 956 return done; 957 } 958 959 static int veth_poll(struct napi_struct *napi, int budget) 960 { 961 struct veth_rq *rq = 962 container_of(napi, struct veth_rq, xdp_napi); 963 struct veth_priv *priv = netdev_priv(rq->dev); 964 int queue_idx = rq->xdp_rxq.queue_index; 965 struct netdev_queue *peer_txq; 966 struct veth_stats stats = {}; 967 struct net_device *peer_dev; 968 struct veth_xdp_tx_bq bq; 969 int done; 970 971 bq.count = 0; 972 973 /* NAPI functions as RCU section */ 974 peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); 975 peer_txq = (peer_dev && queue_idx < peer_dev->real_num_tx_queues) ? 
976 netdev_get_tx_queue(peer_dev, queue_idx) : NULL; 977 978 xdp_set_return_frame_no_direct(); 979 done = veth_xdp_rcv(rq, budget, &bq, &stats); 980 981 if (stats.xdp_redirect > 0) 982 xdp_do_flush(); 983 if (stats.xdp_tx > 0) 984 veth_xdp_flush(rq, &bq); 985 xdp_clear_return_frame_no_direct(); 986 987 if (done < budget && napi_complete_done(napi, done)) { 988 /* Write rx_notify_masked before reading ptr_ring */ 989 smp_store_mb(rq->rx_notify_masked, false); 990 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 991 if (napi_schedule_prep(&rq->xdp_napi)) { 992 WRITE_ONCE(rq->rx_notify_masked, true); 993 __napi_schedule(&rq->xdp_napi); 994 } 995 } 996 } 997 998 /* Release backpressure per NAPI poll */ 999 smp_rmb(); /* Paired with netif_tx_stop_queue set_bit */ 1000 if (peer_txq && netif_tx_queue_stopped(peer_txq)) { 1001 txq_trans_cond_update(peer_txq); 1002 netif_tx_wake_queue(peer_txq); 1003 } 1004 1005 return done; 1006 } 1007 1008 static int veth_create_page_pool(struct veth_rq *rq) 1009 { 1010 struct page_pool_params pp_params = { 1011 .order = 0, 1012 .pool_size = VETH_RING_SIZE, 1013 .nid = NUMA_NO_NODE, 1014 .dev = &rq->dev->dev, 1015 }; 1016 1017 rq->page_pool = page_pool_create(&pp_params); 1018 if (IS_ERR(rq->page_pool)) { 1019 int err = PTR_ERR(rq->page_pool); 1020 1021 rq->page_pool = NULL; 1022 return err; 1023 } 1024 1025 return 0; 1026 } 1027 1028 static int __veth_napi_enable_range(struct net_device *dev, int start, int end) 1029 { 1030 struct veth_priv *priv = netdev_priv(dev); 1031 int err, i; 1032 1033 for (i = start; i < end; i++) { 1034 err = veth_create_page_pool(&priv->rq[i]); 1035 if (err) 1036 goto err_page_pool; 1037 } 1038 1039 for (i = start; i < end; i++) { 1040 struct veth_rq *rq = &priv->rq[i]; 1041 1042 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 1043 if (err) 1044 goto err_xdp_ring; 1045 } 1046 1047 for (i = start; i < end; i++) { 1048 struct veth_rq *rq = &priv->rq[i]; 1049 1050 napi_enable(&rq->xdp_napi); 1051 
rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1052 } 1053 1054 return 0; 1055 1056 err_xdp_ring: 1057 for (i--; i >= start; i--) 1058 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 1059 i = end; 1060 err_page_pool: 1061 for (i--; i >= start; i--) { 1062 page_pool_destroy(priv->rq[i].page_pool); 1063 priv->rq[i].page_pool = NULL; 1064 } 1065 1066 return err; 1067 } 1068 1069 static int __veth_napi_enable(struct net_device *dev) 1070 { 1071 return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1072 } 1073 1074 static void veth_napi_del_range(struct net_device *dev, int start, int end) 1075 { 1076 struct veth_priv *priv = netdev_priv(dev); 1077 int i; 1078 1079 for (i = start; i < end; i++) { 1080 struct veth_rq *rq = &priv->rq[i]; 1081 1082 rcu_assign_pointer(priv->rq[i].napi, NULL); 1083 napi_disable(&rq->xdp_napi); 1084 __netif_napi_del(&rq->xdp_napi); 1085 } 1086 synchronize_net(); 1087 1088 for (i = start; i < end; i++) { 1089 struct veth_rq *rq = &priv->rq[i]; 1090 1091 rq->rx_notify_masked = false; 1092 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 1093 } 1094 1095 for (i = start; i < end; i++) { 1096 page_pool_destroy(priv->rq[i].page_pool); 1097 priv->rq[i].page_pool = NULL; 1098 } 1099 } 1100 1101 static void veth_napi_del(struct net_device *dev) 1102 { 1103 veth_napi_del_range(dev, 0, dev->real_num_rx_queues); 1104 } 1105 1106 static bool veth_gro_requested(const struct net_device *dev) 1107 { 1108 return !!(dev->wanted_features & NETIF_F_GRO); 1109 } 1110 1111 static int veth_enable_xdp_range(struct net_device *dev, int start, int end, 1112 bool napi_already_on) 1113 { 1114 struct veth_priv *priv = netdev_priv(dev); 1115 int err, i; 1116 1117 for (i = start; i < end; i++) { 1118 struct veth_rq *rq = &priv->rq[i]; 1119 1120 if (!napi_already_on) 1121 netif_napi_add(dev, &rq->xdp_napi, veth_poll); 1122 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 1123 if (err < 0) 1124 goto err_rxq_reg; 1125 1126 
err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 1127 MEM_TYPE_PAGE_SHARED, 1128 NULL); 1129 if (err < 0) 1130 goto err_reg_mem; 1131 1132 /* Save original mem info as it can be overwritten */ 1133 rq->xdp_mem = rq->xdp_rxq.mem; 1134 } 1135 return 0; 1136 1137 err_reg_mem: 1138 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 1139 err_rxq_reg: 1140 if (!napi_already_on) 1141 netif_napi_del(&priv->rq[i].xdp_napi); 1142 for (i--; i >= start; i--) { 1143 struct veth_rq *rq = &priv->rq[i]; 1144 1145 xdp_rxq_info_unreg(&rq->xdp_rxq); 1146 if (!napi_already_on) 1147 netif_napi_del(&rq->xdp_napi); 1148 } 1149 1150 return err; 1151 } 1152 1153 static void veth_disable_xdp_range(struct net_device *dev, int start, int end, 1154 bool delete_napi) 1155 { 1156 struct veth_priv *priv = netdev_priv(dev); 1157 int i; 1158 1159 for (i = start; i < end; i++) { 1160 struct veth_rq *rq = &priv->rq[i]; 1161 1162 rq->xdp_rxq.mem = rq->xdp_mem; 1163 xdp_rxq_info_unreg(&rq->xdp_rxq); 1164 1165 if (delete_napi) 1166 netif_napi_del(&rq->xdp_napi); 1167 } 1168 } 1169 1170 static int veth_enable_xdp(struct net_device *dev) 1171 { 1172 bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); 1173 struct veth_priv *priv = netdev_priv(dev); 1174 int err, i; 1175 1176 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 1177 err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); 1178 if (err) 1179 return err; 1180 1181 if (!napi_already_on) { 1182 err = __veth_napi_enable(dev); 1183 if (err) { 1184 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); 1185 return err; 1186 } 1187 } 1188 } 1189 1190 for (i = 0; i < dev->real_num_rx_queues; i++) { 1191 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 1192 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1193 } 1194 1195 return 0; 1196 } 1197 1198 static void veth_disable_xdp(struct net_device *dev) 1199 { 1200 struct veth_priv *priv = netdev_priv(dev); 1201 int i; 1202 1203 for (i = 0; i < 
dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);

	/* napi is kept only when the device is running with GRO requested */
	if (!netif_running(dev) || !veth_gro_requested(dev))
		veth_napi_del(dev);

	/* only the rxq info is released here: delete_napi is false */
	veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
}

/* Add and enable napi on the queues in [start, end).
 *
 * A napi instance using veth_poll is added to every queue of the range,
 * then __veth_napi_enable_range() is called. If that fails, every napi
 * instance added here is deleted again.
 *
 * Returns 0 on success or the error from __veth_napi_enable_range().
 */
static int veth_napi_enable_range(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_add(dev, &rq->xdp_napi, veth_poll);
	}

	err = __veth_napi_enable_range(dev, start, end);
	if (err) {
		/* roll back the netif_napi_add() calls done above */
		for (i = start; i < end; i++) {
			struct veth_rq *rq = &priv->rq[i];

			netif_napi_del(&rq->xdp_napi);
		}
		return err;
	}
	return err;
}

/* Add and enable napi on all the real rx queues of @dev. */
static int veth_napi_enable(struct net_device *dev)
{
	return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
}

/* Release the per-queue resources of the queues in [start, end).
 *
 * An empty (or inverted) range is a no-op. With an XDP program attached,
 * napi is deleted and the rxq info unregistered for the range; without one,
 * napi is deleted only when GRO is requested.
 */
static void veth_disable_range_safe(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);

	if (start >= end)
		return;

	if (priv->_xdp_prog) {
		veth_napi_del_range(dev, start, end);
		veth_disable_xdp_range(dev, start, end, false);
	} else if (veth_gro_requested(dev)) {
		veth_napi_del_range(dev, start, end);
	}
}

/* Set up the per-queue resources of the queues in [start, end).
 *
 * An empty (or inverted) range is a no-op returning 0. With an XDP program
 * attached, the rxq info is registered and napi enabled for the range;
 * without one, napi is added and enabled only when GRO is requested.
 *
 * Returns 0 on success or the error reported by the failing helper.
 */
static int veth_enable_range_safe(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err;

	if (start >= end)
		return 0;

	if (priv->_xdp_prog) {
		/* these channels are freshly initialized, napi is not on there even
		 * when GRO is requested
		 */
		err = veth_enable_xdp_range(dev, start, end, false);
		if (err)
			return err;

		err = __veth_napi_enable_range(dev, start, end);
		if (err) {
			/* on error always delete the newly added napis */
			veth_disable_xdp_range(dev, start, end, true);
			return err;
}
	} else if (veth_gro_requested(dev)) {
		return veth_napi_enable_range(dev, start, end);
	}
	return 0;
}

/* Refresh the XDP feature flags of @dev.
 *
 * When a peer exists and its real tx queue count does not exceed the real rx
 * queue count of @dev, the BASIC, REDIRECT and RX_SG actions are set, plus
 * NDO_XMIT and NDO_XMIT_SG when the peer has an XDP program attached or
 * veth_gro_requested() holds for it. In every other case the XDP feature
 * flags of @dev are cleared.
 *
 * The peer is read with rtnl_dereference().
 */
static void veth_set_xdp_features(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	peer = rtnl_dereference(priv->peer);
	if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) {
		struct veth_priv *priv_peer = netdev_priv(peer);
		xdp_features_t val = NETDEV_XDP_ACT_BASIC |
				     NETDEV_XDP_ACT_REDIRECT |
				     NETDEV_XDP_ACT_RX_SG;

		if (priv_peer->_xdp_prog || veth_gro_requested(peer))
			val |= NETDEV_XDP_ACT_NDO_XMIT |
			       NETDEV_XDP_ACT_NDO_XMIT_SG;
		xdp_set_features_flag(dev, val);
	} else {
		xdp_clear_features_flag(dev);
	}
}

/* Change the number of rx/tx queues in use on @dev to the counts in @ch.
 *
 * A zero rx or tx count is rejected with -EINVAL, and so is any count that
 * would leave a side with an XDP program attached (@dev or its peer) with
 * fewer rx queues than the opposite side has tx queues.
 *
 * If the device is running, carrier is turned off on @dev and on the peer
 * while queue resources are set up for a growing rx range, and the range
 * that is no longer needed is released before carrier is turned on again.
 * The XDP feature flags of @dev and of the peer are refreshed before
 * returning, on success and on failure alike.
 *
 * Returns 0 on success or the error reported by the failing helper.
 */
static int veth_set_channels(struct net_device *dev,
			     struct ethtool_channels *ch)
{
	struct veth_priv *priv = netdev_priv(dev);
	unsigned int old_rx_count, new_rx_count;
	struct veth_priv *peer_priv;
	struct net_device *peer;
	int err;

	/* sanity check. Upper bounds are already enforced by the caller */
	if (!ch->rx_count || !ch->tx_count)
		return -EINVAL;

	/* avoid breaking XDP, if that is enabled */
	peer = rtnl_dereference(priv->peer);
	peer_priv = peer ?
netdev_priv(peer) : NULL;
	/* XDP on this side: rx queues must cover the peer's tx queues */
	if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues)
		return -EINVAL;

	/* XDP on the peer: its rx queues must cover our tx queues */
	if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues)
		return -EINVAL;

	old_rx_count = dev->real_num_rx_queues;
	new_rx_count = ch->rx_count;
	if (netif_running(dev)) {
		/* turn device off */
		netif_carrier_off(dev);
		if (peer)
			netif_carrier_off(peer);

		/* try to allocate new resources, as needed */
		err = veth_enable_range_safe(dev, old_rx_count, new_rx_count);
		if (err)
			goto out;
	}

	err = netif_set_real_num_rx_queues(dev, ch->rx_count);
	if (err)
		goto revert;

	err = netif_set_real_num_tx_queues(dev, ch->tx_count);
	if (err) {
		int err2 = netif_set_real_num_rx_queues(dev, old_rx_count);

		/* this error condition could happen only if rx and tx change
		 * in opposite directions (e.g. tx nr raises, rx nr decreases)
		 * and we can't do anything to fully restore the original
		 * status
		 */
		if (err2)
			pr_warn("Can't restore rx queues config %d -> %d %d",
				new_rx_count, old_rx_count, err2);
		else
			goto revert;
	}

out:
	if (netif_running(dev)) {
		/* note that we need to swap the arguments WRT the enable part
		 * to identify the range we have to disable
		 */
		veth_disable_range_safe(dev, new_rx_count, old_rx_count);
		netif_carrier_on(dev);
		if (peer)
			netif_carrier_on(peer);
	}

	/* update XDP supported features */
	veth_set_xdp_features(dev);
	if (peer)
		veth_set_xdp_features(peer);

	return err;

revert:
	/* swap the counts so that the "out" path releases the range that was
	 * set up for the new configuration
	 */
	new_rx_count = old_rx_count;
	old_rx_count = ch->rx_count;
	goto out;
}

/* Bring @dev up.
 *
 * Fails with -ENOTCONN when @dev has no peer. XDP is enabled when a program
 * is attached; otherwise napi is enabled when GRO is requested. Carrier is
 * turned on for both ends only if the peer has IFF_UP set, and the XDP
 * feature flags of @dev are refreshed.
 *
 * Returns 0 on success or the error from the enable helper.
 */
static int veth_open(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int err;
if (!peer)
		return -ENOTCONN;

	if (priv->_xdp_prog) {
		err = veth_enable_xdp(dev);
		if (err)
			return err;
	} else if (veth_gro_requested(dev)) {
		err = veth_napi_enable(dev);
		if (err)
			return err;
	}

	/* the link goes up only once both ends are up */
	if (peer->flags & IFF_UP) {
		netif_carrier_on(dev);
		netif_carrier_on(peer);
	}

	veth_set_xdp_features(dev);

	return 0;
}

/* Bring @dev down.
 *
 * Carrier is turned off on @dev and, when present, on the peer. XDP is then
 * disabled when a program is attached; otherwise napi is deleted when GRO is
 * requested. Always returns 0.
 */
static int veth_close(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	netif_carrier_off(dev);
	if (peer)
		netif_carrier_off(peer);

	if (priv->_xdp_prog)
		veth_disable_xdp(dev);
	else if (veth_gro_requested(dev))
		veth_napi_del(dev);

	return 0;
}

/* Return non-zero when @mtu lies within [ETH_MIN_MTU, ETH_MAX_MTU]. */
static int is_valid_veth_mtu(int mtu)
{
	return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
}

/* Allocate the per-queue state array of @dev.
 *
 * One struct veth_rq is allocated per dev->num_rx_queues; each entry gets
 * its dev pointer set and its stats sync object initialized.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int veth_alloc_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	priv->rq = kvzalloc_objs(*priv->rq, dev->num_rx_queues,
				 GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
	if (!priv->rq)
		return -ENOMEM;

	for (i = 0; i < dev->num_rx_queues; i++) {
		priv->rq[i].dev = dev;
		u64_stats_init(&priv->rq[i].stats.syncp);
	}

	return 0;
}

/* Free the per-queue state array allocated by veth_alloc_queues(). */
static void veth_free_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);

	kvfree(priv->rq);
}

/* ndo_init handler: set the lockdep classes and allocate the queues. */
static int veth_dev_init(struct net_device *dev)
{
	netdev_lockdep_set_classes(dev);
	return veth_alloc_queues(dev);
}

/* priv_destructor handler: release the per-queue state of @dev. */
static void veth_dev_free(struct net_device *dev)
{
	veth_free_queues(dev);
}

#ifdef CONFIG_NET_POLL_CONTROLLER
/* ndo_poll_controller handler; intentionally empty, see below. */
static void veth_poll_controller(struct net_device *dev)
{
	/* veth only receives frames when its peer sends one
	 * Since it has nothing to do with
disabling irqs, we are guaranteed
	 * never to have pending data when we poll for it so
	 * there is nothing to do here.
	 *
	 * We need this though so netpoll recognizes us as an interface that
	 * supports polling, which enables bridge devices in virt setups to
	 * still use netconsole
	 */
}
#endif /* CONFIG_NET_POLL_CONTROLLER */

/* ndo_get_iflink handler.
 *
 * Returns the ifindex of the peer, read under rcu_read_lock(), or 0 when
 * @dev has no peer.
 */
static int veth_get_iflink(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	int iflink;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	iflink = peer ? READ_ONCE(peer->ifindex) : 0;
	rcu_read_unlock();

	return iflink;
}

/* ndo_fix_features handler.
 *
 * Removes NETIF_F_GSO_SOFTWARE from @features when the peer has an XDP
 * program attached; otherwise @features is returned unchanged.
 */
static netdev_features_t veth_fix_features(struct net_device *dev,
					   netdev_features_t features)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	peer = rtnl_dereference(priv->peer);
	if (peer) {
		struct veth_priv *peer_priv = netdev_priv(peer);

		if (peer_priv->_xdp_prog)
			features &= ~NETIF_F_GSO_SOFTWARE;
	}

	return features;
}

/* ndo_set_features handler; only a change of NETIF_F_GRO is acted upon.
 *
 * Nothing is done unless the GRO bit differs from dev->features, the device
 * has IFF_UP set and no XDP program is attached. When GRO gets enabled, napi
 * is enabled and xdp_features_set_redirect_target() is called on the peer;
 * when it gets disabled, the redirect target flag of the peer is cleared and
 * napi is deleted.
 *
 * Returns 0, or the error from veth_napi_enable().
 */
static int veth_set_features(struct net_device *dev,
			     netdev_features_t features)
{
	netdev_features_t changed = features ^ dev->features;
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	int err;

	if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog)
		return 0;

	peer = rtnl_dereference(priv->peer);
	if (features & NETIF_F_GRO) {
		err = veth_napi_enable(dev);
		if (err)
			return err;

		if (peer)
			xdp_features_set_redirect_target(peer, true);
	} else {
		if (peer)
			xdp_features_clear_redirect_target(peer);
		veth_napi_del(dev);
	}
	return 0;
}

/* ndo_set_rx_headroom handler.
 *
 * A negative @new_hr is treated as 0. The value is recorded as the headroom
 * requested for @dev, and needed_headroom of both @dev and its peer is set
 * to the larger of the two devices' requested values. Nothing is changed
 * when @dev has no peer.
 */
static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
struct net_device *peer;

	if (new_hr < 0)
		new_hr = 0;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (unlikely(!peer))
		goto out;

	peer_priv = netdev_priv(peer);
	priv->requested_headroom = new_hr;
	/* both ends share the larger of the two requests */
	new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
	dev->needed_headroom = new_hr;
	peer->needed_headroom = new_hr;

out:
	rcu_read_unlock();
}

/* Attach @prog as the XDP program of @dev, or detach the current program
 * when @prog is NULL.
 *
 * Attaching requires a peer (-ENOTCONN), a peer MTU not above the computed
 * max_mtu (-ERANGE) and at least as many real rx queues on @dev as the peer
 * has real tx queues (-ENOSPC); the reason is reported through @extack. If
 * the device has IFF_UP set, XDP is enabled (on attach) or disabled (on
 * detach) right away.
 *
 * On the first attach NETIF_F_GSO_SOFTWARE is removed from the peer's
 * hw_features and the peer's max_mtu is lowered; on detach both are put
 * back. The reference on the replaced program, if any, is dropped with
 * bpf_prog_put(). On failure priv->_xdp_prog is reset to the previous
 * program.
 *
 * Returns 0 on success or a negative error code.
 */
static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			struct netlink_ext_ack *extack)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct bpf_prog *old_prog;
	struct net_device *peer;
	unsigned int max_mtu;
	int err;

	/* the new program is stored up front; the err path restores old_prog */
	old_prog = priv->_xdp_prog;
	priv->_xdp_prog = prog;
	peer = rtnl_dereference(priv->peer);

	if (prog) {
		if (!peer) {
			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
			err = -ENOTCONN;
			goto err;
		}

		max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) -
			  peer->hard_header_len;
		/* Allow increasing the max_mtu if the program supports
		 * XDP fragments.
*/
		if (prog->aux->xdp_has_frags)
			max_mtu += PAGE_SIZE * MAX_SKB_FRAGS;

		if (peer->mtu > max_mtu) {
			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
			err = -ERANGE;
			goto err;
		}

		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
			NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
			err = -ENOSPC;
			goto err;
		}

		if (dev->flags & IFF_UP) {
			err = veth_enable_xdp(dev);
			if (err) {
				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
				goto err;
			}
		}

		/* first attach: restrict what the peer may send to us */
		if (!old_prog) {
			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
			peer->max_mtu = max_mtu;
		}

		xdp_features_set_redirect_target(peer, true);
	}

	if (old_prog) {
		if (!prog) {
			/* detach: undo what the first attach did */
			if (peer && !veth_gro_requested(dev))
				xdp_features_clear_redirect_target(peer);

			if (dev->flags & IFF_UP)
				veth_disable_xdp(dev);

			if (peer) {
				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
				peer->max_mtu = ETH_MAX_MTU;
			}
		}
		bpf_prog_put(old_prog);
	}

	/* only a transition between "no program" and "program" changes the
	 * peer's hw_features above
	 */
	if ((!!old_prog ^ !!prog) && peer)
		netdev_update_features(peer);

	return 0;
err:
	priv->_xdp_prog = old_prog;

	return err;
}

/* ndo_bpf handler.
 *
 * XDP_SETUP_PROG is forwarded to veth_xdp_set(); every other command is
 * rejected with -EINVAL.
 */
static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return veth_xdp_set(dev, xdp->prog, xdp->extack);
	default:
		return -EINVAL;
	}
}

/* xmo_rx_timestamp handler.
 *
 * Stores the hwtstamp of the skb attached to @ctx in @timestamp and returns
 * 0, or returns -ENODATA when the context carries no skb.
 */
static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
{
	struct veth_xdp_buff *_ctx = (void *)ctx;

	if (!_ctx->skb)
		return -ENODATA;

	*timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;
	return 0;
}

/* xmo_rx_hash handler.
 *
 * Stores skb_get_hash() of the skb attached to @ctx in @hash; @rss_type is
 * set to XDP_RSS_TYPE_L4_ANY when skb->l4_hash is set and to
 * XDP_RSS_TYPE_NONE otherwise. Returns 0, or -ENODATA when the context
 * carries no skb.
 */
static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
			    enum xdp_rss_hash_type *rss_type)
{
	struct veth_xdp_buff *_ctx = (void *)ctx;
struct sk_buff *skb = _ctx->skb;

	if (!skb)
		return -ENODATA;

	*hash = skb_get_hash(skb);
	*rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE;

	return 0;
}

/* xmo_rx_vlan_tag handler.
 *
 * Fetches the VLAN TCI of the skb attached to @ctx with
 * __vlan_hwaccel_get_tag() and, on success, stores skb->vlan_proto in
 * @vlan_proto. Returns -ENODATA when the context carries no skb, otherwise
 * the result of __vlan_hwaccel_get_tag().
 */
static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto,
				u16 *vlan_tci)
{
	const struct veth_xdp_buff *_ctx = (void *)ctx;
	const struct sk_buff *skb = _ctx->skb;
	int err;

	if (!skb)
		return -ENODATA;

	err = __vlan_hwaccel_get_tag(skb, vlan_tci);
	if (err)
		return err;

	*vlan_proto = skb->vlan_proto;
	return err;
}

/* net_device operations of a veth device */
static const struct net_device_ops veth_netdev_ops = {
	.ndo_init            = veth_dev_init,
	.ndo_open            = veth_open,
	.ndo_stop            = veth_close,
	.ndo_start_xmit      = veth_xmit,
	.ndo_get_stats64     = veth_get_stats64,
	.ndo_set_rx_mode     = veth_set_multicast_list,
	.ndo_set_mac_address = eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= veth_poll_controller,
#endif
	.ndo_get_iflink		= veth_get_iflink,
	.ndo_fix_features	= veth_fix_features,
	.ndo_set_features	= veth_set_features,
	.ndo_features_check	= passthru_features_check,
	.ndo_set_rx_headroom	= veth_set_rx_headroom,
	.ndo_bpf		= veth_xdp,
	.ndo_xdp_xmit		= veth_ndo_xdp_xmit,
	.ndo_get_peer_dev	= veth_peer_dev,
};

/* XDP metadata accessors; each one reads from the skb attached to the
 * XDP context and reports -ENODATA when there is none
 */
static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
	.xmo_rx_timestamp		= veth_xdp_rx_timestamp,
	.xmo_rx_hash			= veth_xdp_rx_hash,
	.xmo_rx_vlan_tag		= veth_xdp_rx_vlan_tag,
};

/* feature set used for features, hw_features and hw_enc_features */
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
		       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
		       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX )

/* rtnl_link_ops setup handler: initialize @dev as a veth device.
 *
 * Starts from ether_setup() and then sets the veth private flags, the ops
 * tables, the feature masks, the destructor and the MTU/TSO limits.
 */
static void veth_setup(struct net_device *dev)
{
ether_setup(dev);

	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_PHONY_HEADROOM;
	dev->priv_flags |= IFF_DISABLE_NETPOLL;
	dev->lltx = true;

	dev->netdev_ops = &veth_netdev_ops;
	dev->xdp_metadata_ops = &veth_xdp_metadata_ops;
	dev->ethtool_ops = &veth_ethtool_ops;
	dev->features |= VETH_FEATURES;
	/* vlan_features: everything but the VLAN tag offload bits */
	dev->vlan_features = dev->features &
			     ~(NETIF_F_HW_VLAN_CTAG_TX |
			       NETIF_F_HW_VLAN_STAG_TX |
			       NETIF_F_HW_VLAN_CTAG_RX |
			       NETIF_F_HW_VLAN_STAG_RX);
	dev->needs_free_netdev = true;
	dev->priv_destructor = veth_dev_free;
	dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
	dev->max_mtu = ETH_MAX_MTU;

	dev->hw_features = VETH_FEATURES;
	dev->hw_enc_features = VETH_FEATURES;
	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
}

/*
 * netlink interface
 */

/* rtnl_link_ops validate handler.
 *
 * When present, IFLA_ADDRESS must be ETH_ALEN bytes long (-EINVAL) and a
 * valid Ethernet address (-EADDRNOTAVAIL), and IFLA_MTU must pass
 * is_valid_veth_mtu() (-EINVAL). Returns 0 when all the checks pass.
 */
static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}
	if (tb[IFLA_MTU]) {
		if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
			return -EINVAL;
	}
	return 0;
}

/* forward declaration: veth_newlink() needs it before the definition */
static struct rtnl_link_ops veth_link_ops;

/* Clear NETIF_F_GRO from both features and wanted_features of @dev, then
 * have the feature set re-evaluated with netdev_update_features().
 */
static void veth_disable_gro(struct net_device *dev)
{
	dev->features &= ~NETIF_F_GRO;
	dev->wanted_features &= ~NETIF_F_GRO;
	netdev_update_features(dev);
}

/* Pick the initial number of queues in use on @dev.
 *
 * For tx and rx alike: when the matching IFLA_NUM_{TX,RX}_QUEUES attribute
 * is absent from @tb and more than one queue was allocated, the real queue
 * count is set to 1.
 *
 * Returns 0 on success or the error from netif_set_real_num_*_queues().
 */
static int veth_init_queues(struct net_device *dev, struct nlattr *tb[])
{
	int err;

	if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) {
		err = netif_set_real_num_tx_queues(dev, 1);
		if (err)
			return err;
	}
	if
(!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) {
		err = netif_set_real_num_rx_queues(dev, 1);
		if (err)
			return err;
	}
	return 0;
}

/* rtnl_link_ops newlink handler: create a veth pair made of @dev and a
 * newly created peer.
 *
 * The peer is created, registered and configured first. Its attributes are
 * taken from the VETH_INFO_PEER nested attribute when that is present and
 * from the attributes of @dev otherwise. @dev is registered afterwards and
 * the two devices are then linked through their priv->peer pointers. GRO is
 * disabled on both devices and carrier is turned off on both.
 *
 * Returns 0 on success. On failure whatever was registered so far is
 * unregistered (or the unregistered peer is freed) and a negative error
 * code is returned.
 */
static int veth_newlink(struct net_device *dev,
			struct rtnl_newlink_params *params,
			struct netlink_ext_ack *extack)
{
	struct net *peer_net = rtnl_newlink_peer_net(params);
	struct nlattr **data = params->data;
	struct nlattr **tb = params->tb;
	int err;
	struct net_device *peer;
	struct veth_priv *priv;
	char ifname[IFNAMSIZ];
	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
	unsigned char name_assign_type;
	struct ifinfomsg *ifmp;

	/*
	 * create and register peer first
	 */
	if (data && data[VETH_INFO_PEER]) {
		struct nlattr *nla_peer = data[VETH_INFO_PEER];

		ifmp = nla_data(nla_peer);
		rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);
		tbp = peer_tb;
	} else {
		/* no peer attributes: reuse the ones given for dev */
		ifmp = NULL;
		tbp = tb;
	}

	if (ifmp && tbp[IFLA_IFNAME]) {
		nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
		name_assign_type = NET_NAME_USER;
	} else {
		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
		name_assign_type = NET_NAME_ENUM;
	}

	peer = rtnl_create_link(peer_net, ifname, name_assign_type,
				&veth_link_ops, tbp, extack);
	if (IS_ERR(peer))
		return PTR_ERR(peer);

	if (!ifmp || !tbp[IFLA_ADDRESS])
		eth_hw_addr_random(peer);

	if (ifmp && (dev->ifindex != 0))
		peer->ifindex = ifmp->ifi_index;

	netif_inherit_tso_max(peer, dev);

	err = register_netdevice(peer);
	if (err < 0)
		goto err_register_peer;

	/* keep GRO disabled by default to be consistent with the established
	 * veth behavior
	 */
	veth_disable_gro(peer);
	netif_carrier_off(peer);

	err = rtnl_configure_link(peer, ifmp, 0, NULL);
	if (err < 0)
		goto err_configure_peer;

	/*
	 * register dev last
	 *
	 *
note, that since we've registered new device the dev's name
	 * should be re-allocated
	 */

	if (tb[IFLA_ADDRESS] == NULL)
		eth_hw_addr_random(dev);

	if (tb[IFLA_IFNAME])
		nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
	else
		snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");

	err = register_netdevice(dev);
	if (err < 0)
		goto err_register_dev;

	netif_carrier_off(dev);

	/*
	 * tie the devices together
	 */

	priv = netdev_priv(dev);
	rcu_assign_pointer(priv->peer, peer);
	err = veth_init_queues(dev, tb);
	if (err)
		goto err_queues;

	priv = netdev_priv(peer);
	rcu_assign_pointer(priv->peer, dev);
	err = veth_init_queues(peer, tb);
	if (err)
		goto err_queues;

	veth_disable_gro(dev);
	/* update XDP supported features */
	veth_set_xdp_features(dev);
	veth_set_xdp_features(peer);

	return 0;

err_queues:
	/* both devices are registered at this point */
	unregister_netdevice(dev);
err_register_dev:
	/* nothing to do */
err_configure_peer:
	unregister_netdevice(peer);
	return err;

err_register_peer:
	/* the peer never got registered, so it is freed directly */
	free_netdev(peer);
	return err;
}

/* rtnl_link_ops dellink handler: queue both ends of the pair for
 * unregistration on @head.
 *
 * The priv->peer pointers of @dev and, when present, of the peer are set to
 * NULL before the respective device is queued.
 */
static void veth_dellink(struct net_device *dev, struct list_head *head)
{
	struct veth_priv *priv;
	struct net_device *peer;

	priv = netdev_priv(dev);
	peer = rtnl_dereference(priv->peer);

	/* Note : dellink() is called from default_device_exit_batch(),
	 * before a rcu_synchronize() point. The devices are guaranteed
	 * not being freed before one RCU grace period.
*/
	RCU_INIT_POINTER(priv->peer, NULL);
	unregister_netdevice_queue(dev, head);

	if (peer) {
		priv = netdev_priv(peer);
		RCU_INIT_POINTER(priv->peer, NULL);
		unregister_netdevice_queue(peer, head);
	}
}

/* netlink policy for the veth-specific attributes: VETH_INFO_PEER carries
 * a struct ifinfomsg
 */
static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
	[VETH_INFO_PEER]	= { .len = sizeof(struct ifinfomsg) },
};

/* rtnl_link_ops get_link_net handler.
 *
 * Returns the network namespace of the peer, or the one of @dev itself when
 * there is no peer.
 */
static struct net *veth_get_link_net(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	return peer ? dev_net(peer) : dev_net(dev);
}

/* Number of tx/rx queues to allocate for a veth device: the number of
 * possible CPUs, capped at 4096.
 */
static unsigned int veth_get_num_queues(void)
{
	/* enforce the same queue limit as rtnl_create_link */
	int queues = num_possible_cpus();

	if (queues > 4096)
		queues = 4096;
	return queues;
}

/* rtnl link operations for the "veth" link kind */
static struct rtnl_link_ops veth_link_ops = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct veth_priv),
	.setup		= veth_setup,
	.validate	= veth_validate,
	.newlink	= veth_newlink,
	.dellink	= veth_dellink,
	.policy		= veth_policy,
	.peer_type	= VETH_INFO_PEER,
	.maxtype	= VETH_INFO_MAX,
	.get_link_net	= veth_get_link_net,
	.get_num_tx_queues	= veth_get_num_queues,
	.get_num_rx_queues	= veth_get_num_queues,
};

/*
 * init/fini
 */

/* Module init: register the veth rtnl link operations. */
static __init int veth_init(void)
{
	return rtnl_link_register(&veth_link_ops);
}

/* Module exit: unregister the veth rtnl link operations. */
static __exit void veth_exit(void)
{
	rtnl_link_unregister(&veth_link_ops);
}

module_init(veth_init);
module_exit(veth_exit);

MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);