1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/netdev_lock.h> 21 #include <net/xfrm.h> 22 #include <net/xdp.h> 23 #include <linux/veth.h> 24 #include <linux/module.h> 25 #include <linux/bpf.h> 26 #include <linux/filter.h> 27 #include <linux/ptr_ring.h> 28 #include <linux/bpf_trace.h> 29 #include <linux/net_tstamp.h> 30 #include <linux/skbuff_ref.h> 31 #include <net/page_pool/helpers.h> 32 33 #define DRV_NAME "veth" 34 #define DRV_VERSION "1.0" 35 36 #define VETH_XDP_FLAG BIT(0) 37 #define VETH_RING_SIZE 256 38 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 39 40 #define VETH_XDP_TX_BULK_SIZE 16 41 #define VETH_XDP_BATCH 16 42 43 struct veth_stats { 44 u64 rx_drops; 45 /* xdp */ 46 u64 xdp_packets; 47 u64 xdp_bytes; 48 u64 xdp_redirect; 49 u64 xdp_drops; 50 u64 xdp_tx; 51 u64 xdp_tx_err; 52 u64 peer_tq_xdp_xmit; 53 u64 peer_tq_xdp_xmit_err; 54 }; 55 56 struct veth_rq_stats { 57 struct veth_stats vs; 58 struct u64_stats_sync syncp; 59 }; 60 61 struct veth_rq { 62 struct napi_struct xdp_napi; 63 struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ 64 struct net_device *dev; 65 struct bpf_prog __rcu *xdp_prog; 66 struct xdp_mem_info xdp_mem; 67 struct veth_rq_stats stats; 68 bool rx_notify_masked; 69 struct ptr_ring xdp_ring; 70 struct xdp_rxq_info xdp_rxq; 71 struct page_pool *page_pool; 72 }; 73 74 struct veth_priv { 75 struct net_device __rcu *peer; 76 atomic64_t dropped; 77 struct bpf_prog *_xdp_prog; 78 struct veth_rq *rq; 79 unsigned int requested_headroom; 80 }; 81 82 struct veth_xdp_tx_bq { 83 struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; 84 unsigned int count; 85 }; 86 87 /* 88 * ethtool interface 89 */ 90 91 struct veth_q_stat_desc { 92 char desc[ETH_GSTRING_LEN]; 93 size_t offset; 94 }; 95 96 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 97 98 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 99 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 100 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 101 { "drops", VETH_RQ_STAT(rx_drops) }, 102 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 103 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 104 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 105 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 106 }; 107 108 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 109 110 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 111 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 112 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 113 }; 114 115 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 116 117 static struct { 118 const char string[ETH_GSTRING_LEN]; 119 } ethtool_stats_keys[] = { 120 { "peer_ifindex" }, 121 }; 122 123 struct veth_xdp_buff { 124 struct xdp_buff xdp; 125 struct sk_buff *skb; 126 }; 127 128 static int veth_get_link_ksettings(struct net_device *dev, 129 struct ethtool_link_ksettings *cmd) 130 { 131 cmd->base.speed = SPEED_10000; 132 cmd->base.duplex = DUPLEX_FULL; 133 cmd->base.port = PORT_TP; 134 cmd->base.autoneg = AUTONEG_DISABLE; 135 return 0; 136 } 137 138 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 139 { 140 strscpy(info->driver, DRV_NAME, sizeof(info->driver)); 141 strscpy(info->version, DRV_VERSION, sizeof(info->version)); 142 } 143 144 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 145 { 146 u8 *p = buf; 147 int i, j; 148 149 switch(stringset) { 150 case ETH_SS_STATS: 151 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 152 p += sizeof(ethtool_stats_keys); 153 for (i = 0; i < dev->real_num_rx_queues; i++) 154 for (j = 0; j < VETH_RQ_STATS_LEN; j++) 155 ethtool_sprintf(&p, "rx_queue_%u_%.18s", 156 i, veth_rq_stats_desc[j].desc); 157 158 for (i = 0; i < dev->real_num_tx_queues; i++) 159 for (j = 0; j < VETH_TQ_STATS_LEN; j++) 160 ethtool_sprintf(&p, "tx_queue_%u_%.18s", 161 i, veth_tq_stats_desc[j].desc); 162 163 page_pool_ethtool_stats_get_strings(p); 164 break; 165 } 166 } 167 168 static int veth_get_sset_count(struct net_device *dev, int sset) 169 { 170 switch (sset) { 171 case ETH_SS_STATS: 172 return ARRAY_SIZE(ethtool_stats_keys) + 173 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 174 VETH_TQ_STATS_LEN * dev->real_num_tx_queues + 175 page_pool_ethtool_stats_get_count(); 176 default: 177 return -EOPNOTSUPP; 178 } 179 } 180 181 static void veth_get_page_pool_stats(struct net_device *dev, u64 *data) 182 { 183 #ifdef CONFIG_PAGE_POOL_STATS 184 struct veth_priv *priv = netdev_priv(dev); 185 struct page_pool_stats pp_stats = {}; 186 int i; 187 188 for (i = 0; i < dev->real_num_rx_queues; i++) { 189 if (!priv->rq[i].page_pool) 190 continue; 191 page_pool_get_stats(priv->rq[i].page_pool, &pp_stats); 192 } 193 page_pool_ethtool_stats_get(data, &pp_stats); 194 #endif /* CONFIG_PAGE_POOL_STATS */ 195 } 196 197 static void veth_get_ethtool_stats(struct net_device *dev, 198 struct ethtool_stats *stats, u64 *data) 199 { 200 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 201 struct net_device *peer = rtnl_dereference(priv->peer); 202 int i, j, idx, pp_idx; 203 204 data[0] = peer ? peer->ifindex : 0; 205 idx = 1; 206 for (i = 0; i < dev->real_num_rx_queues; i++) { 207 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 208 const void *stats_base = (void *)&rq_stats->vs; 209 unsigned int start; 210 size_t offset; 211 212 do { 213 start = u64_stats_fetch_begin(&rq_stats->syncp); 214 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 215 offset = veth_rq_stats_desc[j].offset; 216 data[idx + j] = *(u64 *)(stats_base + offset); 217 } 218 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 219 idx += VETH_RQ_STATS_LEN; 220 } 221 pp_idx = idx; 222 223 if (!peer) 224 goto page_pool_stats; 225 226 rcv_priv = netdev_priv(peer); 227 for (i = 0; i < peer->real_num_rx_queues; i++) { 228 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 229 const void *base = (void *)&rq_stats->vs; 230 unsigned int start, tx_idx = idx; 231 size_t offset; 232 233 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 234 do { 235 start = u64_stats_fetch_begin(&rq_stats->syncp); 236 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 237 offset = veth_tq_stats_desc[j].offset; 238 data[tx_idx + j] += *(u64 *)(base + offset); 239 } 240 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 241 } 242 pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN; 243 244 page_pool_stats: 245 veth_get_page_pool_stats(dev, &data[pp_idx]); 246 } 247 248 static void veth_get_channels(struct net_device *dev, 249 struct ethtool_channels *channels) 250 { 251 channels->tx_count = dev->real_num_tx_queues; 252 channels->rx_count = dev->real_num_rx_queues; 253 channels->max_tx = dev->num_tx_queues; 254 channels->max_rx = dev->num_rx_queues; 255 } 256 257 static int veth_set_channels(struct net_device *dev, 258 struct ethtool_channels *ch); 259 260 static const struct ethtool_ops veth_ethtool_ops = { 261 .get_drvinfo = veth_get_drvinfo, 262 .get_link = ethtool_op_get_link, 263 .get_strings = veth_get_strings, 264 .get_sset_count = veth_get_sset_count, 265 .get_ethtool_stats = veth_get_ethtool_stats, 266 .get_link_ksettings = veth_get_link_ksettings, 267 .get_ts_info = ethtool_op_get_ts_info, 268 .get_channels = veth_get_channels, 269 .set_channels = veth_set_channels, 270 }; 271 272 /* general routines */ 273 274 static bool veth_is_xdp_frame(void *ptr) 275 { 276 return (unsigned long)ptr & VETH_XDP_FLAG; 277 } 278 279 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 280 { 281 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 282 } 283 284 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 285 { 286 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 287 } 288 289 static void veth_ptr_free(void *ptr) 290 { 291 if (veth_is_xdp_frame(ptr)) 292 xdp_return_frame(veth_ptr_to_xdp(ptr)); 293 else 294 kfree_skb(ptr); 295 } 296 297 static void __veth_xdp_flush(struct veth_rq *rq) 298 { 299 /* Write ptr_ring before reading rx_notify_masked */ 300 smp_mb(); 301 if (!READ_ONCE(rq->rx_notify_masked) && 302 napi_schedule_prep(&rq->xdp_napi)) { 303 WRITE_ONCE(rq->rx_notify_masked, true); 304 __napi_schedule(&rq->xdp_napi); 305 } 306 } 307 308 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 309 { 310 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) 311 return NETDEV_TX_BUSY; /* signal qdisc layer */ 312 313 return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */ 314 } 315 316 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 317 struct veth_rq *rq, bool xdp) 318 { 319 return __dev_forward_skb(dev, skb) ?: xdp ? 320 veth_xdp_rx(rq, skb) : 321 __netif_rx(skb); 322 } 323 324 /* return true if the specified skb has chances of GRO aggregation 325 * Don't strive for accuracy, but try to avoid GRO overhead in the most 326 * common scenarios. 327 * When XDP is enabled, all traffic is considered eligible, as the xmit 328 * device has TSO off. 329 * When TSO is enabled on the xmit device, we are likely interested only 330 * in UDP aggregation, explicitly check for that if the skb is suspected 331 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - 332 * to belong to locally generated UDP traffic. 333 */ 334 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, 335 const struct net_device *rcv, 336 const struct sk_buff *skb) 337 { 338 return !(dev->features & NETIF_F_ALL_TSO) || 339 (skb->destructor == sock_wfree && 340 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); 341 } 342 343 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 344 { 345 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 346 struct veth_rq *rq = NULL; 347 struct netdev_queue *txq; 348 struct net_device *rcv; 349 int length = skb->len; 350 bool use_napi = false; 351 int ret, rxq; 352 353 rcu_read_lock(); 354 rcv = rcu_dereference(priv->peer); 355 if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { 356 kfree_skb(skb); 357 goto drop; 358 } 359 360 rcv_priv = netdev_priv(rcv); 361 rxq = skb_get_queue_mapping(skb); 362 if (rxq < rcv->real_num_rx_queues) { 363 rq = &rcv_priv->rq[rxq]; 364 365 /* The napi pointer is available when an XDP program is 366 * attached or when GRO is enabled 367 * Don't bother with napi/GRO if the skb can't be aggregated 368 */ 369 use_napi = rcu_access_pointer(rq->napi) && 370 veth_skb_is_eligible_for_gro(dev, rcv, skb); 371 } 372 373 skb_tx_timestamp(skb); 374 375 ret = veth_forward_skb(rcv, skb, rq, use_napi); 376 switch (ret) { 377 case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */ 378 if (!use_napi) 379 dev_sw_netstats_tx_add(dev, 1, length); 380 else 381 __veth_xdp_flush(rq); 382 break; 383 case NETDEV_TX_BUSY: 384 /* If a qdisc is attached to our virtual device, returning 385 * NETDEV_TX_BUSY is allowed. 386 */ 387 txq = netdev_get_tx_queue(dev, rxq); 388 389 if (qdisc_txq_has_no_queue(txq)) { 390 dev_kfree_skb_any(skb); 391 goto drop; 392 } 393 /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */ 394 __skb_push(skb, ETH_HLEN); 395 netif_tx_stop_queue(txq); 396 /* Makes sure NAPI peer consumer runs. Consumer is responsible 397 * for starting txq again, until then ndo_start_xmit (this 398 * function) will not be invoked by the netstack again. 399 */ 400 __veth_xdp_flush(rq); 401 break; 402 case NET_RX_DROP: /* same as NET_XMIT_DROP */ 403 drop: 404 atomic64_inc(&priv->dropped); 405 ret = NET_XMIT_DROP; 406 break; 407 default: 408 net_crit_ratelimited("%s(%s): Invalid return code(%d)", 409 __func__, dev->name, ret); 410 } 411 rcu_read_unlock(); 412 413 return ret; 414 } 415 416 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 417 { 418 struct veth_priv *priv = netdev_priv(dev); 419 int i; 420 421 result->peer_tq_xdp_xmit_err = 0; 422 result->xdp_packets = 0; 423 result->xdp_tx_err = 0; 424 result->xdp_bytes = 0; 425 result->rx_drops = 0; 426 for (i = 0; i < dev->num_rx_queues; i++) { 427 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 428 struct veth_rq_stats *stats = &priv->rq[i].stats; 429 unsigned int start; 430 431 do { 432 start = u64_stats_fetch_begin(&stats->syncp); 433 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 434 xdp_tx_err = stats->vs.xdp_tx_err; 435 packets = stats->vs.xdp_packets; 436 bytes = stats->vs.xdp_bytes; 437 drops = stats->vs.rx_drops; 438 } while (u64_stats_fetch_retry(&stats->syncp, start)); 439 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 440 result->xdp_tx_err += xdp_tx_err; 441 result->xdp_packets += packets; 442 result->xdp_bytes += bytes; 443 result->rx_drops += drops; 444 } 445 } 446 447 static void veth_get_stats64(struct net_device *dev, 448 struct rtnl_link_stats64 *tot) 449 { 450 struct veth_priv *priv = netdev_priv(dev); 451 struct net_device *peer; 452 struct veth_stats rx; 453 454 tot->tx_dropped = atomic64_read(&priv->dropped); 455 dev_fetch_sw_netstats(tot, dev->tstats); 456 457 veth_stats_rx(&rx, dev); 458 tot->tx_dropped += rx.xdp_tx_err; 459 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 460 tot->rx_bytes += rx.xdp_bytes; 461 tot->rx_packets += rx.xdp_packets; 462 463 rcu_read_lock(); 464 peer = rcu_dereference(priv->peer); 465 if (peer) { 466 struct rtnl_link_stats64 tot_peer = {}; 467 468 dev_fetch_sw_netstats(&tot_peer, peer->tstats); 469 tot->rx_bytes += tot_peer.tx_bytes; 470 tot->rx_packets += tot_peer.tx_packets; 471 472 veth_stats_rx(&rx, peer); 473 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 474 tot->rx_dropped += rx.xdp_tx_err; 475 tot->tx_bytes += rx.xdp_bytes; 476 tot->tx_packets += rx.xdp_packets; 477 } 478 rcu_read_unlock(); 479 } 480 481 /* fake multicast ability */ 482 static void veth_set_multicast_list(struct net_device *dev) 483 { 484 } 485 486 static int veth_select_rxq(struct net_device *dev) 487 { 488 return smp_processor_id() % dev->real_num_rx_queues; 489 } 490 491 static struct net_device *veth_peer_dev(struct net_device *dev) 492 { 493 struct veth_priv *priv = netdev_priv(dev); 494 495 /* Callers must be under RCU read side. */ 496 return rcu_dereference(priv->peer); 497 } 498 499 static int veth_xdp_xmit(struct net_device *dev, int n, 500 struct xdp_frame **frames, 501 u32 flags, bool ndo_xmit) 502 { 503 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 504 int i, ret = -ENXIO, nxmit = 0; 505 struct net_device *rcv; 506 unsigned int max_len; 507 struct veth_rq *rq; 508 509 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 510 return -EINVAL; 511 512 rcu_read_lock(); 513 rcv = rcu_dereference(priv->peer); 514 if (unlikely(!rcv)) 515 goto out; 516 517 rcv_priv = netdev_priv(rcv); 518 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 519 /* The napi pointer is set if NAPI is enabled, which ensures that 520 * xdp_ring is initialized on receive side and the peer device is up. 521 */ 522 if (!rcu_access_pointer(rq->napi)) 523 goto out; 524 525 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 526 527 spin_lock(&rq->xdp_ring.producer_lock); 528 for (i = 0; i < n; i++) { 529 struct xdp_frame *frame = frames[i]; 530 void *ptr = veth_xdp_to_ptr(frame); 531 532 if (unlikely(xdp_get_frame_len(frame) > max_len || 533 __ptr_ring_produce(&rq->xdp_ring, ptr))) 534 break; 535 nxmit++; 536 } 537 spin_unlock(&rq->xdp_ring.producer_lock); 538 539 if (flags & XDP_XMIT_FLUSH) 540 __veth_xdp_flush(rq); 541 542 ret = nxmit; 543 if (ndo_xmit) { 544 u64_stats_update_begin(&rq->stats.syncp); 545 rq->stats.vs.peer_tq_xdp_xmit += nxmit; 546 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; 547 u64_stats_update_end(&rq->stats.syncp); 548 } 549 550 out: 551 rcu_read_unlock(); 552 553 return ret; 554 } 555 556 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 557 struct xdp_frame **frames, u32 flags) 558 { 559 int err; 560 561 err = veth_xdp_xmit(dev, n, frames, flags, true); 562 if (err < 0) { 563 struct veth_priv *priv = netdev_priv(dev); 564 565 atomic64_add(n, &priv->dropped); 566 } 567 568 return err; 569 } 570 571 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 572 { 573 int sent, i, err = 0, drops; 574 575 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 576 if (sent < 0) { 577 err = sent; 578 sent = 0; 579 } 580 581 for (i = sent; unlikely(i < bq->count); i++) 582 xdp_return_frame(bq->q[i]); 583 584 drops = bq->count - sent; 585 trace_xdp_bulk_tx(rq->dev, sent, drops, err); 586 587 u64_stats_update_begin(&rq->stats.syncp); 588 rq->stats.vs.xdp_tx += sent; 589 rq->stats.vs.xdp_tx_err += drops; 590 u64_stats_update_end(&rq->stats.syncp); 591 592 bq->count = 0; 593 } 594 595 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 596 { 597 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 598 struct net_device *rcv; 599 struct veth_rq *rcv_rq; 600 601 rcu_read_lock(); 602 veth_xdp_flush_bq(rq, bq); 603 rcv = rcu_dereference(priv->peer); 604 if (unlikely(!rcv)) 605 goto out; 606 607 rcv_priv = netdev_priv(rcv); 608 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 609 /* xdp_ring is initialized on receive side? */ 610 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 611 goto out; 612 613 __veth_xdp_flush(rcv_rq); 614 out: 615 rcu_read_unlock(); 616 } 617 618 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 619 struct veth_xdp_tx_bq *bq) 620 { 621 struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); 622 623 if (unlikely(!frame)) 624 return -EOVERFLOW; 625 626 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 627 veth_xdp_flush_bq(rq, bq); 628 629 bq->q[bq->count++] = frame; 630 631 return 0; 632 } 633 634 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, 635 struct xdp_frame *frame, 636 struct veth_xdp_tx_bq *bq, 637 struct veth_stats *stats) 638 { 639 struct xdp_frame orig_frame; 640 struct bpf_prog *xdp_prog; 641 642 rcu_read_lock(); 643 xdp_prog = rcu_dereference(rq->xdp_prog); 644 if (likely(xdp_prog)) { 645 struct veth_xdp_buff vxbuf; 646 struct xdp_buff *xdp = &vxbuf.xdp; 647 u32 act; 648 649 xdp_convert_frame_to_buff(frame, xdp); 650 xdp->rxq = &rq->xdp_rxq; 651 vxbuf.skb = NULL; 652 653 act = bpf_prog_run_xdp(xdp_prog, xdp); 654 655 switch (act) { 656 case XDP_PASS: 657 if (xdp_update_frame_from_buff(xdp, frame)) 658 goto err_xdp; 659 break; 660 case XDP_TX: 661 orig_frame = *frame; 662 xdp->rxq->mem.type = frame->mem_type; 663 if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { 664 trace_xdp_exception(rq->dev, xdp_prog, act); 665 frame = &orig_frame; 666 stats->rx_drops++; 667 goto err_xdp; 668 } 669 stats->xdp_tx++; 670 rcu_read_unlock(); 671 goto xdp_xmit; 672 case XDP_REDIRECT: 673 orig_frame = *frame; 674 xdp->rxq->mem.type = frame->mem_type; 675 if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { 676 frame = &orig_frame; 677 stats->rx_drops++; 678 goto err_xdp; 679 } 680 stats->xdp_redirect++; 681 rcu_read_unlock(); 682 goto xdp_xmit; 683 default: 684 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 685 fallthrough; 686 case XDP_ABORTED: 687 trace_xdp_exception(rq->dev, xdp_prog, act); 688 fallthrough; 689 case XDP_DROP: 690 stats->xdp_drops++; 691 goto err_xdp; 692 } 693 } 694 rcu_read_unlock(); 695 696 return frame; 697 err_xdp: 698 rcu_read_unlock(); 699 xdp_return_frame(frame); 700 xdp_xmit: 701 return NULL; 702 } 703 704 /* frames array contains VETH_XDP_BATCH at most */ 705 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, 706 int n_xdpf, struct veth_xdp_tx_bq *bq, 707 struct veth_stats *stats) 708 { 709 void *skbs[VETH_XDP_BATCH]; 710 int i; 711 712 if (unlikely(!napi_skb_cache_get_bulk(skbs, n_xdpf))) { 713 for (i = 0; i < n_xdpf; i++) 714 xdp_return_frame(frames[i]); 715 stats->rx_drops += n_xdpf; 716 717 return; 718 } 719 720 for (i = 0; i < n_xdpf; i++) { 721 struct sk_buff *skb = skbs[i]; 722 723 skb = __xdp_build_skb_from_frame(frames[i], skb, 724 rq->dev); 725 if (!skb) { 726 xdp_return_frame(frames[i]); 727 stats->rx_drops++; 728 continue; 729 } 730 napi_gro_receive(&rq->xdp_napi, skb); 731 } 732 } 733 734 static void veth_xdp_get(struct xdp_buff *xdp) 735 { 736 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); 737 int i; 738 739 get_page(virt_to_page(xdp->data)); 740 if (likely(!xdp_buff_has_frags(xdp))) 741 return; 742 743 for (i = 0; i < sinfo->nr_frags; i++) 744 __skb_frag_ref(&sinfo->frags[i]); 745 } 746 747 static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, 748 struct xdp_buff *xdp, 749 struct sk_buff **pskb) 750 { 751 struct sk_buff *skb = *pskb; 752 u32 frame_sz; 753 754 if (skb_shared(skb) || skb_head_is_locked(skb) || 755 skb_shinfo(skb)->nr_frags || 756 skb_headroom(skb) < XDP_PACKET_HEADROOM) { 757 if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM)) 758 goto drop; 759 760 skb = *pskb; 761 } 762 763 /* SKB "head" area always have tailroom for skb_shared_info */ 764 frame_sz = skb_end_pointer(skb) - skb->head; 765 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 766 xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); 767 xdp_prepare_buff(xdp, skb->head, skb_headroom(skb), 768 skb_headlen(skb), true); 769 770 if (skb_is_nonlinear(skb)) { 771 skb_shinfo(skb)->xdp_frags_size = skb->data_len; 772 xdp_buff_set_frags_flag(xdp); 773 } else { 774 xdp_buff_clear_frags_flag(xdp); 775 } 776 *pskb = skb; 777 778 return 0; 779 drop: 780 consume_skb(skb); 781 *pskb = NULL; 782 783 return -ENOMEM; 784 } 785 786 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 787 struct sk_buff *skb, 788 struct veth_xdp_tx_bq *bq, 789 struct veth_stats *stats) 790 { 791 void *orig_data, *orig_data_end; 792 struct bpf_prog *xdp_prog; 793 struct veth_xdp_buff vxbuf; 794 struct xdp_buff *xdp = &vxbuf.xdp; 795 u32 act, metalen; 796 int off; 797 798 skb_prepare_for_gro(skb); 799 800 rcu_read_lock(); 801 xdp_prog = rcu_dereference(rq->xdp_prog); 802 if (unlikely(!xdp_prog)) { 803 rcu_read_unlock(); 804 goto out; 805 } 806 807 __skb_push(skb, skb->data - skb_mac_header(skb)); 808 if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb)) 809 goto drop; 810 vxbuf.skb = skb; 811 812 orig_data = xdp->data; 813 orig_data_end = xdp->data_end; 814 815 act = bpf_prog_run_xdp(xdp_prog, xdp); 816 817 switch (act) { 818 case XDP_PASS: 819 break; 820 case XDP_TX: 821 veth_xdp_get(xdp); 822 consume_skb(skb); 823 xdp->rxq->mem = rq->xdp_mem; 824 if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { 825 trace_xdp_exception(rq->dev, xdp_prog, act); 826 stats->rx_drops++; 827 goto err_xdp; 828 } 829 stats->xdp_tx++; 830 rcu_read_unlock(); 831 goto xdp_xmit; 832 case XDP_REDIRECT: 833 veth_xdp_get(xdp); 834 consume_skb(skb); 835 xdp->rxq->mem = rq->xdp_mem; 836 if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { 837 stats->rx_drops++; 838 goto err_xdp; 839 } 840 stats->xdp_redirect++; 841 rcu_read_unlock(); 842 goto xdp_xmit; 843 default: 844 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 845 fallthrough; 846 case XDP_ABORTED: 847 trace_xdp_exception(rq->dev, xdp_prog, act); 848 fallthrough; 849 case XDP_DROP: 850 stats->xdp_drops++; 851 goto xdp_drop; 852 } 853 rcu_read_unlock(); 854 855 /* check if bpf_xdp_adjust_head was used */ 856 off = orig_data - xdp->data; 857 if (off > 0) 858 __skb_push(skb, off); 859 else if (off < 0) 860 __skb_pull(skb, -off); 861 862 skb_reset_mac_header(skb); 863 864 /* check if bpf_xdp_adjust_tail was used */ 865 off = xdp->data_end - orig_data_end; 866 if (off != 0) 867 __skb_put(skb, off); /* positive on grow, negative on shrink */ 868 869 /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers 870 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. 871 */ 872 if (xdp_buff_has_frags(xdp)) 873 skb->data_len = skb_shinfo(skb)->xdp_frags_size; 874 else 875 skb->data_len = 0; 876 877 skb->protocol = eth_type_trans(skb, rq->dev); 878 879 metalen = xdp->data - xdp->data_meta; 880 if (metalen) 881 skb_metadata_set(skb, metalen); 882 out: 883 return skb; 884 drop: 885 stats->rx_drops++; 886 xdp_drop: 887 rcu_read_unlock(); 888 kfree_skb(skb); 889 return NULL; 890 err_xdp: 891 rcu_read_unlock(); 892 xdp_return_buff(xdp); 893 xdp_xmit: 894 return NULL; 895 } 896 897 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 898 struct veth_xdp_tx_bq *bq, 899 struct veth_stats *stats) 900 { 901 int i, done = 0, n_xdpf = 0; 902 void *xdpf[VETH_XDP_BATCH]; 903 904 for (i = 0; i < budget; i++) { 905 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 906 907 if (!ptr) 908 break; 909 910 if (veth_is_xdp_frame(ptr)) { 911 /* ndo_xdp_xmit */ 912 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 913 914 stats->xdp_bytes += xdp_get_frame_len(frame); 915 frame = veth_xdp_rcv_one(rq, frame, bq, stats); 916 if (frame) { 917 /* XDP_PASS */ 918 xdpf[n_xdpf++] = frame; 919 if (n_xdpf == VETH_XDP_BATCH) { 920 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, 921 bq, stats); 922 n_xdpf = 0; 923 } 924 } 925 } else { 926 /* ndo_start_xmit */ 927 struct sk_buff *skb = ptr; 928 929 stats->xdp_bytes += skb->len; 930 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 931 if (skb) { 932 if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) 933 netif_receive_skb(skb); 934 else 935 napi_gro_receive(&rq->xdp_napi, skb); 936 } 937 } 938 done++; 939 } 940 941 if (n_xdpf) 942 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); 943 944 u64_stats_update_begin(&rq->stats.syncp); 945 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 946 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 947 rq->stats.vs.xdp_drops += stats->xdp_drops; 948 rq->stats.vs.rx_drops += stats->rx_drops; 949 rq->stats.vs.xdp_packets += done; 950 u64_stats_update_end(&rq->stats.syncp); 951 952 return done; 953 } 954 955 static int veth_poll(struct napi_struct *napi, int budget) 956 { 957 struct veth_rq *rq = 958 container_of(napi, struct veth_rq, xdp_napi); 959 struct veth_priv *priv = netdev_priv(rq->dev); 960 int queue_idx = rq->xdp_rxq.queue_index; 961 struct netdev_queue *peer_txq; 962 struct veth_stats stats = {}; 963 struct net_device *peer_dev; 964 struct veth_xdp_tx_bq bq; 965 int done; 966 967 bq.count = 0; 968 969 /* NAPI functions as RCU section */ 970 peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); 971 peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL; 972 973 xdp_set_return_frame_no_direct(); 974 done = veth_xdp_rcv(rq, budget, &bq, &stats); 975 976 if (stats.xdp_redirect > 0) 977 xdp_do_flush(); 978 979 if (done < budget && napi_complete_done(napi, done)) { 980 /* Write rx_notify_masked before reading ptr_ring */ 981 smp_store_mb(rq->rx_notify_masked, false); 982 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 983 if (napi_schedule_prep(&rq->xdp_napi)) { 984 WRITE_ONCE(rq->rx_notify_masked, true); 985 __napi_schedule(&rq->xdp_napi); 986 } 987 } 988 } 989 990 if (stats.xdp_tx > 0) 991 veth_xdp_flush(rq, &bq); 992 xdp_clear_return_frame_no_direct(); 993 994 /* Release backpressure per NAPI poll */ 995 smp_rmb(); /* Paired with netif_tx_stop_queue set_bit */ 996 if (peer_txq && netif_tx_queue_stopped(peer_txq)) { 997 txq_trans_cond_update(peer_txq); 998 netif_tx_wake_queue(peer_txq); 999 } 1000 1001 return done; 1002 } 1003 1004 static int veth_create_page_pool(struct veth_rq *rq) 1005 { 1006 struct page_pool_params pp_params = { 1007 .order = 0, 1008 .pool_size = VETH_RING_SIZE, 1009 .nid = NUMA_NO_NODE, 1010 .dev = &rq->dev->dev, 1011 }; 1012 1013 rq->page_pool = page_pool_create(&pp_params); 1014 if (IS_ERR(rq->page_pool)) { 1015 int err = PTR_ERR(rq->page_pool); 1016 1017 rq->page_pool = NULL; 1018 return err; 1019 } 1020 1021 return 0; 1022 } 1023 1024 static int __veth_napi_enable_range(struct net_device *dev, int start, int end) 1025 { 1026 struct veth_priv *priv = netdev_priv(dev); 1027 int err, i; 1028 1029 for (i = start; i < end; i++) { 1030 err = veth_create_page_pool(&priv->rq[i]); 1031 if (err) 1032 goto err_page_pool; 1033 } 1034 1035 for (i = start; i < end; i++) { 1036 struct veth_rq *rq = &priv->rq[i]; 1037 1038 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 1039 if (err) 1040 goto err_xdp_ring; 1041 } 1042 1043 for (i = start; i < end; i++) { 1044 struct veth_rq *rq = &priv->rq[i]; 1045 1046 napi_enable(&rq->xdp_napi); 1047 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1048 } 1049 1050 return 0; 1051 1052 err_xdp_ring: 1053 for (i--; i >= start; i--) 1054 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 1055 i = end; 1056 err_page_pool: 1057 for (i--; i >= start; i--) { 1058 page_pool_destroy(priv->rq[i].page_pool); 1059 priv->rq[i].page_pool = NULL; 1060 } 1061 1062 return err; 1063 } 1064 1065 static int __veth_napi_enable(struct net_device *dev) 1066 { 1067 return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1068 } 1069 1070 static void veth_napi_del_range(struct net_device *dev, int start, int end) 1071 { 1072 struct veth_priv *priv = netdev_priv(dev); 1073 int i; 1074 1075 for (i = start; i < end; i++) { 1076 struct veth_rq *rq = &priv->rq[i]; 1077 1078 rcu_assign_pointer(priv->rq[i].napi, NULL); 1079 napi_disable(&rq->xdp_napi); 1080 __netif_napi_del(&rq->xdp_napi); 1081 } 1082 synchronize_net(); 1083 1084 for (i = start; i < end; i++) { 1085 struct veth_rq *rq = &priv->rq[i]; 1086 1087 rq->rx_notify_masked = false; 1088 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 1089 } 1090 1091 for (i = start; i < end; i++) { 1092 page_pool_destroy(priv->rq[i].page_pool); 1093 priv->rq[i].page_pool = NULL; 1094 } 1095 } 1096 1097 static void veth_napi_del(struct net_device *dev) 1098 { 1099 veth_napi_del_range(dev, 0, dev->real_num_rx_queues); 1100 } 1101 1102 static bool veth_gro_requested(const struct net_device *dev) 1103 { 1104 return !!(dev->wanted_features & NETIF_F_GRO); 1105 } 1106 1107 static int veth_enable_xdp_range(struct net_device *dev, int start, int end, 1108 bool napi_already_on) 1109 { 1110 struct veth_priv *priv = netdev_priv(dev); 1111 int err, i; 1112 1113 for (i = start; i < end; i++) { 1114 struct veth_rq *rq = &priv->rq[i]; 1115 1116 if (!napi_already_on) 1117 netif_napi_add(dev, &rq->xdp_napi, veth_poll); 1118 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 1119 if (err < 0) 1120 goto err_rxq_reg; 1121 1122 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 1123 MEM_TYPE_PAGE_SHARED, 1124 NULL); 1125 if (err < 0) 1126 goto err_reg_mem; 1127 1128 /* Save original mem info as it can be overwritten */ 1129 rq->xdp_mem = rq->xdp_rxq.mem; 1130 } 1131 return 0; 1132 1133 err_reg_mem: 1134 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 1135 err_rxq_reg: 1136 for (i--; i >= start; i--) { 1137 struct veth_rq *rq = &priv->rq[i]; 1138 1139 xdp_rxq_info_unreg(&rq->xdp_rxq); 1140 if (!napi_already_on) 1141 netif_napi_del(&rq->xdp_napi); 1142 } 1143 1144 return err; 1145 } 1146 1147 static void veth_disable_xdp_range(struct net_device *dev, int start, int end, 1148 bool delete_napi) 1149 { 1150 struct veth_priv *priv = netdev_priv(dev); 1151 int i; 1152 1153 for (i = start; i < end; i++) { 1154 struct veth_rq *rq = &priv->rq[i]; 1155 1156 rq->xdp_rxq.mem = rq->xdp_mem; 1157 xdp_rxq_info_unreg(&rq->xdp_rxq); 1158 1159 if (delete_napi) 1160 netif_napi_del(&rq->xdp_napi); 1161 } 1162 } 1163 1164 static int veth_enable_xdp(struct net_device *dev) 1165 { 1166 bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); 1167 struct veth_priv *priv = netdev_priv(dev); 1168 int err, i; 1169 1170 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 1171 err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); 1172 if (err) 1173 return err; 1174 1175 if (!napi_already_on) { 1176 err = __veth_napi_enable(dev); 1177 if (err) { 1178 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); 1179 return err; 1180 } 1181 } 1182 } 1183 1184 for (i = 0; i < dev->real_num_rx_queues; i++) { 1185 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 1186 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1187 } 1188 1189 return 0; 1190 } 1191 1192 static void veth_disable_xdp(struct net_device *dev) 1193 { 1194 struct veth_priv *priv = netdev_priv(dev); 1195 int i; 1196 1197 for (i = 0; i < dev->real_num_rx_queues; i++) 1198 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 1199 1200 if (!netif_running(dev) || !veth_gro_requested(dev)) 1201 veth_napi_del(dev); 1202 1203 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); 1204 } 1205 1206 static int veth_napi_enable_range(struct net_device *dev, int start, int end) 1207 { 1208 struct veth_priv *priv = netdev_priv(dev); 1209 int err, i; 1210 1211 for (i = start; i < end; i++) { 1212 struct veth_rq *rq = &priv->rq[i]; 1213 1214 netif_napi_add(dev, &rq->xdp_napi, veth_poll); 1215 } 1216 1217 err = __veth_napi_enable_range(dev, start, end); 1218 if (err) { 1219 for (i = start; i < end; i++) { 1220 struct veth_rq *rq = &priv->rq[i]; 1221 1222 netif_napi_del(&rq->xdp_napi); 1223 } 1224 return err; 1225 } 1226 return err; 1227 } 1228 1229 static int veth_napi_enable(struct net_device *dev) 1230 { 1231 return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1232 } 1233 1234 static void veth_disable_range_safe(struct net_device *dev, int start, int end) 1235 { 1236 struct veth_priv *priv = netdev_priv(dev); 1237 1238 if (start >= end) 1239 return; 1240 1241 if (priv->_xdp_prog) { 1242 veth_napi_del_range(dev, start, end); 1243 veth_disable_xdp_range(dev, start, end, false); 1244 } else if (veth_gro_requested(dev)) { 1245 veth_napi_del_range(dev, start, end); 1246 } 1247 } 1248 1249 static int veth_enable_range_safe(struct net_device *dev, int start, int end) 1250 { 1251 struct veth_priv *priv = netdev_priv(dev); 1252 int err; 1253 1254 if (start >= end) 1255 return 0; 1256 1257 if (priv->_xdp_prog) { 1258 /* these channels are freshly initialized, napi is not on there even 1259 * when GRO is requeste 1260 */ 1261 err = veth_enable_xdp_range(dev, start, end, false); 1262 if (err) 1263 return err; 1264 1265 err = __veth_napi_enable_range(dev, start, end); 1266 if (err) { 1267 /* on error always delete the newly added napis */ 1268 veth_disable_xdp_range(dev, start, end, true); 1269 return err; 1270 } 1271 } else if (veth_gro_requested(dev)) { 1272 return veth_napi_enable_range(dev, start, end); 1273 } 1274 return 0; 1275 } 1276 1277 static void veth_set_xdp_features(struct net_device *dev) 1278 { 1279 struct veth_priv *priv = netdev_priv(dev); 1280 struct net_device *peer; 1281 1282 peer = rtnl_dereference(priv->peer); 1283 if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { 1284 struct veth_priv *priv_peer = netdev_priv(peer); 1285 xdp_features_t val = NETDEV_XDP_ACT_BASIC | 1286 NETDEV_XDP_ACT_REDIRECT | 1287 NETDEV_XDP_ACT_RX_SG; 1288 1289 if (priv_peer->_xdp_prog || veth_gro_requested(peer)) 1290 val |= NETDEV_XDP_ACT_NDO_XMIT | 1291 NETDEV_XDP_ACT_NDO_XMIT_SG; 1292 xdp_set_features_flag(dev, val); 1293 } else { 1294 xdp_clear_features_flag(dev); 1295 } 1296 } 1297 1298 static int veth_set_channels(struct net_device *dev, 1299 struct ethtool_channels *ch) 1300 { 1301 struct veth_priv *priv = netdev_priv(dev); 1302 unsigned int old_rx_count, new_rx_count; 1303 struct veth_priv *peer_priv; 1304 struct net_device *peer; 1305 int err; 1306 1307 /* sanity check. Upper bounds are already enforced by the caller */ 1308 if (!ch->rx_count || !ch->tx_count) 1309 return -EINVAL; 1310 1311 /* avoid braking XDP, if that is enabled */ 1312 peer = rtnl_dereference(priv->peer); 1313 peer_priv = peer ? netdev_priv(peer) : NULL; 1314 if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) 1315 return -EINVAL; 1316 1317 if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) 1318 return -EINVAL; 1319 1320 old_rx_count = dev->real_num_rx_queues; 1321 new_rx_count = ch->rx_count; 1322 if (netif_running(dev)) { 1323 /* turn device off */ 1324 netif_carrier_off(dev); 1325 if (peer) 1326 netif_carrier_off(peer); 1327 1328 /* try to allocate new resurces, as needed*/ 1329 err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); 1330 if (err) 1331 goto out; 1332 } 1333 1334 err = netif_set_real_num_rx_queues(dev, ch->rx_count); 1335 if (err) 1336 goto revert; 1337 1338 err = netif_set_real_num_tx_queues(dev, ch->tx_count); 1339 if (err) { 1340 int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); 1341 1342 /* this error condition could happen only if rx and tx change 1343 * in opposite directions (e.g. tx nr raises, rx nr decreases) 1344 * and we can't do anything to fully restore the original 1345 * status 1346 */ 1347 if (err2) 1348 pr_warn("Can't restore rx queues config %d -> %d %d", 1349 new_rx_count, old_rx_count, err2); 1350 else 1351 goto revert; 1352 } 1353 1354 out: 1355 if (netif_running(dev)) { 1356 /* note that we need to swap the arguments WRT the enable part 1357 * to identify the range we have to disable 1358 */ 1359 veth_disable_range_safe(dev, new_rx_count, old_rx_count); 1360 netif_carrier_on(dev); 1361 if (peer) 1362 netif_carrier_on(peer); 1363 } 1364 1365 /* update XDP supported features */ 1366 veth_set_xdp_features(dev); 1367 if (peer) 1368 veth_set_xdp_features(peer); 1369 1370 return err; 1371 1372 revert: 1373 new_rx_count = old_rx_count; 1374 old_rx_count = ch->rx_count; 1375 goto out; 1376 } 1377 1378 static int veth_open(struct net_device *dev) 1379 { 1380 struct veth_priv *priv = netdev_priv(dev); 1381 struct net_device *peer = rtnl_dereference(priv->peer); 1382 int err; 1383 1384 if (!peer) 1385 return -ENOTCONN; 1386 1387 if (priv->_xdp_prog) { 1388 err = veth_enable_xdp(dev); 1389 if (err) 1390 return err; 1391 } else if (veth_gro_requested(dev)) { 1392 err = veth_napi_enable(dev); 1393 if (err) 1394 return err; 1395 } 1396 1397 if (peer->flags & IFF_UP) { 1398 netif_carrier_on(dev); 1399 netif_carrier_on(peer); 1400 } 1401 1402 veth_set_xdp_features(dev); 1403 1404 return 0; 1405 } 1406 1407 static int veth_close(struct net_device *dev) 1408 { 1409 struct veth_priv *priv = netdev_priv(dev); 1410 struct net_device *peer = rtnl_dereference(priv->peer); 1411 1412 netif_carrier_off(dev); 1413 if (peer) 1414 netif_carrier_off(peer); 1415 1416 if (priv->_xdp_prog) 1417 veth_disable_xdp(dev); 1418 else if (veth_gro_requested(dev)) 1419 veth_napi_del(dev); 1420 1421 return 0; 1422 } 1423 1424 static int is_valid_veth_mtu(int mtu) 1425 { 1426 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1427 } 1428 1429 static int veth_alloc_queues(struct net_device *dev) 1430 { 1431 struct veth_priv *priv = netdev_priv(dev); 1432 int i; 1433 1434 priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq), 1435 GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); 1436 if (!priv->rq) 1437 return -ENOMEM; 1438 1439 for (i = 0; i < dev->num_rx_queues; i++) { 1440 priv->rq[i].dev = dev; 1441 u64_stats_init(&priv->rq[i].stats.syncp); 1442 } 1443 1444 return 0; 1445 } 1446 1447 static void veth_free_queues(struct net_device *dev) 1448 { 1449 struct veth_priv *priv = netdev_priv(dev); 1450 1451 kvfree(priv->rq); 1452 } 1453 1454 static int veth_dev_init(struct net_device *dev) 1455 { 1456 netdev_lockdep_set_classes(dev); 1457 return veth_alloc_queues(dev); 1458 } 1459 1460 static void veth_dev_free(struct net_device *dev) 1461 { 1462 veth_free_queues(dev); 1463 } 1464 1465 #ifdef CONFIG_NET_POLL_CONTROLLER 1466 static void veth_poll_controller(struct net_device *dev) 1467 { 1468 /* veth only receives frames when its peer sends one 1469 * Since it has nothing to do with disabling irqs, we are guaranteed 1470 * never to have pending data when we poll for it so 1471 * there is nothing to do here. 1472 * 1473 * We need this though so netpoll recognizes us as an interface that 1474 * supports polling, which enables bridge devices in virt setups to 1475 * still use netconsole 1476 */ 1477 } 1478 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1479 1480 static int veth_get_iflink(const struct net_device *dev) 1481 { 1482 struct veth_priv *priv = netdev_priv(dev); 1483 struct net_device *peer; 1484 int iflink; 1485 1486 rcu_read_lock(); 1487 peer = rcu_dereference(priv->peer); 1488 iflink = peer ? READ_ONCE(peer->ifindex) : 0; 1489 rcu_read_unlock(); 1490 1491 return iflink; 1492 } 1493 1494 static netdev_features_t veth_fix_features(struct net_device *dev, 1495 netdev_features_t features) 1496 { 1497 struct veth_priv *priv = netdev_priv(dev); 1498 struct net_device *peer; 1499 1500 peer = rtnl_dereference(priv->peer); 1501 if (peer) { 1502 struct veth_priv *peer_priv = netdev_priv(peer); 1503 1504 if (peer_priv->_xdp_prog) 1505 features &= ~NETIF_F_GSO_SOFTWARE; 1506 } 1507 1508 return features; 1509 } 1510 1511 static int veth_set_features(struct net_device *dev, 1512 netdev_features_t features) 1513 { 1514 netdev_features_t changed = features ^ dev->features; 1515 struct veth_priv *priv = netdev_priv(dev); 1516 struct net_device *peer; 1517 int err; 1518 1519 if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) 1520 return 0; 1521 1522 peer = rtnl_dereference(priv->peer); 1523 if (features & NETIF_F_GRO) { 1524 err = veth_napi_enable(dev); 1525 if (err) 1526 return err; 1527 1528 if (peer) 1529 xdp_features_set_redirect_target(peer, true); 1530 } else { 1531 if (peer) 1532 xdp_features_clear_redirect_target(peer); 1533 veth_napi_del(dev); 1534 } 1535 return 0; 1536 } 1537 1538 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1539 { 1540 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1541 struct net_device *peer; 1542 1543 if (new_hr < 0) 1544 new_hr = 0; 1545 1546 rcu_read_lock(); 1547 peer = rcu_dereference(priv->peer); 1548 if (unlikely(!peer)) 1549 goto out; 1550 1551 peer_priv = netdev_priv(peer); 1552 priv->requested_headroom = new_hr; 1553 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1554 dev->needed_headroom = new_hr; 1555 peer->needed_headroom = new_hr; 1556 1557 out: 1558 rcu_read_unlock(); 1559 } 1560 1561 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1562 struct netlink_ext_ack *extack) 1563 { 1564 struct veth_priv *priv = netdev_priv(dev); 1565 struct bpf_prog *old_prog; 1566 struct net_device *peer; 1567 unsigned int max_mtu; 1568 int err; 1569 1570 old_prog = priv->_xdp_prog; 1571 priv->_xdp_prog = prog; 1572 peer = rtnl_dereference(priv->peer); 1573 1574 if (prog) { 1575 if (!peer) { 1576 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1577 err = -ENOTCONN; 1578 goto err; 1579 } 1580 1581 max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) - 1582 peer->hard_header_len; 1583 /* Allow increasing the max_mtu if the program supports 1584 * XDP fragments. 1585 */ 1586 if (prog->aux->xdp_has_frags) 1587 max_mtu += PAGE_SIZE * MAX_SKB_FRAGS; 1588 1589 if (peer->mtu > max_mtu) { 1590 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1591 err = -ERANGE; 1592 goto err; 1593 } 1594 1595 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1596 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1597 err = -ENOSPC; 1598 goto err; 1599 } 1600 1601 if (dev->flags & IFF_UP) { 1602 err = veth_enable_xdp(dev); 1603 if (err) { 1604 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1605 goto err; 1606 } 1607 } 1608 1609 if (!old_prog) { 1610 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1611 peer->max_mtu = max_mtu; 1612 } 1613 1614 xdp_features_set_redirect_target(peer, true); 1615 } 1616 1617 if (old_prog) { 1618 if (!prog) { 1619 if (peer && !veth_gro_requested(dev)) 1620 xdp_features_clear_redirect_target(peer); 1621 1622 if (dev->flags & IFF_UP) 1623 veth_disable_xdp(dev); 1624 1625 if (peer) { 1626 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1627 peer->max_mtu = ETH_MAX_MTU; 1628 } 1629 } 1630 bpf_prog_put(old_prog); 1631 } 1632 1633 if ((!!old_prog ^ !!prog) && peer) 1634 netdev_update_features(peer); 1635 1636 return 0; 1637 err: 1638 priv->_xdp_prog = old_prog; 1639 1640 return err; 1641 } 1642 1643 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1644 { 1645 switch (xdp->command) { 1646 case XDP_SETUP_PROG: 1647 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1648 default: 1649 return -EINVAL; 1650 } 1651 } 1652 1653 static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) 1654 { 1655 struct veth_xdp_buff *_ctx = (void *)ctx; 1656 1657 if (!_ctx->skb) 1658 return -ENODATA; 1659 1660 *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp; 1661 return 0; 1662 } 1663 1664 static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, 1665 enum xdp_rss_hash_type *rss_type) 1666 { 1667 struct veth_xdp_buff *_ctx = (void *)ctx; 1668 struct sk_buff *skb = _ctx->skb; 1669 1670 if (!skb) 1671 return -ENODATA; 1672 1673 *hash = skb_get_hash(skb); 1674 *rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE; 1675 1676 return 0; 1677 } 1678 1679 static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto, 1680 u16 *vlan_tci) 1681 { 1682 const struct veth_xdp_buff *_ctx = (void *)ctx; 1683 const struct sk_buff *skb = _ctx->skb; 1684 int err; 1685 1686 if (!skb) 1687 return -ENODATA; 1688 1689 err = __vlan_hwaccel_get_tag(skb, vlan_tci); 1690 if (err) 1691 return err; 1692 1693 *vlan_proto = skb->vlan_proto; 1694 return err; 1695 } 1696 1697 static const struct net_device_ops veth_netdev_ops = { 1698 .ndo_init = veth_dev_init, 1699 .ndo_open = veth_open, 1700 .ndo_stop = veth_close, 1701 .ndo_start_xmit = veth_xmit, 1702 .ndo_get_stats64 = veth_get_stats64, 1703 .ndo_set_rx_mode = veth_set_multicast_list, 1704 .ndo_set_mac_address = eth_mac_addr, 1705 #ifdef CONFIG_NET_POLL_CONTROLLER 1706 .ndo_poll_controller = veth_poll_controller, 1707 #endif 1708 .ndo_get_iflink = veth_get_iflink, 1709 .ndo_fix_features = veth_fix_features, 1710 .ndo_set_features = veth_set_features, 1711 .ndo_features_check = passthru_features_check, 1712 .ndo_set_rx_headroom = veth_set_rx_headroom, 1713 .ndo_bpf = veth_xdp, 1714 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1715 .ndo_get_peer_dev = veth_peer_dev, 1716 }; 1717 1718 static const struct xdp_metadata_ops veth_xdp_metadata_ops = { 1719 .xmo_rx_timestamp = veth_xdp_rx_timestamp, 1720 .xmo_rx_hash = veth_xdp_rx_hash, 1721 .xmo_rx_vlan_tag = veth_xdp_rx_vlan_tag, 1722 }; 1723 1724 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1725 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1726 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1727 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1728 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1729 1730 static void veth_setup(struct net_device *dev) 1731 { 1732 ether_setup(dev); 1733 1734 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1735 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1736 dev->priv_flags |= IFF_NO_QUEUE; 1737 dev->priv_flags |= IFF_PHONY_HEADROOM; 1738 dev->priv_flags |= IFF_DISABLE_NETPOLL; 1739 dev->lltx = true; 1740 1741 dev->netdev_ops = &veth_netdev_ops; 1742 dev->xdp_metadata_ops = &veth_xdp_metadata_ops; 1743 dev->ethtool_ops = &veth_ethtool_ops; 1744 dev->features |= VETH_FEATURES; 1745 dev->vlan_features = dev->features & 1746 ~(NETIF_F_HW_VLAN_CTAG_TX | 1747 NETIF_F_HW_VLAN_STAG_TX | 1748 NETIF_F_HW_VLAN_CTAG_RX | 1749 NETIF_F_HW_VLAN_STAG_RX); 1750 dev->needs_free_netdev = true; 1751 dev->priv_destructor = veth_dev_free; 1752 dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; 1753 dev->max_mtu = ETH_MAX_MTU; 1754 1755 dev->hw_features = VETH_FEATURES; 1756 dev->hw_enc_features = VETH_FEATURES; 1757 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1758 netif_set_tso_max_size(dev, GSO_MAX_SIZE); 1759 } 1760 1761 /* 1762 * netlink interface 1763 */ 1764 1765 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1766 struct netlink_ext_ack *extack) 1767 { 1768 if (tb[IFLA_ADDRESS]) { 1769 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1770 return -EINVAL; 1771 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1772 return -EADDRNOTAVAIL; 1773 } 1774 if (tb[IFLA_MTU]) { 1775 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1776 return -EINVAL; 1777 } 1778 return 0; 1779 } 1780 1781 static struct rtnl_link_ops veth_link_ops; 1782 1783 static void veth_disable_gro(struct net_device *dev) 1784 { 1785 dev->features &= ~NETIF_F_GRO; 1786 dev->wanted_features &= ~NETIF_F_GRO; 1787 netdev_update_features(dev); 1788 } 1789 1790 static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) 1791 { 1792 int err; 1793 1794 if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { 1795 err = netif_set_real_num_tx_queues(dev, 1); 1796 if (err) 1797 return err; 1798 } 1799 if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { 1800 err = netif_set_real_num_rx_queues(dev, 1); 1801 if (err) 1802 return err; 1803 } 1804 return 0; 1805 } 1806 1807 static int veth_newlink(struct net_device *dev, 1808 struct rtnl_newlink_params *params, 1809 struct netlink_ext_ack *extack) 1810 { 1811 struct net *peer_net = rtnl_newlink_peer_net(params); 1812 struct nlattr **data = params->data; 1813 struct nlattr **tb = params->tb; 1814 int err; 1815 struct net_device *peer; 1816 struct veth_priv *priv; 1817 char ifname[IFNAMSIZ]; 1818 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1819 unsigned char name_assign_type; 1820 struct ifinfomsg *ifmp; 1821 1822 /* 1823 * create and register peer first 1824 */ 1825 if (data && data[VETH_INFO_PEER]) { 1826 struct nlattr *nla_peer = data[VETH_INFO_PEER]; 1827 1828 ifmp = nla_data(nla_peer); 1829 rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); 1830 tbp = peer_tb; 1831 } else { 1832 ifmp = NULL; 1833 tbp = tb; 1834 } 1835 1836 if (ifmp && tbp[IFLA_IFNAME]) { 1837 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1838 name_assign_type = NET_NAME_USER; 1839 } else { 1840 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1841 name_assign_type = NET_NAME_ENUM; 1842 } 1843 1844 peer = rtnl_create_link(peer_net, ifname, name_assign_type, 1845 &veth_link_ops, tbp, extack); 1846 if (IS_ERR(peer)) 1847 return PTR_ERR(peer); 1848 1849 if (!ifmp || !tbp[IFLA_ADDRESS]) 1850 eth_hw_addr_random(peer); 1851 1852 if (ifmp && (dev->ifindex != 0)) 1853 peer->ifindex = ifmp->ifi_index; 1854 1855 netif_inherit_tso_max(peer, dev); 1856 1857 err = register_netdevice(peer); 1858 if (err < 0) 1859 goto err_register_peer; 1860 1861 /* keep GRO disabled by default to be consistent with the established 1862 * veth behavior 1863 */ 1864 veth_disable_gro(peer); 1865 netif_carrier_off(peer); 1866 1867 err = rtnl_configure_link(peer, ifmp, 0, NULL); 1868 if (err < 0) 1869 goto err_configure_peer; 1870 1871 /* 1872 * register dev last 1873 * 1874 * note, that since we've registered new device the dev's name 1875 * should be re-allocated 1876 */ 1877 1878 if (tb[IFLA_ADDRESS] == NULL) 1879 eth_hw_addr_random(dev); 1880 1881 if (tb[IFLA_IFNAME]) 1882 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1883 else 1884 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1885 1886 err = register_netdevice(dev); 1887 if (err < 0) 1888 goto err_register_dev; 1889 1890 netif_carrier_off(dev); 1891 1892 /* 1893 * tie the deviced together 1894 */ 1895 1896 priv = netdev_priv(dev); 1897 rcu_assign_pointer(priv->peer, peer); 1898 err = veth_init_queues(dev, tb); 1899 if (err) 1900 goto err_queues; 1901 1902 priv = netdev_priv(peer); 1903 rcu_assign_pointer(priv->peer, dev); 1904 err = veth_init_queues(peer, tb); 1905 if (err) 1906 goto err_queues; 1907 1908 veth_disable_gro(dev); 1909 /* update XDP supported features */ 1910 veth_set_xdp_features(dev); 1911 veth_set_xdp_features(peer); 1912 1913 return 0; 1914 1915 err_queues: 1916 unregister_netdevice(dev); 1917 err_register_dev: 1918 /* nothing to do */ 1919 err_configure_peer: 1920 unregister_netdevice(peer); 1921 return err; 1922 1923 err_register_peer: 1924 free_netdev(peer); 1925 return err; 1926 } 1927 1928 static void veth_dellink(struct net_device *dev, struct list_head *head) 1929 { 1930 struct veth_priv *priv; 1931 struct net_device *peer; 1932 1933 priv = netdev_priv(dev); 1934 peer = rtnl_dereference(priv->peer); 1935 1936 /* Note : dellink() is called from default_device_exit_batch(), 1937 * before a rcu_synchronize() point. The devices are guaranteed 1938 * not being freed before one RCU grace period. 1939 */ 1940 RCU_INIT_POINTER(priv->peer, NULL); 1941 unregister_netdevice_queue(dev, head); 1942 1943 if (peer) { 1944 priv = netdev_priv(peer); 1945 RCU_INIT_POINTER(priv->peer, NULL); 1946 unregister_netdevice_queue(peer, head); 1947 } 1948 } 1949 1950 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1951 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1952 }; 1953 1954 static struct net *veth_get_link_net(const struct net_device *dev) 1955 { 1956 struct veth_priv *priv = netdev_priv(dev); 1957 struct net_device *peer = rtnl_dereference(priv->peer); 1958 1959 return peer ? dev_net(peer) : dev_net(dev); 1960 } 1961 1962 static unsigned int veth_get_num_queues(void) 1963 { 1964 /* enforce the same queue limit as rtnl_create_link */ 1965 int queues = num_possible_cpus(); 1966 1967 if (queues > 4096) 1968 queues = 4096; 1969 return queues; 1970 } 1971 1972 static struct rtnl_link_ops veth_link_ops = { 1973 .kind = DRV_NAME, 1974 .priv_size = sizeof(struct veth_priv), 1975 .setup = veth_setup, 1976 .validate = veth_validate, 1977 .newlink = veth_newlink, 1978 .dellink = veth_dellink, 1979 .policy = veth_policy, 1980 .peer_type = VETH_INFO_PEER, 1981 .maxtype = VETH_INFO_MAX, 1982 .get_link_net = veth_get_link_net, 1983 .get_num_tx_queues = veth_get_num_queues, 1984 .get_num_rx_queues = veth_get_num_queues, 1985 }; 1986 1987 /* 1988 * init/fini 1989 */ 1990 1991 static __init int veth_init(void) 1992 { 1993 return rtnl_link_register(&veth_link_ops); 1994 } 1995 1996 static __exit void veth_exit(void) 1997 { 1998 rtnl_link_unregister(&veth_link_ops); 1999 } 2000 2001 module_init(veth_init); 2002 module_exit(veth_exit); 2003 2004 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 2005 MODULE_LICENSE("GPL v2"); 2006 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 2007