// SPDX-License-Identifier: GPL-2.0-only
/*
 *  drivers/net/veth.c
 *
 *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
 *
 * Author: Pavel Emelianov <xemul@openvz.org>
 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
 *
 */

#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>

#include <net/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/xdp.h>
#include <linux/veth.h>
#include <linux/module.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>
#include <net/page_pool.h>

#define DRV_NAME	"veth"
#define DRV_VERSION	"1.0"

#define VETH_XDP_FLAG		BIT(0)
#define VETH_RING_SIZE		256
#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)

#define VETH_XDP_TX_BULK_SIZE	16
#define VETH_XDP_BATCH		16

struct veth_stats {
	u64	rx_drops;
	/* xdp */
	u64	xdp_packets;
	u64	xdp_bytes;
	u64	xdp_redirect;
	u64	xdp_drops;
	u64	xdp_tx;
	u64	xdp_tx_err;
	u64	peer_tq_xdp_xmit;
	u64	peer_tq_xdp_xmit_err;
};

struct veth_rq_stats {
	struct veth_stats	vs;
	struct u64_stats_sync	syncp;
};

struct veth_rq {
	struct napi_struct	xdp_napi;
	struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
	struct net_device	*dev;
	struct bpf_prog __rcu	*xdp_prog;
	struct xdp_mem_info	xdp_mem;
	struct veth_rq_stats	stats;
	bool			rx_notify_masked;
	struct ptr_ring		xdp_ring;
	struct xdp_rxq_info	xdp_rxq;
	struct page_pool	*page_pool;
};

struct veth_priv {
	struct net_device __rcu	*peer;
	atomic64_t		dropped;
	struct bpf_prog		*_xdp_prog;
	struct veth_rq		*rq;
	unsigned int		requested_headroom;
};

struct veth_xdp_tx_bq {
	struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
	unsigned int count;
};

/*
 * ethtool interface
 */

struct veth_q_stat_desc {
	char	desc[ETH_GSTRING_LEN];
	size_t	offset;
};

#define VETH_RQ_STAT(m)	offsetof(struct veth_stats, m)

static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
	{ "xdp_packets",	VETH_RQ_STAT(xdp_packets) },
	{ "xdp_bytes",		VETH_RQ_STAT(xdp_bytes) },
	{ "drops",		VETH_RQ_STAT(rx_drops) },
	{ "xdp_redirect",	VETH_RQ_STAT(xdp_redirect) },
	{ "xdp_drops",		VETH_RQ_STAT(xdp_drops) },
	{ "xdp_tx",		VETH_RQ_STAT(xdp_tx) },
	{ "xdp_tx_errors",	VETH_RQ_STAT(xdp_tx_err) },
};

#define VETH_RQ_STATS_LEN	ARRAY_SIZE(veth_rq_stats_desc)

static const struct veth_q_stat_desc veth_tq_stats_desc[] = {
	{ "xdp_xmit",		VETH_RQ_STAT(peer_tq_xdp_xmit) },
	{ "xdp_xmit_errors",	VETH_RQ_STAT(peer_tq_xdp_xmit_err) },
};

#define VETH_TQ_STATS_LEN	ARRAY_SIZE(veth_tq_stats_desc)

static struct {
	const char string[ETH_GSTRING_LEN];
} ethtool_stats_keys[] = {
	{ "peer_ifindex" },
};

struct veth_xdp_buff {
	struct xdp_buff xdp;
	struct sk_buff *skb;
};

static int veth_get_link_ksettings(struct net_device *dev,
				   struct ethtool_link_ksettings *cmd)
{
	cmd->base.speed		= SPEED_10000;
	cmd->base.duplex	= DUPLEX_FULL;
	cmd->base.port		= PORT_TP;
	cmd->base.autoneg	= AUTONEG_DISABLE;
	return 0;
}

static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
	strscpy(info->driver, DRV_NAME, sizeof(info->driver));
	strscpy(info->version, DRV_VERSION, sizeof(info->version));
}

static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
{
	u8 *p = buf;
	int i, j;

	switch (stringset) {
	case ETH_SS_STATS:
		memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
		p += sizeof(ethtool_stats_keys);
		for (i = 0; i < dev->real_num_rx_queues; i++)
			for (j = 0; j < VETH_RQ_STATS_LEN; j++)
				ethtool_sprintf(&p, "rx_queue_%u_%.18s",
						i, veth_rq_stats_desc[j].desc);

		for (i = 0; i < dev->real_num_tx_queues; i++)
			for (j = 0; j < VETH_TQ_STATS_LEN; j++)
				ethtool_sprintf(&p, "tx_queue_%u_%.18s",
						i, veth_tq_stats_desc[j].desc);

		page_pool_ethtool_stats_get_strings(p);
		break;
	}
}

static int veth_get_sset_count(struct net_device *dev, int sset)
{
	switch (sset) {
	case ETH_SS_STATS:
		return ARRAY_SIZE(ethtool_stats_keys) +
		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
		       VETH_TQ_STATS_LEN * dev->real_num_tx_queues +
		       page_pool_ethtool_stats_get_count();
	default:
		return -EOPNOTSUPP;
	}
}

static void veth_get_ethtool_stats(struct net_device *dev,
				   struct ethtool_stats *stats, u64 *data)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	struct page_pool_stats pp_stats = {};
	int i, j, idx, pp_idx;

	data[0] = peer ? peer->ifindex : 0;
	idx = 1;
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		const struct veth_rq_stats *rq_stats = &priv->rq[i].stats;
		const void *stats_base = (void *)&rq_stats->vs;
		unsigned int start;
		size_t offset;

		do {
			start = u64_stats_fetch_begin(&rq_stats->syncp);
			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
				offset = veth_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry(&rq_stats->syncp, start));
		idx += VETH_RQ_STATS_LEN;
	}
	pp_idx = idx;

	if (!peer)
		goto page_pool_stats;

	rcv_priv = netdev_priv(peer);
	for (i = 0; i < peer->real_num_rx_queues; i++) {
		const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
		const void *base = (void *)&rq_stats->vs;
		unsigned int start, tx_idx = idx;
		size_t offset;

		tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;
		do {
			start = u64_stats_fetch_begin(&rq_stats->syncp);
			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
				offset = veth_tq_stats_desc[j].offset;
				data[tx_idx + j] += *(u64 *)(base + offset);
			}
		} while (u64_stats_fetch_retry(&rq_stats->syncp, start));
		pp_idx = tx_idx + VETH_TQ_STATS_LEN;
	}

page_pool_stats:
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		if (!priv->rq[i].page_pool)
			continue;
		page_pool_get_stats(priv->rq[i].page_pool, &pp_stats);
	}
	page_pool_ethtool_stats_get(&data[pp_idx], &pp_stats);
}

static void veth_get_channels(struct net_device *dev,
			      struct ethtool_channels *channels)
{
	channels->tx_count = dev->real_num_tx_queues;
	channels->rx_count = dev->real_num_rx_queues;
	channels->max_tx = dev->num_tx_queues;
	channels->max_rx = dev->num_rx_queues;
}

static int veth_set_channels(struct net_device *dev,
			     struct ethtool_channels *ch);

static const struct ethtool_ops veth_ethtool_ops = {
	.get_drvinfo		= veth_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_strings		= veth_get_strings,
	.get_sset_count		= veth_get_sset_count,
	.get_ethtool_stats	= veth_get_ethtool_stats,
	.get_link_ksettings	= veth_get_link_ksettings,
	.get_ts_info		= ethtool_op_get_ts_info,
	.get_channels		= veth_get_channels,
	.set_channels		= veth_set_channels,
};

/* general routines */

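/* Entries on a queue's xdp_ring are either sk_buff or xdp_frame pointers;
 * xdp_frame pointers are tagged with VETH_XDP_FLAG in the (otherwise
 * unused) least significant bit, so the consumer side can tell the two
 * apart and release each one with the matching API.
 */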
static bool veth_is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & VETH_XDP_FLAG;
}

static struct xdp_frame *veth_ptr_to_xdp(void *ptr)
{
	return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
}

static void *veth_xdp_to_ptr(struct xdp_frame *xdp)
{
	return (void *)((unsigned long)xdp | VETH_XDP_FLAG);
}

static void veth_ptr_free(void *ptr)
{
	if (veth_is_xdp_frame(ptr))
		xdp_return_frame(veth_ptr_to_xdp(ptr));
	else
		kfree_skb(ptr);
}

static void __veth_xdp_flush(struct veth_rq *rq)
{
	/* Write ptr_ring before reading rx_notify_masked */
	smp_mb();
	if (!READ_ONCE(rq->rx_notify_masked) &&
	    napi_schedule_prep(&rq->xdp_napi)) {
		WRITE_ONCE(rq->rx_notify_masked, true);
		__napi_schedule(&rq->xdp_napi);
	}
}

static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
{
	if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
		dev_kfree_skb_any(skb);
		return NET_RX_DROP;
	}

	return NET_RX_SUCCESS;
}

static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
			    struct veth_rq *rq, bool xdp)
{
	return __dev_forward_skb(dev, skb) ?: xdp ?
		veth_xdp_rx(rq, skb) :
		__netif_rx(skb);
}

/* return true if the specified skb has chances of GRO aggregation
 * Don't strive for accuracy, but try to avoid GRO overhead in the most
 * common scenarios.
 * When XDP is enabled, all traffic is considered eligible, as the xmit
 * device has TSO off.
 * When TSO is enabled on the xmit device, we are likely interested only
 * in UDP aggregation, explicitly check for that if the skb is suspected
 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets -
 * to belong to locally generated UDP traffic.
 */
static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
					 const struct net_device *rcv,
					 const struct sk_buff *skb)
{
	return !(dev->features & NETIF_F_ALL_TSO) ||
		(skb->destructor == sock_wfree &&
		 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
}

static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct veth_rq *rq = NULL;
	struct net_device *rcv;
	int length = skb->len;
	bool use_napi = false;
	int rxq;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) {
		kfree_skb(skb);
		goto drop;
	}

	rcv_priv = netdev_priv(rcv);
	rxq = skb_get_queue_mapping(skb);
	if (rxq < rcv->real_num_rx_queues) {
		rq = &rcv_priv->rq[rxq];

		/* The napi pointer is available when an XDP program is
		 * attached or when GRO is enabled.
		 * Don't bother with napi/GRO if the skb can't be aggregated
		 */
		use_napi = rcu_access_pointer(rq->napi) &&
			   veth_skb_is_eligible_for_gro(dev, rcv, skb);
	}

	skb_tx_timestamp(skb);
	if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
		if (!use_napi)
			dev_lstats_add(dev, length);
	} else {
drop:
		atomic64_inc(&priv->dropped);
	}

	if (use_napi)
		__veth_xdp_flush(rq);

	rcu_read_unlock();

	return NETDEV_TX_OK;
}

static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes)
{
	struct veth_priv *priv = netdev_priv(dev);

	dev_lstats_read(dev, packets, bytes);
	return atomic64_read(&priv->dropped);
}

static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	result->peer_tq_xdp_xmit_err = 0;
	result->xdp_packets = 0;
	result->xdp_tx_err = 0;
	result->xdp_bytes = 0;
	result->rx_drops = 0;
	for (i = 0; i < dev->num_rx_queues; i++) {
		u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err;
		struct veth_rq_stats *stats = &priv->rq[i].stats;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin(&stats->syncp);
			peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err;
			xdp_tx_err = stats->vs.xdp_tx_err;
			packets = stats->vs.xdp_packets;
			bytes = stats->vs.xdp_bytes;
			drops = stats->vs.rx_drops;
		} while (u64_stats_fetch_retry(&stats->syncp, start));
		result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err;
		result->xdp_tx_err += xdp_tx_err;
		result->xdp_packets += packets;
		result->xdp_bytes += bytes;
		result->rx_drops += drops;
	}
}

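/* veth accounts traffic on the device that actually processed it: packets
 * delivered through the peer's NAPI/XDP path are counted in the peer's rx
 * stats, so veth_get_stats64() below folds the peer's rx/xdp counters back
 * into this device's tx totals, and similarly folds peer-side drop counters
 * into this device's totals.
 */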
static void veth_get_stats64(struct net_device *dev,
			     struct rtnl_link_stats64 *tot)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	struct veth_stats rx;
	u64 packets, bytes;

	tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes);
	tot->tx_bytes = bytes;
	tot->tx_packets = packets;

	veth_stats_rx(&rx, dev);
	tot->tx_dropped += rx.xdp_tx_err;
	tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
	tot->rx_bytes = rx.xdp_bytes;
	tot->rx_packets = rx.xdp_packets;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (peer) {
		veth_stats_tx(peer, &packets, &bytes);
		tot->rx_bytes += bytes;
		tot->rx_packets += packets;

		veth_stats_rx(&rx, peer);
		tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
		tot->rx_dropped += rx.xdp_tx_err;
		tot->tx_bytes += rx.xdp_bytes;
		tot->tx_packets += rx.xdp_packets;
	}
	rcu_read_unlock();
}

/* fake multicast ability */
static void veth_set_multicast_list(struct net_device *dev)
{
}

static int veth_select_rxq(struct net_device *dev)
{
	return smp_processor_id() % dev->real_num_rx_queues;
}

static struct net_device *veth_peer_dev(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);

	/* Callers must be under RCU read side. */
	return rcu_dereference(priv->peer);
}

static int veth_xdp_xmit(struct net_device *dev, int n,
			 struct xdp_frame **frames,
			 u32 flags, bool ndo_xmit)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	int i, ret = -ENXIO, nxmit = 0;
	struct net_device *rcv;
	unsigned int max_len;
	struct veth_rq *rq;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* The napi pointer is set if NAPI is enabled, which ensures that
	 * xdp_ring is initialized on receive side and the peer device is up.
	 */
	if (!rcu_access_pointer(rq->napi))
		goto out;

	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;

	spin_lock(&rq->xdp_ring.producer_lock);
	for (i = 0; i < n; i++) {
		struct xdp_frame *frame = frames[i];
		void *ptr = veth_xdp_to_ptr(frame);

		if (unlikely(xdp_get_frame_len(frame) > max_len ||
			     __ptr_ring_produce(&rq->xdp_ring, ptr)))
			break;
		nxmit++;
	}
	spin_unlock(&rq->xdp_ring.producer_lock);

	if (flags & XDP_XMIT_FLUSH)
		__veth_xdp_flush(rq);

	ret = nxmit;
	if (ndo_xmit) {
		u64_stats_update_begin(&rq->stats.syncp);
		rq->stats.vs.peer_tq_xdp_xmit += nxmit;
		rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit;
		u64_stats_update_end(&rq->stats.syncp);
	}

out:
	rcu_read_unlock();

	return ret;
}

static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
			     struct xdp_frame **frames, u32 flags)
{
	int err;

	err = veth_xdp_xmit(dev, n, frames, flags, true);
	if (err < 0) {
		struct veth_priv *priv = netdev_priv(dev);

		atomic64_add(n, &priv->dropped);
	}

	return err;
}

static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
{
	int sent, i, err = 0, drops;

	sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false);
	if (sent < 0) {
		err = sent;
		sent = 0;
	}

	for (i = sent; unlikely(i < bq->count); i++)
		xdp_return_frame(bq->q[i]);

	drops = bq->count - sent;
	trace_xdp_bulk_tx(rq->dev, sent, drops, err);

	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.vs.xdp_tx += sent;
	rq->stats.vs.xdp_tx_err += drops;
	u64_stats_update_end(&rq->stats.syncp);

	bq->count = 0;
}

static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev);
	struct net_device *rcv;
	struct veth_rq *rcv_rq;

	rcu_read_lock();
	veth_xdp_flush_bq(rq, bq);
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* xdp_ring is initialized on receive side? */
	if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog)))
		goto out;

	__veth_xdp_flush(rcv_rq);
out:
	rcu_read_unlock();
}

static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
		       struct veth_xdp_tx_bq *bq)
{
	struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);

	if (unlikely(!frame))
		return -EOVERFLOW;

	if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
		veth_xdp_flush_bq(rq, bq);

	bq->q[bq->count++] = frame;

	return 0;
}

static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
					  struct xdp_frame *frame,
					  struct veth_xdp_tx_bq *bq,
					  struct veth_stats *stats)
{
	struct xdp_frame orig_frame;
	struct bpf_prog *xdp_prog;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (likely(xdp_prog)) {
		struct veth_xdp_buff vxbuf;
		struct xdp_buff *xdp = &vxbuf.xdp;
		u32 act;

		xdp_convert_frame_to_buff(frame, xdp);
		xdp->rxq = &rq->xdp_rxq;
		vxbuf.skb = NULL;

		act = bpf_prog_run_xdp(xdp_prog, xdp);

		switch (act) {
		case XDP_PASS:
			if (xdp_update_frame_from_buff(xdp, frame))
				goto err_xdp;
			break;
		case XDP_TX:
			orig_frame = *frame;
			xdp->rxq->mem = frame->mem;
			if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
				trace_xdp_exception(rq->dev, xdp_prog, act);
				frame = &orig_frame;
				stats->rx_drops++;
				goto err_xdp;
			}
			stats->xdp_tx++;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			orig_frame = *frame;
			xdp->rxq->mem = frame->mem;
			if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
				frame = &orig_frame;
				stats->rx_drops++;
				goto err_xdp;
			}
			stats->xdp_redirect++;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
			fallthrough;
		case XDP_ABORTED:
			trace_xdp_exception(rq->dev, xdp_prog, act);
			fallthrough;
		case XDP_DROP:
			stats->xdp_drops++;
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	return frame;
err_xdp:
	rcu_read_unlock();
	xdp_return_frame(frame);
xdp_xmit:
	return NULL;
}

/* frames array contains VETH_XDP_BATCH at most */
static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
				  int n_xdpf, struct veth_xdp_tx_bq *bq,
				  struct veth_stats *stats)
{
	void *skbs[VETH_XDP_BATCH];
	int i;

	if (xdp_alloc_skb_bulk(skbs, n_xdpf,
			       GFP_ATOMIC | __GFP_ZERO) < 0) {
		for (i = 0; i < n_xdpf; i++)
			xdp_return_frame(frames[i]);
		stats->rx_drops += n_xdpf;

		return;
	}

	for (i = 0; i < n_xdpf; i++) {
		struct sk_buff *skb = skbs[i];

		skb = __xdp_build_skb_from_frame(frames[i], skb,
						 rq->dev);
		if (!skb) {
			xdp_return_frame(frames[i]);
			stats->rx_drops++;
			continue;
		}
		napi_gro_receive(&rq->xdp_napi, skb);
	}
}

static void veth_xdp_get(struct xdp_buff *xdp)
{
	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
	int i;

	get_page(virt_to_page(xdp->data));
	if (likely(!xdp_buff_has_frags(xdp)))
		return;

	for (i = 0; i < sinfo->nr_frags; i++)
		__skb_frag_ref(&sinfo->frags[i]);
}

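/* XDP programs require a private, writable buffer with enough headroom.
 * veth_convert_skb_to_xdp_buff() below therefore copies shared or cloned
 * skbs, skbs carrying fragments and skbs with too little headroom into
 * freshly allocated page_pool pages before handing them to the program.
 */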
static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
					struct xdp_buff *xdp,
					struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	u32 frame_sz;

	if (skb_shared(skb) || skb_head_is_locked(skb) ||
	    skb_shinfo(skb)->nr_frags ||
	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
		u32 size, len, max_head_size, off;
		struct sk_buff *nskb;
		struct page *page;
		int i, head_off;

		/* We need a private copy of the skb and data buffers since
		 * the ebpf program can modify it. We segment the original skb
		 * into order-0 pages without linearizing it.
		 *
		 * Make sure we have enough space for linear and paged area
		 */
		max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
						  VETH_XDP_HEADROOM);
		if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
			goto drop;

		/* Allocate skb head */
		page = page_pool_dev_alloc_pages(rq->page_pool);
		if (!page)
			goto drop;

		nskb = build_skb(page_address(page), PAGE_SIZE);
		if (!nskb) {
			page_pool_put_full_page(rq->page_pool, page, true);
			goto drop;
		}

		skb_reserve(nskb, VETH_XDP_HEADROOM);
		skb_copy_header(nskb, skb);
		skb_mark_for_recycle(nskb);

		size = min_t(u32, skb->len, max_head_size);
		if (skb_copy_bits(skb, 0, nskb->data, size)) {
			consume_skb(nskb);
			goto drop;
		}
		skb_put(nskb, size);

		head_off = skb_headroom(nskb) - skb_headroom(skb);
		skb_headers_offset_update(nskb, head_off);

		/* Allocate paged area of new skb */
		off = size;
		len = skb->len - off;

		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
			page = page_pool_dev_alloc_pages(rq->page_pool);
			if (!page) {
				consume_skb(nskb);
				goto drop;
			}

			size = min_t(u32, len, PAGE_SIZE);
			skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
			if (skb_copy_bits(skb, off, page_address(page),
					  size)) {
				consume_skb(nskb);
				goto drop;
			}

			len -= size;
			off += size;
		}

		consume_skb(skb);
		skb = nskb;
	}

	/* SKB "head" area always has tailroom for skb_shared_info */
	frame_sz = skb_end_pointer(skb) - skb->head;
	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
	xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
			 skb_headlen(skb), true);

	if (skb_is_nonlinear(skb)) {
		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
		xdp_buff_set_frags_flag(xdp);
	} else {
		xdp_buff_clear_frags_flag(xdp);
	}
	*pskb = skb;

	return 0;
drop:
	consume_skb(skb);
	*pskb = NULL;

	return -ENOMEM;
}

static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
					struct sk_buff *skb,
					struct veth_xdp_tx_bq *bq,
					struct veth_stats *stats)
{
	void *orig_data, *orig_data_end;
	struct bpf_prog *xdp_prog;
	struct veth_xdp_buff vxbuf;
	struct xdp_buff *xdp = &vxbuf.xdp;
	u32 act, metalen;
	int off;

	skb_prepare_for_gro(skb);

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (unlikely(!xdp_prog)) {
		rcu_read_unlock();
		goto out;
	}

	__skb_push(skb, skb->data - skb_mac_header(skb));
	if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb))
		goto drop;
	vxbuf.skb = skb;

	orig_data = xdp->data;
	orig_data_end = xdp->data_end;

	act = bpf_prog_run_xdp(xdp_prog, xdp);

	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		veth_xdp_get(xdp);
		consume_skb(skb);
		xdp->rxq->mem = rq->xdp_mem;
		if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
			trace_xdp_exception(rq->dev, xdp_prog, act);
			stats->rx_drops++;
			goto err_xdp;
		}
		stats->xdp_tx++;
		rcu_read_unlock();
		goto xdp_xmit;
	case XDP_REDIRECT:
		veth_xdp_get(xdp);
		consume_skb(skb);
		xdp->rxq->mem = rq->xdp_mem;
		if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
			stats->rx_drops++;
			goto err_xdp;
		}
		stats->xdp_redirect++;
		rcu_read_unlock();
		goto xdp_xmit;
	default:
		bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
		fallthrough;
	case XDP_ABORTED:
		trace_xdp_exception(rq->dev, xdp_prog, act);
		fallthrough;
	case XDP_DROP:
		stats->xdp_drops++;
		goto xdp_drop;
	}
	rcu_read_unlock();

	/* check if bpf_xdp_adjust_head was used */
	off = orig_data - xdp->data;
	if (off > 0)
		__skb_push(skb, off);
	else if (off < 0)
		__skb_pull(skb, -off);

	skb_reset_mac_header(skb);

	/* check if bpf_xdp_adjust_tail was used */
	off = xdp->data_end - orig_data_end;
	if (off != 0)
		__skb_put(skb, off); /* positive on grow, negative on shrink */

	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
	 */
	if (xdp_buff_has_frags(xdp))
		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
	else
		skb->data_len = 0;

	skb->protocol = eth_type_trans(skb, rq->dev);

	metalen = xdp->data - xdp->data_meta;
	if (metalen)
		skb_metadata_set(skb, metalen);
out:
	return skb;
drop:
	stats->rx_drops++;
xdp_drop:
	rcu_read_unlock();
	kfree_skb(skb);
	return NULL;
err_xdp:
	rcu_read_unlock();
	xdp_return_buff(xdp);
xdp_xmit:
	return NULL;
}

static int veth_xdp_rcv(struct veth_rq *rq, int budget,
			struct veth_xdp_tx_bq *bq,
			struct veth_stats *stats)
{
	int i, done = 0, n_xdpf = 0;
	void *xdpf[VETH_XDP_BATCH];

	for (i = 0; i < budget; i++) {
		void *ptr = __ptr_ring_consume(&rq->xdp_ring);

		if (!ptr)
			break;

		if (veth_is_xdp_frame(ptr)) {
			/* ndo_xdp_xmit */
			struct xdp_frame *frame = veth_ptr_to_xdp(ptr);

			stats->xdp_bytes += xdp_get_frame_len(frame);
			frame = veth_xdp_rcv_one(rq, frame, bq, stats);
			if (frame) {
				/* XDP_PASS */
				xdpf[n_xdpf++] = frame;
				if (n_xdpf == VETH_XDP_BATCH) {
					veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf,
							      bq, stats);
					n_xdpf = 0;
				}
			}
		} else {
			/* ndo_start_xmit */
			struct sk_buff *skb = ptr;

			stats->xdp_bytes += skb->len;
			skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
			if (skb) {
				if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
					netif_receive_skb(skb);
				else
					napi_gro_receive(&rq->xdp_napi, skb);
			}
		}
		done++;
	}

	if (n_xdpf)
		veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats);

	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.vs.xdp_redirect += stats->xdp_redirect;
	rq->stats.vs.xdp_bytes += stats->xdp_bytes;
	rq->stats.vs.xdp_drops += stats->xdp_drops;
	rq->stats.vs.rx_drops += stats->rx_drops;
	rq->stats.vs.xdp_packets += done;
	u64_stats_update_end(&rq->stats.syncp);

	return done;
}

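/* NAPI scheduling handshake with __veth_xdp_flush(): the producer only
 * schedules NAPI while rx_notify_masked is false, and veth_poll() clears
 * the flag before re-checking the ring, so packets queued concurrently
 * with napi_complete_done() still get a poll scheduled.
 */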
static int veth_poll(struct napi_struct *napi, int budget)
{
	struct veth_rq *rq =
		container_of(napi, struct veth_rq, xdp_napi);
	struct veth_stats stats = {};
	struct veth_xdp_tx_bq bq;
	int done;

	bq.count = 0;

	xdp_set_return_frame_no_direct();
	done = veth_xdp_rcv(rq, budget, &bq, &stats);

	if (stats.xdp_redirect > 0)
		xdp_do_flush();

	if (done < budget && napi_complete_done(napi, done)) {
		/* Write rx_notify_masked before reading ptr_ring */
		smp_store_mb(rq->rx_notify_masked, false);
		if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
			if (napi_schedule_prep(&rq->xdp_napi)) {
				WRITE_ONCE(rq->rx_notify_masked, true);
				__napi_schedule(&rq->xdp_napi);
			}
		}
	}

	if (stats.xdp_tx > 0)
		veth_xdp_flush(rq, &bq);
	xdp_clear_return_frame_no_direct();

	return done;
}

static int veth_create_page_pool(struct veth_rq *rq)
{
	struct page_pool_params pp_params = {
		.order = 0,
		.pool_size = VETH_RING_SIZE,
		.nid = NUMA_NO_NODE,
		.dev = &rq->dev->dev,
	};

	rq->page_pool = page_pool_create(&pp_params);
	if (IS_ERR(rq->page_pool)) {
		int err = PTR_ERR(rq->page_pool);

		rq->page_pool = NULL;
		return err;
	}

	return 0;
}

static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = start; i < end; i++) {
		err = veth_create_page_pool(&priv->rq[i]);
		if (err)
			goto err_page_pool;
	}

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
		if (err)
			goto err_xdp_ring;
	}

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		napi_enable(&rq->xdp_napi);
		rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
	}

	return 0;

err_xdp_ring:
	for (i--; i >= start; i--)
		ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);
err_page_pool:
	for (i = start; i < end; i++) {
		page_pool_destroy(priv->rq[i].page_pool);
		priv->rq[i].page_pool = NULL;
	}

	return err;
}

static int __veth_napi_enable(struct net_device *dev)
{
	return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
}

static void veth_napi_del_range(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rcu_assign_pointer(priv->rq[i].napi, NULL);
		napi_disable(&rq->xdp_napi);
		__netif_napi_del(&rq->xdp_napi);
	}
	synchronize_net();

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rq->rx_notify_masked = false;
		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
	}

	for (i = start; i < end; i++) {
		page_pool_destroy(priv->rq[i].page_pool);
		priv->rq[i].page_pool = NULL;
	}
}

static void veth_napi_del(struct net_device *dev)
{
	veth_napi_del_range(dev, 0, dev->real_num_rx_queues);
}

static bool veth_gro_requested(const struct net_device *dev)
{
	return !!(dev->wanted_features & NETIF_F_GRO);
}

static int veth_enable_xdp_range(struct net_device *dev, int start, int end,
				 bool napi_already_on)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		if (!napi_already_on)
			netif_napi_add(dev, &rq->xdp_napi, veth_poll);
		err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
		if (err < 0)
			goto err_rxq_reg;

		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
						 MEM_TYPE_PAGE_SHARED,
						 NULL);
		if (err < 0)
			goto err_reg_mem;

		/* Save original mem info as it can be overwritten */
		rq->xdp_mem = rq->xdp_rxq.mem;
	}
	return 0;

err_reg_mem:
	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
	for (i--; i >= start; i--) {
		struct veth_rq *rq = &priv->rq[i];

		xdp_rxq_info_unreg(&rq->xdp_rxq);
		if (!napi_already_on)
			netif_napi_del(&rq->xdp_napi);
	}

	return err;
}

static void veth_disable_xdp_range(struct net_device *dev, int start, int end,
				   bool delete_napi)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rq->xdp_rxq.mem = rq->xdp_mem;
		xdp_rxq_info_unreg(&rq->xdp_rxq);

		if (delete_napi)
			netif_napi_del(&rq->xdp_napi);
	}
}

static int veth_enable_xdp(struct net_device *dev)
{
	bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
		err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on);
		if (err)
			return err;

		if (!napi_already_on) {
			err = __veth_napi_enable(dev);
			if (err) {
				veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true);
				return err;
			}

			if (!veth_gro_requested(dev)) {
				/* user-space did not require GRO, but adding XDP
				 * is supposed to get GRO working
				 */
				dev->features |= NETIF_F_GRO;
				netdev_features_change(dev);
			}
		}
	}

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
		rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
	}

	return 0;
}

static void veth_disable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);

	if (!netif_running(dev) || !veth_gro_requested(dev)) {
		veth_napi_del(dev);

		/* if user-space did not require GRO, since adding XDP
		 * enabled it, clear it now
		 */
		if (!veth_gro_requested(dev) && netif_running(dev)) {
			dev->features &= ~NETIF_F_GRO;
			netdev_features_change(dev);
		}
	}

	veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
}

static int veth_napi_enable_range(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_add(dev, &rq->xdp_napi, veth_poll);
	}

	err = __veth_napi_enable_range(dev, start, end);
	if (err) {
		for (i = start; i < end; i++) {
			struct veth_rq *rq = &priv->rq[i];

			netif_napi_del(&rq->xdp_napi);
		}
		return err;
	}
	return err;
}

static int veth_napi_enable(struct net_device *dev)
{
	return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
}

static void veth_disable_range_safe(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);

	if (start >= end)
		return;

	if (priv->_xdp_prog) {
		veth_napi_del_range(dev, start, end);
		veth_disable_xdp_range(dev, start, end, false);
	} else if (veth_gro_requested(dev)) {
		veth_napi_del_range(dev, start, end);
	}
}

static int veth_enable_range_safe(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err;

	if (start >= end)
		return 0;

	if (priv->_xdp_prog) {
		/* these channels are freshly initialized, napi is not on there even
		 * when GRO is requested
		 */
		err = veth_enable_xdp_range(dev, start, end, false);
		if (err)
			return err;

		err = __veth_napi_enable_range(dev, start, end);
		if (err) {
			/* on error always delete the newly added napis */
			veth_disable_xdp_range(dev, start, end, true);
			return err;
		}
	} else if (veth_gro_requested(dev)) {
		return veth_napi_enable_range(dev, start, end);
	}
	return 0;
}

static void veth_set_xdp_features(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	peer = rtnl_dereference(priv->peer);
	if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) {
		struct veth_priv *priv_peer = netdev_priv(peer);
		xdp_features_t val = NETDEV_XDP_ACT_BASIC |
				     NETDEV_XDP_ACT_REDIRECT |
				     NETDEV_XDP_ACT_RX_SG;

		if (priv_peer->_xdp_prog || veth_gro_requested(peer))
			val |= NETDEV_XDP_ACT_NDO_XMIT |
			       NETDEV_XDP_ACT_NDO_XMIT_SG;
		xdp_set_features_flag(dev, val);
	} else {
		xdp_clear_features_flag(dev);
	}
}

static int veth_set_channels(struct net_device *dev,
			     struct ethtool_channels *ch)
{
	struct veth_priv *priv = netdev_priv(dev);
	unsigned int old_rx_count, new_rx_count;
	struct veth_priv *peer_priv;
	struct net_device *peer;
	int err;

	/* sanity check. Upper bounds are already enforced by the caller */
	if (!ch->rx_count || !ch->tx_count)
		return -EINVAL;

	/* avoid breaking XDP, if that is enabled */
	peer = rtnl_dereference(priv->peer);
	peer_priv = peer ? netdev_priv(peer) : NULL;
	if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues)
		return -EINVAL;

	if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues)
		return -EINVAL;

	old_rx_count = dev->real_num_rx_queues;
	new_rx_count = ch->rx_count;
	if (netif_running(dev)) {
		/* turn device off */
		netif_carrier_off(dev);
		if (peer)
			netif_carrier_off(peer);

		/* try to allocate new resources, as needed */
		err = veth_enable_range_safe(dev, old_rx_count, new_rx_count);
		if (err)
			goto out;
	}

	err = netif_set_real_num_rx_queues(dev, ch->rx_count);
	if (err)
		goto revert;

	err = netif_set_real_num_tx_queues(dev, ch->tx_count);
	if (err) {
		int err2 = netif_set_real_num_rx_queues(dev, old_rx_count);

		/* this error condition could happen only if rx and tx change
		 * in opposite directions (e.g. tx count increases while rx
		 * count decreases) and we can't do anything to fully restore
		 * the original status
		 */
		if (err2)
			pr_warn("Can't restore rx queues config %d -> %d %d",
				new_rx_count, old_rx_count, err2);
		else
			goto revert;
	}

out:
	if (netif_running(dev)) {
		/* note that we need to swap the arguments WRT the enable part
		 * to identify the range we have to disable
		 */
		veth_disable_range_safe(dev, new_rx_count, old_rx_count);
		netif_carrier_on(dev);
		if (peer)
			netif_carrier_on(peer);
	}

	/* update XDP supported features */
	veth_set_xdp_features(dev);
	if (peer)
		veth_set_xdp_features(peer);

	return err;

revert:
	new_rx_count = old_rx_count;
	old_rx_count = ch->rx_count;
	goto out;
}

static int veth_open(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int err;

	if (!peer)
		return -ENOTCONN;

	if (priv->_xdp_prog) {
		err = veth_enable_xdp(dev);
		if (err)
			return err;
	} else if (veth_gro_requested(dev)) {
		err = veth_napi_enable(dev);
		if (err)
			return err;
	}

	if (peer->flags & IFF_UP) {
		netif_carrier_on(dev);
		netif_carrier_on(peer);
	}

	return 0;
}

static int veth_close(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	netif_carrier_off(dev);
	if (peer)
		netif_carrier_off(peer);

	if (priv->_xdp_prog)
		veth_disable_xdp(dev);
	else if (veth_gro_requested(dev))
		veth_napi_del(dev);

	return 0;
}

static int is_valid_veth_mtu(int mtu)
{
	return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
}

static int veth_alloc_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT);
	if (!priv->rq)
		return -ENOMEM;

	for (i = 0; i < dev->num_rx_queues; i++) {
		priv->rq[i].dev = dev;
		u64_stats_init(&priv->rq[i].stats.syncp);
	}

	return 0;
}

static void veth_free_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);

	kfree(priv->rq);
}

static int veth_dev_init(struct net_device *dev)
{
	int err;

	dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
	if (!dev->lstats)
		return -ENOMEM;

	err = veth_alloc_queues(dev);
	if (err) {
		free_percpu(dev->lstats);
		return err;
	}

	return 0;
}

static void veth_dev_free(struct net_device *dev)
{
	veth_free_queues(dev);
	free_percpu(dev->lstats);
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void veth_poll_controller(struct net_device *dev)
{
	/* veth only receives frames when its peer sends one
	 * Since it has nothing to do with disabling irqs, we are guaranteed
	 * never to have pending data when we poll for it so
	 * there is nothing to do here.
	 *
	 * We need this though so netpoll recognizes us as an interface that
	 * supports polling, which enables bridge devices in virt setups to
	 * still use netconsole
	 */
}
#endif	/* CONFIG_NET_POLL_CONTROLLER */

static int veth_get_iflink(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	int iflink;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	iflink = peer ? peer->ifindex : 0;
	rcu_read_unlock();

	return iflink;
}

static netdev_features_t veth_fix_features(struct net_device *dev,
					   netdev_features_t features)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	peer = rtnl_dereference(priv->peer);
	if (peer) {
		struct veth_priv *peer_priv = netdev_priv(peer);

		if (peer_priv->_xdp_prog)
			features &= ~NETIF_F_GSO_SOFTWARE;
	}
	if (priv->_xdp_prog)
		features |= NETIF_F_GRO;

	return features;
}

static int veth_set_features(struct net_device *dev,
			     netdev_features_t features)
{
	netdev_features_t changed = features ^ dev->features;
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	int err;

	if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog)
		return 0;

	peer = rtnl_dereference(priv->peer);
	if (features & NETIF_F_GRO) {
		err = veth_napi_enable(dev);
		if (err)
			return err;

		if (peer)
			xdp_features_set_redirect_target(peer, true);
	} else {
		if (peer)
			xdp_features_clear_redirect_target(peer);
		veth_napi_del(dev);
	}
	return 0;
}

static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
	struct net_device *peer;

	if (new_hr < 0)
		new_hr = 0;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (unlikely(!peer))
		goto out;

	peer_priv = netdev_priv(peer);
	priv->requested_headroom = new_hr;
	new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
	dev->needed_headroom = new_hr;
	peer->needed_headroom = new_hr;

out:
	rcu_read_unlock();
}

static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			struct netlink_ext_ack *extack)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct bpf_prog *old_prog;
	struct net_device *peer;
	unsigned int max_mtu;
	int err;

	old_prog = priv->_xdp_prog;
	priv->_xdp_prog = prog;
	peer = rtnl_dereference(priv->peer);

	if (prog) {
		if (!peer) {
			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
			err = -ENOTCONN;
			goto err;
		}

		max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) -
			  peer->hard_header_len;
		/* Allow increasing the max_mtu if the program supports
		 * XDP fragments.
		 */
		if (prog->aux->xdp_has_frags)
			max_mtu += PAGE_SIZE * MAX_SKB_FRAGS;

		if (peer->mtu > max_mtu) {
			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
			err = -ERANGE;
			goto err;
		}

		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
			NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
			err = -ENOSPC;
			goto err;
		}

		if (dev->flags & IFF_UP) {
			err = veth_enable_xdp(dev);
			if (err) {
				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
				goto err;
			}
		}

		if (!old_prog) {
			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
			peer->max_mtu = max_mtu;
		}

		xdp_features_set_redirect_target(peer, true);
	}

	if (old_prog) {
		if (!prog) {
			if (peer && !veth_gro_requested(dev))
				xdp_features_clear_redirect_target(peer);

			if (dev->flags & IFF_UP)
				veth_disable_xdp(dev);

			if (peer) {
				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
				peer->max_mtu = ETH_MAX_MTU;
			}
		}
		bpf_prog_put(old_prog);
	}

	if ((!!old_prog ^ !!prog) && peer)
		netdev_update_features(peer);

	return 0;
err:
	priv->_xdp_prog = old_prog;

	return err;
}

static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return veth_xdp_set(dev, xdp->prog, xdp->extack);
	default:
		return -EINVAL;
	}
}

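/* XDP rx metadata hints: _ctx->skb is only set on the local skb
 * (ndo_start_xmit) path; frames that arrived via ndo_xdp_xmit have no
 * backing skb, so these helpers return -ENODATA for them.
 */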
static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
{
	struct veth_xdp_buff *_ctx = (void *)ctx;

	if (!_ctx->skb)
		return -ENODATA;

	*timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;
	return 0;
}

static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
			    enum xdp_rss_hash_type *rss_type)
{
	struct veth_xdp_buff *_ctx = (void *)ctx;
	struct sk_buff *skb = _ctx->skb;

	if (!skb)
		return -ENODATA;

	*hash = skb_get_hash(skb);
	*rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE;

	return 0;
}

static const struct net_device_ops veth_netdev_ops = {
	.ndo_init		= veth_dev_init,
	.ndo_open		= veth_open,
	.ndo_stop		= veth_close,
	.ndo_start_xmit		= veth_xmit,
	.ndo_get_stats64	= veth_get_stats64,
	.ndo_set_rx_mode	= veth_set_multicast_list,
	.ndo_set_mac_address	= eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= veth_poll_controller,
#endif
	.ndo_get_iflink		= veth_get_iflink,
	.ndo_fix_features	= veth_fix_features,
	.ndo_set_features	= veth_set_features,
	.ndo_features_check	= passthru_features_check,
	.ndo_set_rx_headroom	= veth_set_rx_headroom,
	.ndo_bpf		= veth_xdp,
	.ndo_xdp_xmit		= veth_ndo_xdp_xmit,
	.ndo_get_peer_dev	= veth_peer_dev,
};

static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
	.xmo_rx_timestamp	= veth_xdp_rx_timestamp,
	.xmo_rx_hash		= veth_xdp_rx_hash,
};

#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
		       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
		       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX)

static void veth_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_PHONY_HEADROOM;

	dev->netdev_ops = &veth_netdev_ops;
	dev->xdp_metadata_ops = &veth_xdp_metadata_ops;
	dev->ethtool_ops = &veth_ethtool_ops;
	dev->features |= NETIF_F_LLTX;
	dev->features |= VETH_FEATURES;
	dev->vlan_features = dev->features &
			     ~(NETIF_F_HW_VLAN_CTAG_TX |
			       NETIF_F_HW_VLAN_STAG_TX |
			       NETIF_F_HW_VLAN_CTAG_RX |
			       NETIF_F_HW_VLAN_STAG_RX);
	dev->needs_free_netdev = true;
	dev->priv_destructor = veth_dev_free;
	dev->max_mtu = ETH_MAX_MTU;

	dev->hw_features = VETH_FEATURES;
	dev->hw_enc_features = VETH_FEATURES;
	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
}

/*
 * netlink interface
 */

static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}
	if (tb[IFLA_MTU]) {
		if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
			return -EINVAL;
	}
	return 0;
}

static struct rtnl_link_ops veth_link_ops;

static void veth_disable_gro(struct net_device *dev)
{
	dev->features &= ~NETIF_F_GRO;
	dev->wanted_features &= ~NETIF_F_GRO;
	netdev_update_features(dev);
}

static int veth_init_queues(struct net_device *dev, struct nlattr *tb[])
{
	int err;

	if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) {
		err = netif_set_real_num_tx_queues(dev, 1);
		if (err)
			return err;
	}
	if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) {
		err = netif_set_real_num_rx_queues(dev, 1);
		if (err)
			return err;
	}
	return 0;
}

static int veth_newlink(struct net *src_net, struct net_device *dev,
			struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	int err;
	struct net_device *peer;
	struct veth_priv *priv;
	char ifname[IFNAMSIZ];
	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
	unsigned char name_assign_type;
	struct ifinfomsg *ifmp;
	struct net *net;

	/*
	 * create and register peer first
	 */
	if (data != NULL && data[VETH_INFO_PEER] != NULL) {
		struct nlattr *nla_peer;

		nla_peer = data[VETH_INFO_PEER];
		ifmp = nla_data(nla_peer);
		err = rtnl_nla_parse_ifla(peer_tb,
					  nla_data(nla_peer) + sizeof(struct ifinfomsg),
					  nla_len(nla_peer) - sizeof(struct ifinfomsg),
					  NULL);
		if (err < 0)
			return err;

		err = veth_validate(peer_tb, NULL, extack);
		if (err < 0)
			return err;

		tbp = peer_tb;
	} else {
		ifmp = NULL;
		tbp = tb;
	}

	if (ifmp && tbp[IFLA_IFNAME]) {
		nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
		name_assign_type = NET_NAME_USER;
	} else {
		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
		name_assign_type = NET_NAME_ENUM;
	}

	net = rtnl_link_get_net(src_net, tbp);
	if (IS_ERR(net))
		return PTR_ERR(net);

	peer = rtnl_create_link(net, ifname, name_assign_type,
				&veth_link_ops, tbp, extack);
	if (IS_ERR(peer)) {
		put_net(net);
		return PTR_ERR(peer);
	}

	if (!ifmp || !tbp[IFLA_ADDRESS])
		eth_hw_addr_random(peer);

	if (ifmp && (dev->ifindex != 0))
		peer->ifindex = ifmp->ifi_index;

	netif_inherit_tso_max(peer, dev);

	err = register_netdevice(peer);
	put_net(net);
	net = NULL;
	if (err < 0)
		goto err_register_peer;

	/* keep GRO disabled by default to be consistent with the established
	 * veth behavior
	 */
	veth_disable_gro(peer);
	netif_carrier_off(peer);

	err = rtnl_configure_link(peer, ifmp, 0, NULL);
	if (err < 0)
		goto err_configure_peer;

	/*
	 * register dev last
	 *
	 * note that, since we've registered a new device, the dev's name
	 * should be re-allocated
	 */

	if (tb[IFLA_ADDRESS] == NULL)
		eth_hw_addr_random(dev);

	if (tb[IFLA_IFNAME])
		nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
	else
		snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");

	err = register_netdevice(dev);
	if (err < 0)
		goto err_register_dev;

	netif_carrier_off(dev);

	/*
	 * tie the devices together
	 */

	priv = netdev_priv(dev);
	rcu_assign_pointer(priv->peer, peer);
	err = veth_init_queues(dev, tb);
	if (err)
		goto err_queues;

	priv = netdev_priv(peer);
	rcu_assign_pointer(priv->peer, dev);
	err = veth_init_queues(peer, tb);
	if (err)
		goto err_queues;

	veth_disable_gro(dev);
	/* update XDP supported features */
	veth_set_xdp_features(dev);
	veth_set_xdp_features(peer);

	return 0;

err_queues:
	unregister_netdevice(dev);
err_register_dev:
	/* nothing to do */
err_configure_peer:
	unregister_netdevice(peer);
	return err;

err_register_peer:
	free_netdev(peer);
	return err;
}

static void veth_dellink(struct net_device *dev, struct list_head *head)
{
	struct veth_priv *priv;
	struct net_device *peer;

	priv = netdev_priv(dev);
	peer = rtnl_dereference(priv->peer);

	/* Note : dellink() is called from default_device_exit_batch(),
	 * before a rcu_synchronize() point. The devices are guaranteed
	 * not to be freed before one RCU grace period.
	 */
	RCU_INIT_POINTER(priv->peer, NULL);
	unregister_netdevice_queue(dev, head);

	if (peer) {
		priv = netdev_priv(peer);
		RCU_INIT_POINTER(priv->peer, NULL);
		unregister_netdevice_queue(peer, head);
	}
}

static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
	[VETH_INFO_PEER]	= { .len = sizeof(struct ifinfomsg) },
};

static struct net *veth_get_link_net(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	return peer ? dev_net(peer) : dev_net(dev);
}

static unsigned int veth_get_num_queues(void)
{
	/* enforce the same queue limit as rtnl_create_link */
	int queues = num_possible_cpus();

	if (queues > 4096)
		queues = 4096;
	return queues;
}

static struct rtnl_link_ops veth_link_ops = {
	.kind			= DRV_NAME,
	.priv_size		= sizeof(struct veth_priv),
	.setup			= veth_setup,
	.validate		= veth_validate,
	.newlink		= veth_newlink,
	.dellink		= veth_dellink,
	.policy			= veth_policy,
	.maxtype		= VETH_INFO_MAX,
	.get_link_net		= veth_get_link_net,
	.get_num_tx_queues	= veth_get_num_queues,
	.get_num_rx_queues	= veth_get_num_queues,
};

/*
 * init/fini
 */

static __init int veth_init(void)
{
	return rtnl_link_register(&veth_link_ops);
}

static __exit void veth_exit(void)
{
	rtnl_link_unregister(&veth_link_ops);
}

module_init(veth_init);
module_exit(veth_exit);

MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);