1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/xfrm.h> 21 #include <net/xdp.h> 22 #include <linux/veth.h> 23 #include <linux/module.h> 24 #include <linux/bpf.h> 25 #include <linux/filter.h> 26 #include <linux/ptr_ring.h> 27 #include <linux/bpf_trace.h> 28 #include <linux/net_tstamp.h> 29 30 #define DRV_NAME "veth" 31 #define DRV_VERSION "1.0" 32 33 #define VETH_XDP_FLAG BIT(0) 34 #define VETH_RING_SIZE 256 35 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 36 37 #define VETH_XDP_TX_BULK_SIZE 16 38 #define VETH_XDP_BATCH 16 39 40 struct veth_stats { 41 u64 rx_drops; 42 /* xdp */ 43 u64 xdp_packets; 44 u64 xdp_bytes; 45 u64 xdp_redirect; 46 u64 xdp_drops; 47 u64 xdp_tx; 48 u64 xdp_tx_err; 49 u64 peer_tq_xdp_xmit; 50 u64 peer_tq_xdp_xmit_err; 51 }; 52 53 struct veth_rq_stats { 54 struct veth_stats vs; 55 struct u64_stats_sync syncp; 56 }; 57 58 struct veth_rq { 59 struct napi_struct xdp_napi; 60 struct net_device *dev; 61 struct bpf_prog __rcu *xdp_prog; 62 struct xdp_mem_info xdp_mem; 63 struct veth_rq_stats stats; 64 bool rx_notify_masked; 65 struct ptr_ring xdp_ring; 66 struct xdp_rxq_info xdp_rxq; 67 }; 68 69 struct veth_priv { 70 struct net_device __rcu *peer; 71 atomic64_t dropped; 72 struct bpf_prog *_xdp_prog; 73 struct veth_rq *rq; 74 unsigned int requested_headroom; 75 }; 76 77 struct veth_xdp_tx_bq { 78 struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; 79 unsigned int count; 80 }; 81 82 /* 83 * ethtool interface 84 */ 85 86 struct veth_q_stat_desc { 87 char desc[ETH_GSTRING_LEN]; 88 size_t offset; 89 }; 90 91 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 92 93 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 94 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 95 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 96 { "drops", VETH_RQ_STAT(rx_drops) }, 97 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 98 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 99 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 100 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 101 }; 102 103 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 104 105 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 106 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 107 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 108 }; 109 110 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 111 112 static struct { 113 const char string[ETH_GSTRING_LEN]; 114 } ethtool_stats_keys[] = { 115 { "peer_ifindex" }, 116 }; 117 118 static int veth_get_link_ksettings(struct net_device *dev, 119 struct ethtool_link_ksettings *cmd) 120 { 121 cmd->base.speed = SPEED_10000; 122 cmd->base.duplex = DUPLEX_FULL; 123 cmd->base.port = PORT_TP; 124 cmd->base.autoneg = AUTONEG_DISABLE; 125 return 0; 126 } 127 128 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 129 { 130 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 131 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 132 } 133 134 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 135 { 136 char *p = (char *)buf; 137 int i, j; 138 139 switch(stringset) { 140 case ETH_SS_STATS: 141 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 142 p += sizeof(ethtool_stats_keys); 143 for (i = 0; i < dev->real_num_rx_queues; i++) { 144 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 145 snprintf(p, ETH_GSTRING_LEN, 146 "rx_queue_%u_%.18s", 147 i, veth_rq_stats_desc[j].desc); 148 p += ETH_GSTRING_LEN; 149 } 150 } 151 for (i = 0; i < dev->real_num_tx_queues; i++) { 152 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 153 snprintf(p, ETH_GSTRING_LEN, 154 "tx_queue_%u_%.18s", 155 i, veth_tq_stats_desc[j].desc); 156 p += ETH_GSTRING_LEN; 157 } 158 } 159 break; 160 } 161 } 162 163 static int veth_get_sset_count(struct net_device *dev, int sset) 164 { 165 switch (sset) { 166 case ETH_SS_STATS: 167 return ARRAY_SIZE(ethtool_stats_keys) + 168 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 169 VETH_TQ_STATS_LEN * dev->real_num_tx_queues; 170 default: 171 return -EOPNOTSUPP; 172 } 173 } 174 175 static void veth_get_ethtool_stats(struct net_device *dev, 176 struct ethtool_stats *stats, u64 *data) 177 { 178 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 179 struct net_device *peer = rtnl_dereference(priv->peer); 180 int i, j, idx; 181 182 data[0] = peer ? peer->ifindex : 0; 183 idx = 1; 184 for (i = 0; i < dev->real_num_rx_queues; i++) { 185 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 186 const void *stats_base = (void *)&rq_stats->vs; 187 unsigned int start; 188 size_t offset; 189 190 do { 191 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 192 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 193 offset = veth_rq_stats_desc[j].offset; 194 data[idx + j] = *(u64 *)(stats_base + offset); 195 } 196 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 197 idx += VETH_RQ_STATS_LEN; 198 } 199 200 if (!peer) 201 return; 202 203 rcv_priv = netdev_priv(peer); 204 for (i = 0; i < peer->real_num_rx_queues; i++) { 205 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 206 const void *base = (void *)&rq_stats->vs; 207 unsigned int start, tx_idx = idx; 208 size_t offset; 209 210 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 211 do { 212 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 213 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 214 offset = veth_tq_stats_desc[j].offset; 215 data[tx_idx + j] += *(u64 *)(base + offset); 216 } 217 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 218 } 219 } 220 221 static const struct ethtool_ops veth_ethtool_ops = { 222 .get_drvinfo = veth_get_drvinfo, 223 .get_link = ethtool_op_get_link, 224 .get_strings = veth_get_strings, 225 .get_sset_count = veth_get_sset_count, 226 .get_ethtool_stats = veth_get_ethtool_stats, 227 .get_link_ksettings = veth_get_link_ksettings, 228 .get_ts_info = ethtool_op_get_ts_info, 229 }; 230 231 /* general routines */ 232 233 static bool veth_is_xdp_frame(void *ptr) 234 { 235 return (unsigned long)ptr & VETH_XDP_FLAG; 236 } 237 238 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 239 { 240 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 241 } 242 243 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 244 { 245 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 246 } 247 248 static void veth_ptr_free(void *ptr) 249 { 250 if (veth_is_xdp_frame(ptr)) 251 xdp_return_frame(veth_ptr_to_xdp(ptr)); 252 else 253 kfree_skb(ptr); 254 } 255 256 static void __veth_xdp_flush(struct veth_rq *rq) 257 { 258 /* Write ptr_ring before reading rx_notify_masked */ 259 smp_mb(); 260 if (!rq->rx_notify_masked) { 261 rq->rx_notify_masked = true; 262 napi_schedule(&rq->xdp_napi); 263 } 264 } 265 266 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 267 { 268 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 269 dev_kfree_skb_any(skb); 270 return NET_RX_DROP; 271 } 272 273 return NET_RX_SUCCESS; 274 } 275 276 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 277 struct veth_rq *rq, bool xdp) 278 { 279 return __dev_forward_skb(dev, skb) ?: xdp ? 280 veth_xdp_rx(rq, skb) : 281 netif_rx(skb); 282 } 283 284 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 285 { 286 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 287 struct veth_rq *rq = NULL; 288 struct net_device *rcv; 289 int length = skb->len; 290 bool rcv_xdp = false; 291 int rxq; 292 293 rcu_read_lock(); 294 rcv = rcu_dereference(priv->peer); 295 if (unlikely(!rcv)) { 296 kfree_skb(skb); 297 goto drop; 298 } 299 300 rcv_priv = netdev_priv(rcv); 301 rxq = skb_get_queue_mapping(skb); 302 if (rxq < rcv->real_num_rx_queues) { 303 rq = &rcv_priv->rq[rxq]; 304 rcv_xdp = rcu_access_pointer(rq->xdp_prog); 305 if (rcv_xdp) 306 skb_record_rx_queue(skb, rxq); 307 } 308 309 skb_tx_timestamp(skb); 310 if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { 311 if (!rcv_xdp) 312 dev_lstats_add(dev, length); 313 } else { 314 drop: 315 atomic64_inc(&priv->dropped); 316 } 317 318 if (rcv_xdp) 319 __veth_xdp_flush(rq); 320 321 rcu_read_unlock(); 322 323 return NETDEV_TX_OK; 324 } 325 326 static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) 327 { 328 struct veth_priv *priv = netdev_priv(dev); 329 330 dev_lstats_read(dev, packets, bytes); 331 return atomic64_read(&priv->dropped); 332 } 333 334 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 335 { 336 struct veth_priv *priv = netdev_priv(dev); 337 int i; 338 339 result->peer_tq_xdp_xmit_err = 0; 340 result->xdp_packets = 0; 341 result->xdp_tx_err = 0; 342 result->xdp_bytes = 0; 343 result->rx_drops = 0; 344 for (i = 0; i < dev->num_rx_queues; i++) { 345 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 346 struct veth_rq_stats *stats = &priv->rq[i].stats; 347 unsigned int start; 348 349 do { 350 start = u64_stats_fetch_begin_irq(&stats->syncp); 351 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 352 xdp_tx_err = stats->vs.xdp_tx_err; 353 packets = stats->vs.xdp_packets; 354 bytes = stats->vs.xdp_bytes; 355 drops = stats->vs.rx_drops; 356 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 357 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 358 result->xdp_tx_err += xdp_tx_err; 359 result->xdp_packets += packets; 360 result->xdp_bytes += bytes; 361 result->rx_drops += drops; 362 } 363 } 364 365 static void veth_get_stats64(struct net_device *dev, 366 struct rtnl_link_stats64 *tot) 367 { 368 struct veth_priv *priv = netdev_priv(dev); 369 struct net_device *peer; 370 struct veth_stats rx; 371 u64 packets, bytes; 372 373 tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); 374 tot->tx_bytes = bytes; 375 tot->tx_packets = packets; 376 377 veth_stats_rx(&rx, dev); 378 tot->tx_dropped += rx.xdp_tx_err; 379 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 380 tot->rx_bytes = rx.xdp_bytes; 381 tot->rx_packets = rx.xdp_packets; 382 383 rcu_read_lock(); 384 peer = rcu_dereference(priv->peer); 385 if (peer) { 386 veth_stats_tx(peer, &packets, &bytes); 387 tot->rx_bytes += bytes; 388 tot->rx_packets += packets; 389 390 veth_stats_rx(&rx, peer); 391 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 392 tot->rx_dropped += rx.xdp_tx_err; 393 tot->tx_bytes += rx.xdp_bytes; 394 tot->tx_packets += rx.xdp_packets; 395 } 396 rcu_read_unlock(); 397 } 398 399 /* fake multicast ability */ 400 static void veth_set_multicast_list(struct net_device *dev) 401 { 402 } 403 404 static struct sk_buff *veth_build_skb(void *head, int headroom, int len, 405 int buflen) 406 { 407 struct sk_buff *skb; 408 409 skb = build_skb(head, buflen); 410 if (!skb) 411 return NULL; 412 413 skb_reserve(skb, headroom); 414 skb_put(skb, len); 415 416 return skb; 417 } 418 419 static int veth_select_rxq(struct net_device *dev) 420 { 421 return smp_processor_id() % dev->real_num_rx_queues; 422 } 423 424 static struct net_device *veth_peer_dev(struct net_device *dev) 425 { 426 struct veth_priv *priv = netdev_priv(dev); 427 428 /* Callers must be under RCU read side. */ 429 return rcu_dereference(priv->peer); 430 } 431 432 static int veth_xdp_xmit(struct net_device *dev, int n, 433 struct xdp_frame **frames, 434 u32 flags, bool ndo_xmit) 435 { 436 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 437 int i, ret = -ENXIO, nxmit = 0; 438 struct net_device *rcv; 439 unsigned int max_len; 440 struct veth_rq *rq; 441 442 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 443 return -EINVAL; 444 445 rcu_read_lock(); 446 rcv = rcu_dereference(priv->peer); 447 if (unlikely(!rcv)) 448 goto out; 449 450 rcv_priv = netdev_priv(rcv); 451 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 452 /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive 453 * side. This means an XDP program is loaded on the peer and the peer 454 * device is up. 455 */ 456 if (!rcu_access_pointer(rq->xdp_prog)) 457 goto out; 458 459 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 460 461 spin_lock(&rq->xdp_ring.producer_lock); 462 for (i = 0; i < n; i++) { 463 struct xdp_frame *frame = frames[i]; 464 void *ptr = veth_xdp_to_ptr(frame); 465 466 if (unlikely(frame->len > max_len || 467 __ptr_ring_produce(&rq->xdp_ring, ptr))) 468 break; 469 nxmit++; 470 } 471 spin_unlock(&rq->xdp_ring.producer_lock); 472 473 if (flags & XDP_XMIT_FLUSH) 474 __veth_xdp_flush(rq); 475 476 ret = nxmit; 477 if (ndo_xmit) { 478 u64_stats_update_begin(&rq->stats.syncp); 479 rq->stats.vs.peer_tq_xdp_xmit += nxmit; 480 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; 481 u64_stats_update_end(&rq->stats.syncp); 482 } 483 484 out: 485 rcu_read_unlock(); 486 487 return ret; 488 } 489 490 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 491 struct xdp_frame **frames, u32 flags) 492 { 493 int err; 494 495 err = veth_xdp_xmit(dev, n, frames, flags, true); 496 if (err < 0) { 497 struct veth_priv *priv = netdev_priv(dev); 498 499 atomic64_add(n, &priv->dropped); 500 } 501 502 return err; 503 } 504 505 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 506 { 507 int sent, i, err = 0, drops; 508 509 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 510 if (sent < 0) { 511 err = sent; 512 sent = 0; 513 } 514 515 for (i = sent; unlikely(i < bq->count); i++) 516 xdp_return_frame(bq->q[i]); 517 518 drops = bq->count - sent; 519 trace_xdp_bulk_tx(rq->dev, sent, drops, err); 520 521 u64_stats_update_begin(&rq->stats.syncp); 522 rq->stats.vs.xdp_tx += sent; 523 rq->stats.vs.xdp_tx_err += drops; 524 u64_stats_update_end(&rq->stats.syncp); 525 526 bq->count = 0; 527 } 528 529 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 530 { 531 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 532 struct net_device *rcv; 533 struct veth_rq *rcv_rq; 534 535 rcu_read_lock(); 536 veth_xdp_flush_bq(rq, bq); 537 rcv = rcu_dereference(priv->peer); 538 if (unlikely(!rcv)) 539 goto out; 540 541 rcv_priv = netdev_priv(rcv); 542 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 543 /* xdp_ring is initialized on receive side? */ 544 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 545 goto out; 546 547 __veth_xdp_flush(rcv_rq); 548 out: 549 rcu_read_unlock(); 550 } 551 552 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 553 struct veth_xdp_tx_bq *bq) 554 { 555 struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); 556 557 if (unlikely(!frame)) 558 return -EOVERFLOW; 559 560 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 561 veth_xdp_flush_bq(rq, bq); 562 563 bq->q[bq->count++] = frame; 564 565 return 0; 566 } 567 568 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, 569 struct xdp_frame *frame, 570 struct veth_xdp_tx_bq *bq, 571 struct veth_stats *stats) 572 { 573 struct xdp_frame orig_frame; 574 struct bpf_prog *xdp_prog; 575 576 rcu_read_lock(); 577 xdp_prog = rcu_dereference(rq->xdp_prog); 578 if (likely(xdp_prog)) { 579 struct xdp_buff xdp; 580 u32 act; 581 582 xdp_convert_frame_to_buff(frame, &xdp); 583 xdp.rxq = &rq->xdp_rxq; 584 585 act = bpf_prog_run_xdp(xdp_prog, &xdp); 586 587 switch (act) { 588 case XDP_PASS: 589 if (xdp_update_frame_from_buff(&xdp, frame)) 590 goto err_xdp; 591 break; 592 case XDP_TX: 593 orig_frame = *frame; 594 xdp.rxq->mem = frame->mem; 595 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 596 trace_xdp_exception(rq->dev, xdp_prog, act); 597 frame = &orig_frame; 598 stats->rx_drops++; 599 goto err_xdp; 600 } 601 stats->xdp_tx++; 602 rcu_read_unlock(); 603 goto xdp_xmit; 604 case XDP_REDIRECT: 605 orig_frame = *frame; 606 xdp.rxq->mem = frame->mem; 607 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 608 frame = &orig_frame; 609 stats->rx_drops++; 610 goto err_xdp; 611 } 612 stats->xdp_redirect++; 613 rcu_read_unlock(); 614 goto xdp_xmit; 615 default: 616 bpf_warn_invalid_xdp_action(act); 617 fallthrough; 618 case XDP_ABORTED: 619 trace_xdp_exception(rq->dev, xdp_prog, act); 620 fallthrough; 621 case XDP_DROP: 622 stats->xdp_drops++; 623 goto err_xdp; 624 } 625 } 626 rcu_read_unlock(); 627 628 return frame; 629 err_xdp: 630 rcu_read_unlock(); 631 xdp_return_frame(frame); 632 xdp_xmit: 633 return NULL; 634 } 635 636 /* frames array contains VETH_XDP_BATCH at most */ 637 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, 638 int n_xdpf, struct veth_xdp_tx_bq *bq, 639 struct veth_stats *stats) 640 { 641 void *skbs[VETH_XDP_BATCH]; 642 int i; 643 644 if (xdp_alloc_skb_bulk(skbs, n_xdpf, 645 GFP_ATOMIC | __GFP_ZERO) < 0) { 646 for (i = 0; i < n_xdpf; i++) 647 xdp_return_frame(frames[i]); 648 stats->rx_drops += n_xdpf; 649 650 return; 651 } 652 653 for (i = 0; i < n_xdpf; i++) { 654 struct sk_buff *skb = skbs[i]; 655 656 skb = __xdp_build_skb_from_frame(frames[i], skb, 657 rq->dev); 658 if (!skb) { 659 xdp_return_frame(frames[i]); 660 stats->rx_drops++; 661 continue; 662 } 663 napi_gro_receive(&rq->xdp_napi, skb); 664 } 665 } 666 667 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 668 struct sk_buff *skb, 669 struct veth_xdp_tx_bq *bq, 670 struct veth_stats *stats) 671 { 672 u32 pktlen, headroom, act, metalen, frame_sz; 673 void *orig_data, *orig_data_end; 674 struct bpf_prog *xdp_prog; 675 int mac_len, delta, off; 676 struct xdp_buff xdp; 677 678 skb_orphan(skb); 679 680 rcu_read_lock(); 681 xdp_prog = rcu_dereference(rq->xdp_prog); 682 if (unlikely(!xdp_prog)) { 683 rcu_read_unlock(); 684 goto out; 685 } 686 687 mac_len = skb->data - skb_mac_header(skb); 688 pktlen = skb->len + mac_len; 689 headroom = skb_headroom(skb) - mac_len; 690 691 if (skb_shared(skb) || skb_head_is_locked(skb) || 692 skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { 693 struct sk_buff *nskb; 694 int size, head_off; 695 void *head, *start; 696 struct page *page; 697 698 size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + 699 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 700 if (size > PAGE_SIZE) 701 goto drop; 702 703 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 704 if (!page) 705 goto drop; 706 707 head = page_address(page); 708 start = head + VETH_XDP_HEADROOM; 709 if (skb_copy_bits(skb, -mac_len, start, pktlen)) { 710 page_frag_free(head); 711 goto drop; 712 } 713 714 nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len, 715 skb->len, PAGE_SIZE); 716 if (!nskb) { 717 page_frag_free(head); 718 goto drop; 719 } 720 721 skb_copy_header(nskb, skb); 722 head_off = skb_headroom(nskb) - skb_headroom(skb); 723 skb_headers_offset_update(nskb, head_off); 724 consume_skb(skb); 725 skb = nskb; 726 } 727 728 /* SKB "head" area always have tailroom for skb_shared_info */ 729 frame_sz = skb_end_pointer(skb) - skb->head; 730 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 731 xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq); 732 xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true); 733 734 orig_data = xdp.data; 735 orig_data_end = xdp.data_end; 736 737 act = bpf_prog_run_xdp(xdp_prog, &xdp); 738 739 switch (act) { 740 case XDP_PASS: 741 break; 742 case XDP_TX: 743 get_page(virt_to_page(xdp.data)); 744 consume_skb(skb); 745 xdp.rxq->mem = rq->xdp_mem; 746 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 747 trace_xdp_exception(rq->dev, xdp_prog, act); 748 stats->rx_drops++; 749 goto err_xdp; 750 } 751 stats->xdp_tx++; 752 rcu_read_unlock(); 753 goto xdp_xmit; 754 case XDP_REDIRECT: 755 get_page(virt_to_page(xdp.data)); 756 consume_skb(skb); 757 xdp.rxq->mem = rq->xdp_mem; 758 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 759 stats->rx_drops++; 760 goto err_xdp; 761 } 762 stats->xdp_redirect++; 763 rcu_read_unlock(); 764 goto xdp_xmit; 765 default: 766 bpf_warn_invalid_xdp_action(act); 767 fallthrough; 768 case XDP_ABORTED: 769 trace_xdp_exception(rq->dev, xdp_prog, act); 770 fallthrough; 771 case XDP_DROP: 772 stats->xdp_drops++; 773 goto xdp_drop; 774 } 775 rcu_read_unlock(); 776 777 /* check if bpf_xdp_adjust_head was used */ 778 delta = orig_data - xdp.data; 779 off = mac_len + delta; 780 if (off > 0) 781 __skb_push(skb, off); 782 else if (off < 0) 783 __skb_pull(skb, -off); 784 skb->mac_header -= delta; 785 786 /* check if bpf_xdp_adjust_tail was used */ 787 off = xdp.data_end - orig_data_end; 788 if (off != 0) 789 __skb_put(skb, off); /* positive on grow, negative on shrink */ 790 skb->protocol = eth_type_trans(skb, rq->dev); 791 792 metalen = xdp.data - xdp.data_meta; 793 if (metalen) 794 skb_metadata_set(skb, metalen); 795 out: 796 return skb; 797 drop: 798 stats->rx_drops++; 799 xdp_drop: 800 rcu_read_unlock(); 801 kfree_skb(skb); 802 return NULL; 803 err_xdp: 804 rcu_read_unlock(); 805 page_frag_free(xdp.data); 806 xdp_xmit: 807 return NULL; 808 } 809 810 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 811 struct veth_xdp_tx_bq *bq, 812 struct veth_stats *stats) 813 { 814 int i, done = 0, n_xdpf = 0; 815 void *xdpf[VETH_XDP_BATCH]; 816 817 for (i = 0; i < budget; i++) { 818 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 819 820 if (!ptr) 821 break; 822 823 if (veth_is_xdp_frame(ptr)) { 824 /* ndo_xdp_xmit */ 825 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 826 827 stats->xdp_bytes += frame->len; 828 frame = veth_xdp_rcv_one(rq, frame, bq, stats); 829 if (frame) { 830 /* XDP_PASS */ 831 xdpf[n_xdpf++] = frame; 832 if (n_xdpf == VETH_XDP_BATCH) { 833 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, 834 bq, stats); 835 n_xdpf = 0; 836 } 837 } 838 } else { 839 /* ndo_start_xmit */ 840 struct sk_buff *skb = ptr; 841 842 stats->xdp_bytes += skb->len; 843 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 844 if (skb) 845 napi_gro_receive(&rq->xdp_napi, skb); 846 } 847 done++; 848 } 849 850 if (n_xdpf) 851 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); 852 853 u64_stats_update_begin(&rq->stats.syncp); 854 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 855 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 856 rq->stats.vs.xdp_drops += stats->xdp_drops; 857 rq->stats.vs.rx_drops += stats->rx_drops; 858 rq->stats.vs.xdp_packets += done; 859 u64_stats_update_end(&rq->stats.syncp); 860 861 return done; 862 } 863 864 static int veth_poll(struct napi_struct *napi, int budget) 865 { 866 struct veth_rq *rq = 867 container_of(napi, struct veth_rq, xdp_napi); 868 struct veth_stats stats = {}; 869 struct veth_xdp_tx_bq bq; 870 int done; 871 872 bq.count = 0; 873 874 xdp_set_return_frame_no_direct(); 875 done = veth_xdp_rcv(rq, budget, &bq, &stats); 876 877 if (done < budget && napi_complete_done(napi, done)) { 878 /* Write rx_notify_masked before reading ptr_ring */ 879 smp_store_mb(rq->rx_notify_masked, false); 880 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 881 rq->rx_notify_masked = true; 882 napi_schedule(&rq->xdp_napi); 883 } 884 } 885 886 if (stats.xdp_tx > 0) 887 veth_xdp_flush(rq, &bq); 888 if (stats.xdp_redirect > 0) 889 xdp_do_flush(); 890 xdp_clear_return_frame_no_direct(); 891 892 return done; 893 } 894 895 static int veth_napi_add(struct net_device *dev) 896 { 897 struct veth_priv *priv = netdev_priv(dev); 898 int err, i; 899 900 for (i = 0; i < dev->real_num_rx_queues; i++) { 901 struct veth_rq *rq = &priv->rq[i]; 902 903 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 904 if (err) 905 goto err_xdp_ring; 906 } 907 908 for (i = 0; i < dev->real_num_rx_queues; i++) { 909 struct veth_rq *rq = &priv->rq[i]; 910 911 napi_enable(&rq->xdp_napi); 912 } 913 914 return 0; 915 err_xdp_ring: 916 for (i--; i >= 0; i--) 917 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 918 919 return err; 920 } 921 922 static void veth_napi_del(struct net_device *dev) 923 { 924 struct veth_priv *priv = netdev_priv(dev); 925 int i; 926 927 for (i = 0; i < dev->real_num_rx_queues; i++) { 928 struct veth_rq *rq = &priv->rq[i]; 929 930 napi_disable(&rq->xdp_napi); 931 __netif_napi_del(&rq->xdp_napi); 932 } 933 synchronize_net(); 934 935 for (i = 0; i < dev->real_num_rx_queues; i++) { 936 struct veth_rq *rq = &priv->rq[i]; 937 938 rq->rx_notify_masked = false; 939 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 940 } 941 } 942 943 static int veth_enable_xdp(struct net_device *dev) 944 { 945 struct veth_priv *priv = netdev_priv(dev); 946 int err, i; 947 948 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 949 for (i = 0; i < dev->real_num_rx_queues; i++) { 950 struct veth_rq *rq = &priv->rq[i]; 951 952 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 953 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 954 if (err < 0) 955 goto err_rxq_reg; 956 957 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 958 MEM_TYPE_PAGE_SHARED, 959 NULL); 960 if (err < 0) 961 goto err_reg_mem; 962 963 /* Save original mem info as it can be overwritten */ 964 rq->xdp_mem = rq->xdp_rxq.mem; 965 } 966 967 err = veth_napi_add(dev); 968 if (err) 969 goto err_rxq_reg; 970 } 971 972 for (i = 0; i < dev->real_num_rx_queues; i++) 973 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 974 975 return 0; 976 err_reg_mem: 977 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 978 err_rxq_reg: 979 for (i--; i >= 0; i--) { 980 struct veth_rq *rq = &priv->rq[i]; 981 982 xdp_rxq_info_unreg(&rq->xdp_rxq); 983 netif_napi_del(&rq->xdp_napi); 984 } 985 986 return err; 987 } 988 989 static void veth_disable_xdp(struct net_device *dev) 990 { 991 struct veth_priv *priv = netdev_priv(dev); 992 int i; 993 994 for (i = 0; i < dev->real_num_rx_queues; i++) 995 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 996 veth_napi_del(dev); 997 for (i = 0; i < dev->real_num_rx_queues; i++) { 998 struct veth_rq *rq = &priv->rq[i]; 999 1000 rq->xdp_rxq.mem = rq->xdp_mem; 1001 xdp_rxq_info_unreg(&rq->xdp_rxq); 1002 } 1003 } 1004 1005 static int veth_open(struct net_device *dev) 1006 { 1007 struct veth_priv *priv = netdev_priv(dev); 1008 struct net_device *peer = rtnl_dereference(priv->peer); 1009 int err; 1010 1011 if (!peer) 1012 return -ENOTCONN; 1013 1014 if (priv->_xdp_prog) { 1015 err = veth_enable_xdp(dev); 1016 if (err) 1017 return err; 1018 } 1019 1020 if (peer->flags & IFF_UP) { 1021 netif_carrier_on(dev); 1022 netif_carrier_on(peer); 1023 } 1024 1025 return 0; 1026 } 1027 1028 static int veth_close(struct net_device *dev) 1029 { 1030 struct veth_priv *priv = netdev_priv(dev); 1031 struct net_device *peer = rtnl_dereference(priv->peer); 1032 1033 netif_carrier_off(dev); 1034 if (peer) 1035 netif_carrier_off(peer); 1036 1037 if (priv->_xdp_prog) 1038 veth_disable_xdp(dev); 1039 1040 return 0; 1041 } 1042 1043 static int is_valid_veth_mtu(int mtu) 1044 { 1045 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1046 } 1047 1048 static int veth_alloc_queues(struct net_device *dev) 1049 { 1050 struct veth_priv *priv = netdev_priv(dev); 1051 int i; 1052 1053 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); 1054 if (!priv->rq) 1055 return -ENOMEM; 1056 1057 for (i = 0; i < dev->num_rx_queues; i++) { 1058 priv->rq[i].dev = dev; 1059 u64_stats_init(&priv->rq[i].stats.syncp); 1060 } 1061 1062 return 0; 1063 } 1064 1065 static void veth_free_queues(struct net_device *dev) 1066 { 1067 struct veth_priv *priv = netdev_priv(dev); 1068 1069 kfree(priv->rq); 1070 } 1071 1072 static int veth_dev_init(struct net_device *dev) 1073 { 1074 int err; 1075 1076 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 1077 if (!dev->lstats) 1078 return -ENOMEM; 1079 1080 err = veth_alloc_queues(dev); 1081 if (err) { 1082 free_percpu(dev->lstats); 1083 return err; 1084 } 1085 1086 return 0; 1087 } 1088 1089 static void veth_dev_free(struct net_device *dev) 1090 { 1091 veth_free_queues(dev); 1092 free_percpu(dev->lstats); 1093 } 1094 1095 #ifdef CONFIG_NET_POLL_CONTROLLER 1096 static void veth_poll_controller(struct net_device *dev) 1097 { 1098 /* veth only receives frames when its peer sends one 1099 * Since it has nothing to do with disabling irqs, we are guaranteed 1100 * never to have pending data when we poll for it so 1101 * there is nothing to do here. 1102 * 1103 * We need this though so netpoll recognizes us as an interface that 1104 * supports polling, which enables bridge devices in virt setups to 1105 * still use netconsole 1106 */ 1107 } 1108 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1109 1110 static int veth_get_iflink(const struct net_device *dev) 1111 { 1112 struct veth_priv *priv = netdev_priv(dev); 1113 struct net_device *peer; 1114 int iflink; 1115 1116 rcu_read_lock(); 1117 peer = rcu_dereference(priv->peer); 1118 iflink = peer ? peer->ifindex : 0; 1119 rcu_read_unlock(); 1120 1121 return iflink; 1122 } 1123 1124 static netdev_features_t veth_fix_features(struct net_device *dev, 1125 netdev_features_t features) 1126 { 1127 struct veth_priv *priv = netdev_priv(dev); 1128 struct net_device *peer; 1129 1130 peer = rtnl_dereference(priv->peer); 1131 if (peer) { 1132 struct veth_priv *peer_priv = netdev_priv(peer); 1133 1134 if (peer_priv->_xdp_prog) 1135 features &= ~NETIF_F_GSO_SOFTWARE; 1136 } 1137 1138 return features; 1139 } 1140 1141 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1142 { 1143 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1144 struct net_device *peer; 1145 1146 if (new_hr < 0) 1147 new_hr = 0; 1148 1149 rcu_read_lock(); 1150 peer = rcu_dereference(priv->peer); 1151 if (unlikely(!peer)) 1152 goto out; 1153 1154 peer_priv = netdev_priv(peer); 1155 priv->requested_headroom = new_hr; 1156 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1157 dev->needed_headroom = new_hr; 1158 peer->needed_headroom = new_hr; 1159 1160 out: 1161 rcu_read_unlock(); 1162 } 1163 1164 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1165 struct netlink_ext_ack *extack) 1166 { 1167 struct veth_priv *priv = netdev_priv(dev); 1168 struct bpf_prog *old_prog; 1169 struct net_device *peer; 1170 unsigned int max_mtu; 1171 int err; 1172 1173 old_prog = priv->_xdp_prog; 1174 priv->_xdp_prog = prog; 1175 peer = rtnl_dereference(priv->peer); 1176 1177 if (prog) { 1178 if (!peer) { 1179 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1180 err = -ENOTCONN; 1181 goto err; 1182 } 1183 1184 max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - 1185 peer->hard_header_len - 1186 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 1187 if (peer->mtu > max_mtu) { 1188 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1189 err = -ERANGE; 1190 goto err; 1191 } 1192 1193 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1194 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1195 err = -ENOSPC; 1196 goto err; 1197 } 1198 1199 if (dev->flags & IFF_UP) { 1200 err = veth_enable_xdp(dev); 1201 if (err) { 1202 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1203 goto err; 1204 } 1205 } 1206 1207 if (!old_prog) { 1208 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1209 peer->max_mtu = max_mtu; 1210 } 1211 } 1212 1213 if (old_prog) { 1214 if (!prog) { 1215 if (dev->flags & IFF_UP) 1216 veth_disable_xdp(dev); 1217 1218 if (peer) { 1219 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1220 peer->max_mtu = ETH_MAX_MTU; 1221 } 1222 } 1223 bpf_prog_put(old_prog); 1224 } 1225 1226 if ((!!old_prog ^ !!prog) && peer) 1227 netdev_update_features(peer); 1228 1229 return 0; 1230 err: 1231 priv->_xdp_prog = old_prog; 1232 1233 return err; 1234 } 1235 1236 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1237 { 1238 switch (xdp->command) { 1239 case XDP_SETUP_PROG: 1240 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1241 default: 1242 return -EINVAL; 1243 } 1244 } 1245 1246 static const struct net_device_ops veth_netdev_ops = { 1247 .ndo_init = veth_dev_init, 1248 .ndo_open = veth_open, 1249 .ndo_stop = veth_close, 1250 .ndo_start_xmit = veth_xmit, 1251 .ndo_get_stats64 = veth_get_stats64, 1252 .ndo_set_rx_mode = veth_set_multicast_list, 1253 .ndo_set_mac_address = eth_mac_addr, 1254 #ifdef CONFIG_NET_POLL_CONTROLLER 1255 .ndo_poll_controller = veth_poll_controller, 1256 #endif 1257 .ndo_get_iflink = veth_get_iflink, 1258 .ndo_fix_features = veth_fix_features, 1259 .ndo_features_check = passthru_features_check, 1260 .ndo_set_rx_headroom = veth_set_rx_headroom, 1261 .ndo_bpf = veth_xdp, 1262 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1263 .ndo_get_peer_dev = veth_peer_dev, 1264 }; 1265 1266 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1267 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1268 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1269 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1270 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1271 1272 static void veth_setup(struct net_device *dev) 1273 { 1274 ether_setup(dev); 1275 1276 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1277 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1278 dev->priv_flags |= IFF_NO_QUEUE; 1279 dev->priv_flags |= IFF_PHONY_HEADROOM; 1280 1281 dev->netdev_ops = &veth_netdev_ops; 1282 dev->ethtool_ops = &veth_ethtool_ops; 1283 dev->features |= NETIF_F_LLTX; 1284 dev->features |= VETH_FEATURES; 1285 dev->vlan_features = dev->features & 1286 ~(NETIF_F_HW_VLAN_CTAG_TX | 1287 NETIF_F_HW_VLAN_STAG_TX | 1288 NETIF_F_HW_VLAN_CTAG_RX | 1289 NETIF_F_HW_VLAN_STAG_RX); 1290 dev->needs_free_netdev = true; 1291 dev->priv_destructor = veth_dev_free; 1292 dev->max_mtu = ETH_MAX_MTU; 1293 1294 dev->hw_features = VETH_FEATURES; 1295 dev->hw_enc_features = VETH_FEATURES; 1296 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1297 } 1298 1299 /* 1300 * netlink interface 1301 */ 1302 1303 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1304 struct netlink_ext_ack *extack) 1305 { 1306 if (tb[IFLA_ADDRESS]) { 1307 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1308 return -EINVAL; 1309 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1310 return -EADDRNOTAVAIL; 1311 } 1312 if (tb[IFLA_MTU]) { 1313 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1314 return -EINVAL; 1315 } 1316 return 0; 1317 } 1318 1319 static struct rtnl_link_ops veth_link_ops; 1320 1321 static int veth_newlink(struct net *src_net, struct net_device *dev, 1322 struct nlattr *tb[], struct nlattr *data[], 1323 struct netlink_ext_ack *extack) 1324 { 1325 int err; 1326 struct net_device *peer; 1327 struct veth_priv *priv; 1328 char ifname[IFNAMSIZ]; 1329 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1330 unsigned char name_assign_type; 1331 struct ifinfomsg *ifmp; 1332 struct net *net; 1333 1334 /* 1335 * create and register peer first 1336 */ 1337 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1338 struct nlattr *nla_peer; 1339 1340 nla_peer = data[VETH_INFO_PEER]; 1341 ifmp = nla_data(nla_peer); 1342 err = rtnl_nla_parse_ifla(peer_tb, 1343 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1344 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1345 NULL); 1346 if (err < 0) 1347 return err; 1348 1349 err = veth_validate(peer_tb, NULL, extack); 1350 if (err < 0) 1351 return err; 1352 1353 tbp = peer_tb; 1354 } else { 1355 ifmp = NULL; 1356 tbp = tb; 1357 } 1358 1359 if (ifmp && tbp[IFLA_IFNAME]) { 1360 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1361 name_assign_type = NET_NAME_USER; 1362 } else { 1363 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1364 name_assign_type = NET_NAME_ENUM; 1365 } 1366 1367 net = rtnl_link_get_net(src_net, tbp); 1368 if (IS_ERR(net)) 1369 return PTR_ERR(net); 1370 1371 peer = rtnl_create_link(net, ifname, name_assign_type, 1372 &veth_link_ops, tbp, extack); 1373 if (IS_ERR(peer)) { 1374 put_net(net); 1375 return PTR_ERR(peer); 1376 } 1377 1378 if (!ifmp || !tbp[IFLA_ADDRESS]) 1379 eth_hw_addr_random(peer); 1380 1381 if (ifmp && (dev->ifindex != 0)) 1382 peer->ifindex = ifmp->ifi_index; 1383 1384 peer->gso_max_size = dev->gso_max_size; 1385 peer->gso_max_segs = dev->gso_max_segs; 1386 1387 err = register_netdevice(peer); 1388 put_net(net); 1389 net = NULL; 1390 if (err < 0) 1391 goto err_register_peer; 1392 1393 netif_carrier_off(peer); 1394 1395 err = rtnl_configure_link(peer, ifmp); 1396 if (err < 0) 1397 goto err_configure_peer; 1398 1399 /* 1400 * register dev last 1401 * 1402 * note, that since we've registered new device the dev's name 1403 * should be re-allocated 1404 */ 1405 1406 if (tb[IFLA_ADDRESS] == NULL) 1407 eth_hw_addr_random(dev); 1408 1409 if (tb[IFLA_IFNAME]) 1410 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1411 else 1412 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1413 1414 err = register_netdevice(dev); 1415 if (err < 0) 1416 goto err_register_dev; 1417 1418 netif_carrier_off(dev); 1419 1420 /* 1421 * tie the deviced together 1422 */ 1423 1424 priv = netdev_priv(dev); 1425 rcu_assign_pointer(priv->peer, peer); 1426 1427 priv = netdev_priv(peer); 1428 rcu_assign_pointer(priv->peer, dev); 1429 1430 return 0; 1431 1432 err_register_dev: 1433 /* nothing to do */ 1434 err_configure_peer: 1435 unregister_netdevice(peer); 1436 return err; 1437 1438 err_register_peer: 1439 free_netdev(peer); 1440 return err; 1441 } 1442 1443 static void veth_dellink(struct net_device *dev, struct list_head *head) 1444 { 1445 struct veth_priv *priv; 1446 struct net_device *peer; 1447 1448 priv = netdev_priv(dev); 1449 peer = rtnl_dereference(priv->peer); 1450 1451 /* Note : dellink() is called from default_device_exit_batch(), 1452 * before a rcu_synchronize() point. The devices are guaranteed 1453 * not being freed before one RCU grace period. 1454 */ 1455 RCU_INIT_POINTER(priv->peer, NULL); 1456 unregister_netdevice_queue(dev, head); 1457 1458 if (peer) { 1459 priv = netdev_priv(peer); 1460 RCU_INIT_POINTER(priv->peer, NULL); 1461 unregister_netdevice_queue(peer, head); 1462 } 1463 } 1464 1465 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1466 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1467 }; 1468 1469 static struct net *veth_get_link_net(const struct net_device *dev) 1470 { 1471 struct veth_priv *priv = netdev_priv(dev); 1472 struct net_device *peer = rtnl_dereference(priv->peer); 1473 1474 return peer ? dev_net(peer) : dev_net(dev); 1475 } 1476 1477 static struct rtnl_link_ops veth_link_ops = { 1478 .kind = DRV_NAME, 1479 .priv_size = sizeof(struct veth_priv), 1480 .setup = veth_setup, 1481 .validate = veth_validate, 1482 .newlink = veth_newlink, 1483 .dellink = veth_dellink, 1484 .policy = veth_policy, 1485 .maxtype = VETH_INFO_MAX, 1486 .get_link_net = veth_get_link_net, 1487 }; 1488 1489 /* 1490 * init/fini 1491 */ 1492 1493 static __init int veth_init(void) 1494 { 1495 return rtnl_link_register(&veth_link_ops); 1496 } 1497 1498 static __exit void veth_exit(void) 1499 { 1500 rtnl_link_unregister(&veth_link_ops); 1501 } 1502 1503 module_init(veth_init); 1504 module_exit(veth_exit); 1505 1506 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1507 MODULE_LICENSE("GPL v2"); 1508 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1509