1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/xfrm.h> 21 #include <net/xdp.h> 22 #include <linux/veth.h> 23 #include <linux/module.h> 24 #include <linux/bpf.h> 25 #include <linux/filter.h> 26 #include <linux/ptr_ring.h> 27 #include <linux/bpf_trace.h> 28 #include <linux/net_tstamp.h> 29 30 #define DRV_NAME "veth" 31 #define DRV_VERSION "1.0" 32 33 #define VETH_XDP_FLAG BIT(0) 34 #define VETH_RING_SIZE 256 35 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 36 37 #define VETH_XDP_TX_BULK_SIZE 16 38 39 struct veth_stats { 40 u64 rx_drops; 41 /* xdp */ 42 u64 xdp_packets; 43 u64 xdp_bytes; 44 u64 xdp_redirect; 45 u64 xdp_drops; 46 u64 xdp_tx; 47 u64 xdp_tx_err; 48 u64 peer_tq_xdp_xmit; 49 u64 peer_tq_xdp_xmit_err; 50 }; 51 52 struct veth_rq_stats { 53 struct veth_stats vs; 54 struct u64_stats_sync syncp; 55 }; 56 57 struct veth_rq { 58 struct napi_struct xdp_napi; 59 struct net_device *dev; 60 struct bpf_prog __rcu *xdp_prog; 61 struct xdp_mem_info xdp_mem; 62 struct veth_rq_stats stats; 63 bool rx_notify_masked; 64 struct ptr_ring xdp_ring; 65 struct xdp_rxq_info xdp_rxq; 66 }; 67 68 struct veth_priv { 69 struct net_device __rcu *peer; 70 atomic64_t dropped; 71 struct bpf_prog *_xdp_prog; 72 struct veth_rq *rq; 73 unsigned int requested_headroom; 74 }; 75 76 struct veth_xdp_tx_bq { 77 struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; 78 unsigned int count; 79 }; 80 81 /* 82 * ethtool interface 83 */ 84 85 struct veth_q_stat_desc { 86 char desc[ETH_GSTRING_LEN]; 87 size_t offset; 88 }; 89 90 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 91 92 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 93 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 94 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 95 { "drops", VETH_RQ_STAT(rx_drops) }, 96 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 97 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 98 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 99 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 100 }; 101 102 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 103 104 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 105 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 106 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 107 }; 108 109 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 110 111 static struct { 112 const char string[ETH_GSTRING_LEN]; 113 } ethtool_stats_keys[] = { 114 { "peer_ifindex" }, 115 }; 116 117 static int veth_get_link_ksettings(struct net_device *dev, 118 struct ethtool_link_ksettings *cmd) 119 { 120 cmd->base.speed = SPEED_10000; 121 cmd->base.duplex = DUPLEX_FULL; 122 cmd->base.port = PORT_TP; 123 cmd->base.autoneg = AUTONEG_DISABLE; 124 return 0; 125 } 126 127 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 128 { 129 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 130 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 131 } 132 133 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 134 { 135 char *p = (char *)buf; 136 int i, j; 137 138 switch(stringset) { 139 case ETH_SS_STATS: 140 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 141 p += sizeof(ethtool_stats_keys); 142 for (i = 0; i < dev->real_num_rx_queues; i++) { 143 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 144 snprintf(p, ETH_GSTRING_LEN, 145 "rx_queue_%u_%.18s", 146 i, veth_rq_stats_desc[j].desc); 147 p += ETH_GSTRING_LEN; 148 } 149 } 150 for (i = 0; i < dev->real_num_tx_queues; i++) { 151 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 152 snprintf(p, ETH_GSTRING_LEN, 153 "tx_queue_%u_%.18s", 154 i, veth_tq_stats_desc[j].desc); 155 p += ETH_GSTRING_LEN; 156 } 157 } 158 break; 159 } 160 } 161 162 static int veth_get_sset_count(struct net_device *dev, int sset) 163 { 164 switch (sset) { 165 case ETH_SS_STATS: 166 return ARRAY_SIZE(ethtool_stats_keys) + 167 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 168 VETH_TQ_STATS_LEN * dev->real_num_tx_queues; 169 default: 170 return -EOPNOTSUPP; 171 } 172 } 173 174 static void veth_get_ethtool_stats(struct net_device *dev, 175 struct ethtool_stats *stats, u64 *data) 176 { 177 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 178 struct net_device *peer = rtnl_dereference(priv->peer); 179 int i, j, idx; 180 181 data[0] = peer ? peer->ifindex : 0; 182 idx = 1; 183 for (i = 0; i < dev->real_num_rx_queues; i++) { 184 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 185 const void *stats_base = (void *)&rq_stats->vs; 186 unsigned int start; 187 size_t offset; 188 189 do { 190 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 191 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 192 offset = veth_rq_stats_desc[j].offset; 193 data[idx + j] = *(u64 *)(stats_base + offset); 194 } 195 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 196 idx += VETH_RQ_STATS_LEN; 197 } 198 199 if (!peer) 200 return; 201 202 rcv_priv = netdev_priv(peer); 203 for (i = 0; i < peer->real_num_rx_queues; i++) { 204 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 205 const void *base = (void *)&rq_stats->vs; 206 unsigned int start, tx_idx = idx; 207 size_t offset; 208 209 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 210 do { 211 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 212 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 213 offset = veth_tq_stats_desc[j].offset; 214 data[tx_idx + j] += *(u64 *)(base + offset); 215 } 216 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 217 } 218 } 219 220 static const struct ethtool_ops veth_ethtool_ops = { 221 .get_drvinfo = veth_get_drvinfo, 222 .get_link = ethtool_op_get_link, 223 .get_strings = veth_get_strings, 224 .get_sset_count = veth_get_sset_count, 225 .get_ethtool_stats = veth_get_ethtool_stats, 226 .get_link_ksettings = veth_get_link_ksettings, 227 .get_ts_info = ethtool_op_get_ts_info, 228 }; 229 230 /* general routines */ 231 232 static bool veth_is_xdp_frame(void *ptr) 233 { 234 return (unsigned long)ptr & VETH_XDP_FLAG; 235 } 236 237 static void *veth_ptr_to_xdp(void *ptr) 238 { 239 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 240 } 241 242 static void *veth_xdp_to_ptr(void *ptr) 243 { 244 return (void *)((unsigned long)ptr | VETH_XDP_FLAG); 245 } 246 247 static void veth_ptr_free(void *ptr) 248 { 249 if (veth_is_xdp_frame(ptr)) 250 xdp_return_frame(veth_ptr_to_xdp(ptr)); 251 else 252 kfree_skb(ptr); 253 } 254 255 static void __veth_xdp_flush(struct veth_rq *rq) 256 { 257 /* Write ptr_ring before reading rx_notify_masked */ 258 smp_mb(); 259 if (!rq->rx_notify_masked) { 260 rq->rx_notify_masked = true; 261 napi_schedule(&rq->xdp_napi); 262 } 263 } 264 265 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 266 { 267 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 268 dev_kfree_skb_any(skb); 269 return NET_RX_DROP; 270 } 271 272 return NET_RX_SUCCESS; 273 } 274 275 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 276 struct veth_rq *rq, bool xdp) 277 { 278 return __dev_forward_skb(dev, skb) ?: xdp ? 279 veth_xdp_rx(rq, skb) : 280 netif_rx(skb); 281 } 282 283 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 284 { 285 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 286 struct veth_rq *rq = NULL; 287 struct net_device *rcv; 288 int length = skb->len; 289 bool rcv_xdp = false; 290 int rxq; 291 292 rcu_read_lock(); 293 rcv = rcu_dereference(priv->peer); 294 if (unlikely(!rcv)) { 295 kfree_skb(skb); 296 goto drop; 297 } 298 299 rcv_priv = netdev_priv(rcv); 300 rxq = skb_get_queue_mapping(skb); 301 if (rxq < rcv->real_num_rx_queues) { 302 rq = &rcv_priv->rq[rxq]; 303 rcv_xdp = rcu_access_pointer(rq->xdp_prog); 304 if (rcv_xdp) 305 skb_record_rx_queue(skb, rxq); 306 } 307 308 skb_tx_timestamp(skb); 309 if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { 310 if (!rcv_xdp) 311 dev_lstats_add(dev, length); 312 } else { 313 drop: 314 atomic64_inc(&priv->dropped); 315 } 316 317 if (rcv_xdp) 318 __veth_xdp_flush(rq); 319 320 rcu_read_unlock(); 321 322 return NETDEV_TX_OK; 323 } 324 325 static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) 326 { 327 struct veth_priv *priv = netdev_priv(dev); 328 329 dev_lstats_read(dev, packets, bytes); 330 return atomic64_read(&priv->dropped); 331 } 332 333 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 334 { 335 struct veth_priv *priv = netdev_priv(dev); 336 int i; 337 338 result->peer_tq_xdp_xmit_err = 0; 339 result->xdp_packets = 0; 340 result->xdp_tx_err = 0; 341 result->xdp_bytes = 0; 342 result->rx_drops = 0; 343 for (i = 0; i < dev->num_rx_queues; i++) { 344 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 345 struct veth_rq_stats *stats = &priv->rq[i].stats; 346 unsigned int start; 347 348 do { 349 start = u64_stats_fetch_begin_irq(&stats->syncp); 350 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 351 xdp_tx_err = stats->vs.xdp_tx_err; 352 packets = stats->vs.xdp_packets; 353 bytes = stats->vs.xdp_bytes; 354 drops = stats->vs.rx_drops; 355 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 356 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 357 result->xdp_tx_err += xdp_tx_err; 358 result->xdp_packets += packets; 359 result->xdp_bytes += bytes; 360 result->rx_drops += drops; 361 } 362 } 363 364 static void veth_get_stats64(struct net_device *dev, 365 struct rtnl_link_stats64 *tot) 366 { 367 struct veth_priv *priv = netdev_priv(dev); 368 struct net_device *peer; 369 struct veth_stats rx; 370 u64 packets, bytes; 371 372 tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); 373 tot->tx_bytes = bytes; 374 tot->tx_packets = packets; 375 376 veth_stats_rx(&rx, dev); 377 tot->tx_dropped += rx.xdp_tx_err; 378 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 379 tot->rx_bytes = rx.xdp_bytes; 380 tot->rx_packets = rx.xdp_packets; 381 382 rcu_read_lock(); 383 peer = rcu_dereference(priv->peer); 384 if (peer) { 385 veth_stats_tx(peer, &packets, &bytes); 386 tot->rx_bytes += bytes; 387 tot->rx_packets += packets; 388 389 veth_stats_rx(&rx, peer); 390 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 391 tot->rx_dropped += rx.xdp_tx_err; 392 tot->tx_bytes += rx.xdp_bytes; 393 tot->tx_packets += rx.xdp_packets; 394 } 395 rcu_read_unlock(); 396 } 397 398 /* fake multicast ability */ 399 static void veth_set_multicast_list(struct net_device *dev) 400 { 401 } 402 403 static struct sk_buff *veth_build_skb(void *head, int headroom, int len, 404 int buflen) 405 { 406 struct sk_buff *skb; 407 408 skb = build_skb(head, buflen); 409 if (!skb) 410 return NULL; 411 412 skb_reserve(skb, headroom); 413 skb_put(skb, len); 414 415 return skb; 416 } 417 418 static int veth_select_rxq(struct net_device *dev) 419 { 420 return smp_processor_id() % dev->real_num_rx_queues; 421 } 422 423 static int veth_xdp_xmit(struct net_device *dev, int n, 424 struct xdp_frame **frames, 425 u32 flags, bool ndo_xmit) 426 { 427 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 428 int i, ret = -ENXIO, drops = 0; 429 struct net_device *rcv; 430 unsigned int max_len; 431 struct veth_rq *rq; 432 433 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 434 return -EINVAL; 435 436 rcu_read_lock(); 437 rcv = rcu_dereference(priv->peer); 438 if (unlikely(!rcv)) 439 goto out; 440 441 rcv_priv = netdev_priv(rcv); 442 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 443 /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive 444 * side. This means an XDP program is loaded on the peer and the peer 445 * device is up. 446 */ 447 if (!rcu_access_pointer(rq->xdp_prog)) 448 goto out; 449 450 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 451 452 spin_lock(&rq->xdp_ring.producer_lock); 453 for (i = 0; i < n; i++) { 454 struct xdp_frame *frame = frames[i]; 455 void *ptr = veth_xdp_to_ptr(frame); 456 457 if (unlikely(frame->len > max_len || 458 __ptr_ring_produce(&rq->xdp_ring, ptr))) { 459 xdp_return_frame_rx_napi(frame); 460 drops++; 461 } 462 } 463 spin_unlock(&rq->xdp_ring.producer_lock); 464 465 if (flags & XDP_XMIT_FLUSH) 466 __veth_xdp_flush(rq); 467 468 ret = n - drops; 469 if (ndo_xmit) { 470 u64_stats_update_begin(&rq->stats.syncp); 471 rq->stats.vs.peer_tq_xdp_xmit += n - drops; 472 rq->stats.vs.peer_tq_xdp_xmit_err += drops; 473 u64_stats_update_end(&rq->stats.syncp); 474 } 475 476 out: 477 rcu_read_unlock(); 478 479 return ret; 480 } 481 482 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 483 struct xdp_frame **frames, u32 flags) 484 { 485 int err; 486 487 err = veth_xdp_xmit(dev, n, frames, flags, true); 488 if (err < 0) { 489 struct veth_priv *priv = netdev_priv(dev); 490 491 atomic64_add(n, &priv->dropped); 492 } 493 494 return err; 495 } 496 497 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 498 { 499 int sent, i, err = 0; 500 501 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 502 if (sent < 0) { 503 err = sent; 504 sent = 0; 505 for (i = 0; i < bq->count; i++) 506 xdp_return_frame(bq->q[i]); 507 } 508 trace_xdp_bulk_tx(rq->dev, sent, bq->count - sent, err); 509 510 u64_stats_update_begin(&rq->stats.syncp); 511 rq->stats.vs.xdp_tx += sent; 512 rq->stats.vs.xdp_tx_err += bq->count - sent; 513 u64_stats_update_end(&rq->stats.syncp); 514 515 bq->count = 0; 516 } 517 518 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 519 { 520 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 521 struct net_device *rcv; 522 struct veth_rq *rcv_rq; 523 524 rcu_read_lock(); 525 veth_xdp_flush_bq(rq, bq); 526 rcv = rcu_dereference(priv->peer); 527 if (unlikely(!rcv)) 528 goto out; 529 530 rcv_priv = netdev_priv(rcv); 531 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 532 /* xdp_ring is initialized on receive side? */ 533 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 534 goto out; 535 536 __veth_xdp_flush(rcv_rq); 537 out: 538 rcu_read_unlock(); 539 } 540 541 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 542 struct veth_xdp_tx_bq *bq) 543 { 544 struct xdp_frame *frame = convert_to_xdp_frame(xdp); 545 546 if (unlikely(!frame)) 547 return -EOVERFLOW; 548 549 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 550 veth_xdp_flush_bq(rq, bq); 551 552 bq->q[bq->count++] = frame; 553 554 return 0; 555 } 556 557 static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, 558 struct xdp_frame *frame, 559 struct veth_xdp_tx_bq *bq, 560 struct veth_stats *stats) 561 { 562 void *hard_start = frame->data - frame->headroom; 563 int len = frame->len, delta = 0; 564 struct xdp_frame orig_frame; 565 struct bpf_prog *xdp_prog; 566 unsigned int headroom; 567 struct sk_buff *skb; 568 569 /* bpf_xdp_adjust_head() assures BPF cannot access xdp_frame area */ 570 hard_start -= sizeof(struct xdp_frame); 571 572 rcu_read_lock(); 573 xdp_prog = rcu_dereference(rq->xdp_prog); 574 if (likely(xdp_prog)) { 575 struct xdp_buff xdp; 576 u32 act; 577 578 xdp.data_hard_start = hard_start; 579 xdp.data = frame->data; 580 xdp.data_end = frame->data + frame->len; 581 xdp.data_meta = frame->data - frame->metasize; 582 xdp.frame_sz = frame->frame_sz; 583 xdp.rxq = &rq->xdp_rxq; 584 585 act = bpf_prog_run_xdp(xdp_prog, &xdp); 586 587 switch (act) { 588 case XDP_PASS: 589 delta = frame->data - xdp.data; 590 len = xdp.data_end - xdp.data; 591 break; 592 case XDP_TX: 593 orig_frame = *frame; 594 xdp.rxq->mem = frame->mem; 595 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 596 trace_xdp_exception(rq->dev, xdp_prog, act); 597 frame = &orig_frame; 598 stats->rx_drops++; 599 goto err_xdp; 600 } 601 stats->xdp_tx++; 602 rcu_read_unlock(); 603 goto xdp_xmit; 604 case XDP_REDIRECT: 605 orig_frame = *frame; 606 xdp.rxq->mem = frame->mem; 607 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 608 frame = &orig_frame; 609 stats->rx_drops++; 610 goto err_xdp; 611 } 612 stats->xdp_redirect++; 613 rcu_read_unlock(); 614 goto xdp_xmit; 615 default: 616 bpf_warn_invalid_xdp_action(act); 617 /* fall through */ 618 case XDP_ABORTED: 619 trace_xdp_exception(rq->dev, xdp_prog, act); 620 /* fall through */ 621 case XDP_DROP: 622 stats->xdp_drops++; 623 goto err_xdp; 624 } 625 } 626 rcu_read_unlock(); 627 628 headroom = sizeof(struct xdp_frame) + frame->headroom - delta; 629 skb = veth_build_skb(hard_start, headroom, len, frame->frame_sz); 630 if (!skb) { 631 xdp_return_frame(frame); 632 stats->rx_drops++; 633 goto err; 634 } 635 636 xdp_release_frame(frame); 637 xdp_scrub_frame(frame); 638 skb->protocol = eth_type_trans(skb, rq->dev); 639 err: 640 return skb; 641 err_xdp: 642 rcu_read_unlock(); 643 xdp_return_frame(frame); 644 xdp_xmit: 645 return NULL; 646 } 647 648 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 649 struct sk_buff *skb, 650 struct veth_xdp_tx_bq *bq, 651 struct veth_stats *stats) 652 { 653 u32 pktlen, headroom, act, metalen; 654 void *orig_data, *orig_data_end; 655 struct bpf_prog *xdp_prog; 656 int mac_len, delta, off; 657 struct xdp_buff xdp; 658 659 skb_orphan(skb); 660 661 rcu_read_lock(); 662 xdp_prog = rcu_dereference(rq->xdp_prog); 663 if (unlikely(!xdp_prog)) { 664 rcu_read_unlock(); 665 goto out; 666 } 667 668 mac_len = skb->data - skb_mac_header(skb); 669 pktlen = skb->len + mac_len; 670 headroom = skb_headroom(skb) - mac_len; 671 672 if (skb_shared(skb) || skb_head_is_locked(skb) || 673 skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { 674 struct sk_buff *nskb; 675 int size, head_off; 676 void *head, *start; 677 struct page *page; 678 679 size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + 680 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 681 if (size > PAGE_SIZE) 682 goto drop; 683 684 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 685 if (!page) 686 goto drop; 687 688 head = page_address(page); 689 start = head + VETH_XDP_HEADROOM; 690 if (skb_copy_bits(skb, -mac_len, start, pktlen)) { 691 page_frag_free(head); 692 goto drop; 693 } 694 695 nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len, 696 skb->len, PAGE_SIZE); 697 if (!nskb) { 698 page_frag_free(head); 699 goto drop; 700 } 701 702 skb_copy_header(nskb, skb); 703 head_off = skb_headroom(nskb) - skb_headroom(skb); 704 skb_headers_offset_update(nskb, head_off); 705 consume_skb(skb); 706 skb = nskb; 707 } 708 709 xdp.data_hard_start = skb->head; 710 xdp.data = skb_mac_header(skb); 711 xdp.data_end = xdp.data + pktlen; 712 xdp.data_meta = xdp.data; 713 xdp.rxq = &rq->xdp_rxq; 714 715 /* SKB "head" area always have tailroom for skb_shared_info */ 716 xdp.frame_sz = (void *)skb_end_pointer(skb) - xdp.data_hard_start; 717 xdp.frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 718 719 orig_data = xdp.data; 720 orig_data_end = xdp.data_end; 721 722 act = bpf_prog_run_xdp(xdp_prog, &xdp); 723 724 switch (act) { 725 case XDP_PASS: 726 break; 727 case XDP_TX: 728 get_page(virt_to_page(xdp.data)); 729 consume_skb(skb); 730 xdp.rxq->mem = rq->xdp_mem; 731 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 732 trace_xdp_exception(rq->dev, xdp_prog, act); 733 stats->rx_drops++; 734 goto err_xdp; 735 } 736 stats->xdp_tx++; 737 rcu_read_unlock(); 738 goto xdp_xmit; 739 case XDP_REDIRECT: 740 get_page(virt_to_page(xdp.data)); 741 consume_skb(skb); 742 xdp.rxq->mem = rq->xdp_mem; 743 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 744 stats->rx_drops++; 745 goto err_xdp; 746 } 747 stats->xdp_redirect++; 748 rcu_read_unlock(); 749 goto xdp_xmit; 750 default: 751 bpf_warn_invalid_xdp_action(act); 752 /* fall through */ 753 case XDP_ABORTED: 754 trace_xdp_exception(rq->dev, xdp_prog, act); 755 /* fall through */ 756 case XDP_DROP: 757 stats->xdp_drops++; 758 goto xdp_drop; 759 } 760 rcu_read_unlock(); 761 762 /* check if bpf_xdp_adjust_head was used */ 763 delta = orig_data - xdp.data; 764 off = mac_len + delta; 765 if (off > 0) 766 __skb_push(skb, off); 767 else if (off < 0) 768 __skb_pull(skb, -off); 769 skb->mac_header -= delta; 770 771 /* check if bpf_xdp_adjust_tail was used */ 772 off = xdp.data_end - orig_data_end; 773 if (off != 0) 774 __skb_put(skb, off); /* positive on grow, negative on shrink */ 775 skb->protocol = eth_type_trans(skb, rq->dev); 776 777 metalen = xdp.data - xdp.data_meta; 778 if (metalen) 779 skb_metadata_set(skb, metalen); 780 out: 781 return skb; 782 drop: 783 stats->rx_drops++; 784 xdp_drop: 785 rcu_read_unlock(); 786 kfree_skb(skb); 787 return NULL; 788 err_xdp: 789 rcu_read_unlock(); 790 page_frag_free(xdp.data); 791 xdp_xmit: 792 return NULL; 793 } 794 795 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 796 struct veth_xdp_tx_bq *bq, 797 struct veth_stats *stats) 798 { 799 int i, done = 0; 800 801 for (i = 0; i < budget; i++) { 802 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 803 struct sk_buff *skb; 804 805 if (!ptr) 806 break; 807 808 if (veth_is_xdp_frame(ptr)) { 809 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 810 811 stats->xdp_bytes += frame->len; 812 skb = veth_xdp_rcv_one(rq, frame, bq, stats); 813 } else { 814 skb = ptr; 815 stats->xdp_bytes += skb->len; 816 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 817 } 818 819 if (skb) 820 napi_gro_receive(&rq->xdp_napi, skb); 821 822 done++; 823 } 824 825 u64_stats_update_begin(&rq->stats.syncp); 826 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 827 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 828 rq->stats.vs.xdp_drops += stats->xdp_drops; 829 rq->stats.vs.rx_drops += stats->rx_drops; 830 rq->stats.vs.xdp_packets += done; 831 u64_stats_update_end(&rq->stats.syncp); 832 833 return done; 834 } 835 836 static int veth_poll(struct napi_struct *napi, int budget) 837 { 838 struct veth_rq *rq = 839 container_of(napi, struct veth_rq, xdp_napi); 840 struct veth_stats stats = {}; 841 struct veth_xdp_tx_bq bq; 842 int done; 843 844 bq.count = 0; 845 846 xdp_set_return_frame_no_direct(); 847 done = veth_xdp_rcv(rq, budget, &bq, &stats); 848 849 if (done < budget && napi_complete_done(napi, done)) { 850 /* Write rx_notify_masked before reading ptr_ring */ 851 smp_store_mb(rq->rx_notify_masked, false); 852 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 853 rq->rx_notify_masked = true; 854 napi_schedule(&rq->xdp_napi); 855 } 856 } 857 858 if (stats.xdp_tx > 0) 859 veth_xdp_flush(rq, &bq); 860 if (stats.xdp_redirect > 0) 861 xdp_do_flush(); 862 xdp_clear_return_frame_no_direct(); 863 864 return done; 865 } 866 867 static int veth_napi_add(struct net_device *dev) 868 { 869 struct veth_priv *priv = netdev_priv(dev); 870 int err, i; 871 872 for (i = 0; i < dev->real_num_rx_queues; i++) { 873 struct veth_rq *rq = &priv->rq[i]; 874 875 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 876 if (err) 877 goto err_xdp_ring; 878 } 879 880 for (i = 0; i < dev->real_num_rx_queues; i++) { 881 struct veth_rq *rq = &priv->rq[i]; 882 883 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 884 napi_enable(&rq->xdp_napi); 885 } 886 887 return 0; 888 err_xdp_ring: 889 for (i--; i >= 0; i--) 890 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 891 892 return err; 893 } 894 895 static void veth_napi_del(struct net_device *dev) 896 { 897 struct veth_priv *priv = netdev_priv(dev); 898 int i; 899 900 for (i = 0; i < dev->real_num_rx_queues; i++) { 901 struct veth_rq *rq = &priv->rq[i]; 902 903 napi_disable(&rq->xdp_napi); 904 napi_hash_del(&rq->xdp_napi); 905 } 906 synchronize_net(); 907 908 for (i = 0; i < dev->real_num_rx_queues; i++) { 909 struct veth_rq *rq = &priv->rq[i]; 910 911 netif_napi_del(&rq->xdp_napi); 912 rq->rx_notify_masked = false; 913 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 914 } 915 } 916 917 static int veth_enable_xdp(struct net_device *dev) 918 { 919 struct veth_priv *priv = netdev_priv(dev); 920 int err, i; 921 922 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 923 for (i = 0; i < dev->real_num_rx_queues; i++) { 924 struct veth_rq *rq = &priv->rq[i]; 925 926 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); 927 if (err < 0) 928 goto err_rxq_reg; 929 930 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 931 MEM_TYPE_PAGE_SHARED, 932 NULL); 933 if (err < 0) 934 goto err_reg_mem; 935 936 /* Save original mem info as it can be overwritten */ 937 rq->xdp_mem = rq->xdp_rxq.mem; 938 } 939 940 err = veth_napi_add(dev); 941 if (err) 942 goto err_rxq_reg; 943 } 944 945 for (i = 0; i < dev->real_num_rx_queues; i++) 946 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 947 948 return 0; 949 err_reg_mem: 950 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 951 err_rxq_reg: 952 for (i--; i >= 0; i--) 953 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 954 955 return err; 956 } 957 958 static void veth_disable_xdp(struct net_device *dev) 959 { 960 struct veth_priv *priv = netdev_priv(dev); 961 int i; 962 963 for (i = 0; i < dev->real_num_rx_queues; i++) 964 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 965 veth_napi_del(dev); 966 for (i = 0; i < dev->real_num_rx_queues; i++) { 967 struct veth_rq *rq = &priv->rq[i]; 968 969 rq->xdp_rxq.mem = rq->xdp_mem; 970 xdp_rxq_info_unreg(&rq->xdp_rxq); 971 } 972 } 973 974 static int veth_open(struct net_device *dev) 975 { 976 struct veth_priv *priv = netdev_priv(dev); 977 struct net_device *peer = rtnl_dereference(priv->peer); 978 int err; 979 980 if (!peer) 981 return -ENOTCONN; 982 983 if (priv->_xdp_prog) { 984 err = veth_enable_xdp(dev); 985 if (err) 986 return err; 987 } 988 989 if (peer->flags & IFF_UP) { 990 netif_carrier_on(dev); 991 netif_carrier_on(peer); 992 } 993 994 return 0; 995 } 996 997 static int veth_close(struct net_device *dev) 998 { 999 struct veth_priv *priv = netdev_priv(dev); 1000 struct net_device *peer = rtnl_dereference(priv->peer); 1001 1002 netif_carrier_off(dev); 1003 if (peer) 1004 netif_carrier_off(peer); 1005 1006 if (priv->_xdp_prog) 1007 veth_disable_xdp(dev); 1008 1009 return 0; 1010 } 1011 1012 static int is_valid_veth_mtu(int mtu) 1013 { 1014 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1015 } 1016 1017 static int veth_alloc_queues(struct net_device *dev) 1018 { 1019 struct veth_priv *priv = netdev_priv(dev); 1020 int i; 1021 1022 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); 1023 if (!priv->rq) 1024 return -ENOMEM; 1025 1026 for (i = 0; i < dev->num_rx_queues; i++) { 1027 priv->rq[i].dev = dev; 1028 u64_stats_init(&priv->rq[i].stats.syncp); 1029 } 1030 1031 return 0; 1032 } 1033 1034 static void veth_free_queues(struct net_device *dev) 1035 { 1036 struct veth_priv *priv = netdev_priv(dev); 1037 1038 kfree(priv->rq); 1039 } 1040 1041 static int veth_dev_init(struct net_device *dev) 1042 { 1043 int err; 1044 1045 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 1046 if (!dev->lstats) 1047 return -ENOMEM; 1048 1049 err = veth_alloc_queues(dev); 1050 if (err) { 1051 free_percpu(dev->lstats); 1052 return err; 1053 } 1054 1055 return 0; 1056 } 1057 1058 static void veth_dev_free(struct net_device *dev) 1059 { 1060 veth_free_queues(dev); 1061 free_percpu(dev->lstats); 1062 } 1063 1064 #ifdef CONFIG_NET_POLL_CONTROLLER 1065 static void veth_poll_controller(struct net_device *dev) 1066 { 1067 /* veth only receives frames when its peer sends one 1068 * Since it has nothing to do with disabling irqs, we are guaranteed 1069 * never to have pending data when we poll for it so 1070 * there is nothing to do here. 1071 * 1072 * We need this though so netpoll recognizes us as an interface that 1073 * supports polling, which enables bridge devices in virt setups to 1074 * still use netconsole 1075 */ 1076 } 1077 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1078 1079 static int veth_get_iflink(const struct net_device *dev) 1080 { 1081 struct veth_priv *priv = netdev_priv(dev); 1082 struct net_device *peer; 1083 int iflink; 1084 1085 rcu_read_lock(); 1086 peer = rcu_dereference(priv->peer); 1087 iflink = peer ? peer->ifindex : 0; 1088 rcu_read_unlock(); 1089 1090 return iflink; 1091 } 1092 1093 static netdev_features_t veth_fix_features(struct net_device *dev, 1094 netdev_features_t features) 1095 { 1096 struct veth_priv *priv = netdev_priv(dev); 1097 struct net_device *peer; 1098 1099 peer = rtnl_dereference(priv->peer); 1100 if (peer) { 1101 struct veth_priv *peer_priv = netdev_priv(peer); 1102 1103 if (peer_priv->_xdp_prog) 1104 features &= ~NETIF_F_GSO_SOFTWARE; 1105 } 1106 1107 return features; 1108 } 1109 1110 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1111 { 1112 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1113 struct net_device *peer; 1114 1115 if (new_hr < 0) 1116 new_hr = 0; 1117 1118 rcu_read_lock(); 1119 peer = rcu_dereference(priv->peer); 1120 if (unlikely(!peer)) 1121 goto out; 1122 1123 peer_priv = netdev_priv(peer); 1124 priv->requested_headroom = new_hr; 1125 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1126 dev->needed_headroom = new_hr; 1127 peer->needed_headroom = new_hr; 1128 1129 out: 1130 rcu_read_unlock(); 1131 } 1132 1133 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1134 struct netlink_ext_ack *extack) 1135 { 1136 struct veth_priv *priv = netdev_priv(dev); 1137 struct bpf_prog *old_prog; 1138 struct net_device *peer; 1139 unsigned int max_mtu; 1140 int err; 1141 1142 old_prog = priv->_xdp_prog; 1143 priv->_xdp_prog = prog; 1144 peer = rtnl_dereference(priv->peer); 1145 1146 if (prog) { 1147 if (!peer) { 1148 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1149 err = -ENOTCONN; 1150 goto err; 1151 } 1152 1153 max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - 1154 peer->hard_header_len - 1155 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 1156 if (peer->mtu > max_mtu) { 1157 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1158 err = -ERANGE; 1159 goto err; 1160 } 1161 1162 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1163 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1164 err = -ENOSPC; 1165 goto err; 1166 } 1167 1168 if (dev->flags & IFF_UP) { 1169 err = veth_enable_xdp(dev); 1170 if (err) { 1171 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1172 goto err; 1173 } 1174 } 1175 1176 if (!old_prog) { 1177 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1178 peer->max_mtu = max_mtu; 1179 } 1180 } 1181 1182 if (old_prog) { 1183 if (!prog) { 1184 if (dev->flags & IFF_UP) 1185 veth_disable_xdp(dev); 1186 1187 if (peer) { 1188 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1189 peer->max_mtu = ETH_MAX_MTU; 1190 } 1191 } 1192 bpf_prog_put(old_prog); 1193 } 1194 1195 if ((!!old_prog ^ !!prog) && peer) 1196 netdev_update_features(peer); 1197 1198 return 0; 1199 err: 1200 priv->_xdp_prog = old_prog; 1201 1202 return err; 1203 } 1204 1205 static u32 veth_xdp_query(struct net_device *dev) 1206 { 1207 struct veth_priv *priv = netdev_priv(dev); 1208 const struct bpf_prog *xdp_prog; 1209 1210 xdp_prog = priv->_xdp_prog; 1211 if (xdp_prog) 1212 return xdp_prog->aux->id; 1213 1214 return 0; 1215 } 1216 1217 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1218 { 1219 switch (xdp->command) { 1220 case XDP_SETUP_PROG: 1221 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1222 case XDP_QUERY_PROG: 1223 xdp->prog_id = veth_xdp_query(dev); 1224 return 0; 1225 default: 1226 return -EINVAL; 1227 } 1228 } 1229 1230 static const struct net_device_ops veth_netdev_ops = { 1231 .ndo_init = veth_dev_init, 1232 .ndo_open = veth_open, 1233 .ndo_stop = veth_close, 1234 .ndo_start_xmit = veth_xmit, 1235 .ndo_get_stats64 = veth_get_stats64, 1236 .ndo_set_rx_mode = veth_set_multicast_list, 1237 .ndo_set_mac_address = eth_mac_addr, 1238 #ifdef CONFIG_NET_POLL_CONTROLLER 1239 .ndo_poll_controller = veth_poll_controller, 1240 #endif 1241 .ndo_get_iflink = veth_get_iflink, 1242 .ndo_fix_features = veth_fix_features, 1243 .ndo_features_check = passthru_features_check, 1244 .ndo_set_rx_headroom = veth_set_rx_headroom, 1245 .ndo_bpf = veth_xdp, 1246 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1247 }; 1248 1249 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1250 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1251 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1252 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1253 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1254 1255 static void veth_setup(struct net_device *dev) 1256 { 1257 ether_setup(dev); 1258 1259 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1260 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1261 dev->priv_flags |= IFF_NO_QUEUE; 1262 dev->priv_flags |= IFF_PHONY_HEADROOM; 1263 1264 dev->netdev_ops = &veth_netdev_ops; 1265 dev->ethtool_ops = &veth_ethtool_ops; 1266 dev->features |= NETIF_F_LLTX; 1267 dev->features |= VETH_FEATURES; 1268 dev->vlan_features = dev->features & 1269 ~(NETIF_F_HW_VLAN_CTAG_TX | 1270 NETIF_F_HW_VLAN_STAG_TX | 1271 NETIF_F_HW_VLAN_CTAG_RX | 1272 NETIF_F_HW_VLAN_STAG_RX); 1273 dev->needs_free_netdev = true; 1274 dev->priv_destructor = veth_dev_free; 1275 dev->max_mtu = ETH_MAX_MTU; 1276 1277 dev->hw_features = VETH_FEATURES; 1278 dev->hw_enc_features = VETH_FEATURES; 1279 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1280 } 1281 1282 /* 1283 * netlink interface 1284 */ 1285 1286 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1287 struct netlink_ext_ack *extack) 1288 { 1289 if (tb[IFLA_ADDRESS]) { 1290 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1291 return -EINVAL; 1292 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1293 return -EADDRNOTAVAIL; 1294 } 1295 if (tb[IFLA_MTU]) { 1296 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1297 return -EINVAL; 1298 } 1299 return 0; 1300 } 1301 1302 static struct rtnl_link_ops veth_link_ops; 1303 1304 static int veth_newlink(struct net *src_net, struct net_device *dev, 1305 struct nlattr *tb[], struct nlattr *data[], 1306 struct netlink_ext_ack *extack) 1307 { 1308 int err; 1309 struct net_device *peer; 1310 struct veth_priv *priv; 1311 char ifname[IFNAMSIZ]; 1312 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1313 unsigned char name_assign_type; 1314 struct ifinfomsg *ifmp; 1315 struct net *net; 1316 1317 /* 1318 * create and register peer first 1319 */ 1320 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1321 struct nlattr *nla_peer; 1322 1323 nla_peer = data[VETH_INFO_PEER]; 1324 ifmp = nla_data(nla_peer); 1325 err = rtnl_nla_parse_ifla(peer_tb, 1326 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1327 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1328 NULL); 1329 if (err < 0) 1330 return err; 1331 1332 err = veth_validate(peer_tb, NULL, extack); 1333 if (err < 0) 1334 return err; 1335 1336 tbp = peer_tb; 1337 } else { 1338 ifmp = NULL; 1339 tbp = tb; 1340 } 1341 1342 if (ifmp && tbp[IFLA_IFNAME]) { 1343 nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1344 name_assign_type = NET_NAME_USER; 1345 } else { 1346 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1347 name_assign_type = NET_NAME_ENUM; 1348 } 1349 1350 net = rtnl_link_get_net(src_net, tbp); 1351 if (IS_ERR(net)) 1352 return PTR_ERR(net); 1353 1354 peer = rtnl_create_link(net, ifname, name_assign_type, 1355 &veth_link_ops, tbp, extack); 1356 if (IS_ERR(peer)) { 1357 put_net(net); 1358 return PTR_ERR(peer); 1359 } 1360 1361 if (!ifmp || !tbp[IFLA_ADDRESS]) 1362 eth_hw_addr_random(peer); 1363 1364 if (ifmp && (dev->ifindex != 0)) 1365 peer->ifindex = ifmp->ifi_index; 1366 1367 peer->gso_max_size = dev->gso_max_size; 1368 peer->gso_max_segs = dev->gso_max_segs; 1369 1370 err = register_netdevice(peer); 1371 put_net(net); 1372 net = NULL; 1373 if (err < 0) 1374 goto err_register_peer; 1375 1376 netif_carrier_off(peer); 1377 1378 err = rtnl_configure_link(peer, ifmp); 1379 if (err < 0) 1380 goto err_configure_peer; 1381 1382 /* 1383 * register dev last 1384 * 1385 * note, that since we've registered new device the dev's name 1386 * should be re-allocated 1387 */ 1388 1389 if (tb[IFLA_ADDRESS] == NULL) 1390 eth_hw_addr_random(dev); 1391 1392 if (tb[IFLA_IFNAME]) 1393 nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1394 else 1395 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1396 1397 err = register_netdevice(dev); 1398 if (err < 0) 1399 goto err_register_dev; 1400 1401 netif_carrier_off(dev); 1402 1403 /* 1404 * tie the deviced together 1405 */ 1406 1407 priv = netdev_priv(dev); 1408 rcu_assign_pointer(priv->peer, peer); 1409 1410 priv = netdev_priv(peer); 1411 rcu_assign_pointer(priv->peer, dev); 1412 1413 return 0; 1414 1415 err_register_dev: 1416 /* nothing to do */ 1417 err_configure_peer: 1418 unregister_netdevice(peer); 1419 return err; 1420 1421 err_register_peer: 1422 free_netdev(peer); 1423 return err; 1424 } 1425 1426 static void veth_dellink(struct net_device *dev, struct list_head *head) 1427 { 1428 struct veth_priv *priv; 1429 struct net_device *peer; 1430 1431 priv = netdev_priv(dev); 1432 peer = rtnl_dereference(priv->peer); 1433 1434 /* Note : dellink() is called from default_device_exit_batch(), 1435 * before a rcu_synchronize() point. The devices are guaranteed 1436 * not being freed before one RCU grace period. 1437 */ 1438 RCU_INIT_POINTER(priv->peer, NULL); 1439 unregister_netdevice_queue(dev, head); 1440 1441 if (peer) { 1442 priv = netdev_priv(peer); 1443 RCU_INIT_POINTER(priv->peer, NULL); 1444 unregister_netdevice_queue(peer, head); 1445 } 1446 } 1447 1448 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1449 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1450 }; 1451 1452 static struct net *veth_get_link_net(const struct net_device *dev) 1453 { 1454 struct veth_priv *priv = netdev_priv(dev); 1455 struct net_device *peer = rtnl_dereference(priv->peer); 1456 1457 return peer ? dev_net(peer) : dev_net(dev); 1458 } 1459 1460 static struct rtnl_link_ops veth_link_ops = { 1461 .kind = DRV_NAME, 1462 .priv_size = sizeof(struct veth_priv), 1463 .setup = veth_setup, 1464 .validate = veth_validate, 1465 .newlink = veth_newlink, 1466 .dellink = veth_dellink, 1467 .policy = veth_policy, 1468 .maxtype = VETH_INFO_MAX, 1469 .get_link_net = veth_get_link_net, 1470 }; 1471 1472 /* 1473 * init/fini 1474 */ 1475 1476 static __init int veth_init(void) 1477 { 1478 return rtnl_link_register(&veth_link_ops); 1479 } 1480 1481 static __exit void veth_exit(void) 1482 { 1483 rtnl_link_unregister(&veth_link_ops); 1484 } 1485 1486 module_init(veth_init); 1487 module_exit(veth_exit); 1488 1489 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1490 MODULE_LICENSE("GPL v2"); 1491 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1492