1 /* 2 * vrf.c: device driver to encapsulate a VRF space 3 * 4 * Copyright (c) 2015 Cumulus Networks. All rights reserved. 5 * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> 6 * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> 7 * 8 * Based on dummy, team and ipvlan drivers 9 * 10 * This program is free software; you can redistribute it and/or modify 11 * it under the terms of the GNU General Public License as published by 12 * the Free Software Foundation; either version 2 of the License, or 13 * (at your option) any later version. 14 */ 15 16 #include <linux/module.h> 17 #include <linux/kernel.h> 18 #include <linux/netdevice.h> 19 #include <linux/etherdevice.h> 20 #include <linux/ip.h> 21 #include <linux/init.h> 22 #include <linux/moduleparam.h> 23 #include <linux/netfilter.h> 24 #include <linux/rtnetlink.h> 25 #include <net/rtnetlink.h> 26 #include <linux/u64_stats_sync.h> 27 #include <linux/hashtable.h> 28 29 #include <linux/inetdevice.h> 30 #include <net/arp.h> 31 #include <net/ip.h> 32 #include <net/ip_fib.h> 33 #include <net/ip6_fib.h> 34 #include <net/ip6_route.h> 35 #include <net/rtnetlink.h> 36 #include <net/route.h> 37 #include <net/addrconf.h> 38 #include <net/l3mdev.h> 39 40 #define RT_FL_TOS(oldflp4) \ 41 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) 42 43 #define DRV_NAME "vrf" 44 #define DRV_VERSION "1.0" 45 46 #define vrf_master_get_rcu(dev) \ 47 ((struct net_device *)rcu_dereference(dev->rx_handler_data)) 48 49 struct net_vrf { 50 struct rtable *rth; 51 struct rt6_info *rt6; 52 u32 tb_id; 53 }; 54 55 struct pcpu_dstats { 56 u64 tx_pkts; 57 u64 tx_bytes; 58 u64 tx_drps; 59 u64 rx_pkts; 60 u64 rx_bytes; 61 struct u64_stats_sync syncp; 62 }; 63 64 static struct dst_entry *vrf_ip_check(struct dst_entry *dst, u32 cookie) 65 { 66 return dst; 67 } 68 69 static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) 70 { 71 return ip_local_out(net, sk, skb); 72 } 73 74 static unsigned int vrf_v4_mtu(const struct dst_entry *dst) 75 { 76 /* TO-DO: return max ethernet size? */ 77 return dst->dev->mtu; 78 } 79 80 static void vrf_dst_destroy(struct dst_entry *dst) 81 { 82 /* our dst lives forever - or until the device is closed */ 83 } 84 85 static unsigned int vrf_default_advmss(const struct dst_entry *dst) 86 { 87 return 65535 - 40; 88 } 89 90 static struct dst_ops vrf_dst_ops = { 91 .family = AF_INET, 92 .local_out = vrf_ip_local_out, 93 .check = vrf_ip_check, 94 .mtu = vrf_v4_mtu, 95 .destroy = vrf_dst_destroy, 96 .default_advmss = vrf_default_advmss, 97 }; 98 99 /* neighbor handling is done with actual device; do not want 100 * to flip skb->dev for those ndisc packets. This really fails 101 * for multiple next protocols (e.g., NEXTHDR_HOP). But it is 102 * a start. 103 */ 104 #if IS_ENABLED(CONFIG_IPV6) 105 static bool check_ipv6_frame(const struct sk_buff *skb) 106 { 107 const struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb->data; 108 size_t hlen = sizeof(*ipv6h); 109 bool rc = true; 110 111 if (skb->len < hlen) 112 goto out; 113 114 if (ipv6h->nexthdr == NEXTHDR_ICMP) { 115 const struct icmp6hdr *icmph; 116 117 if (skb->len < hlen + sizeof(*icmph)) 118 goto out; 119 120 icmph = (struct icmp6hdr *)(skb->data + sizeof(*ipv6h)); 121 switch (icmph->icmp6_type) { 122 case NDISC_ROUTER_SOLICITATION: 123 case NDISC_ROUTER_ADVERTISEMENT: 124 case NDISC_NEIGHBOUR_SOLICITATION: 125 case NDISC_NEIGHBOUR_ADVERTISEMENT: 126 case NDISC_REDIRECT: 127 rc = false; 128 break; 129 } 130 } 131 132 out: 133 return rc; 134 } 135 #else 136 static bool check_ipv6_frame(const struct sk_buff *skb) 137 { 138 return false; 139 } 140 #endif 141 142 static bool is_ip_rx_frame(struct sk_buff *skb) 143 { 144 switch (skb->protocol) { 145 case htons(ETH_P_IP): 146 return true; 147 case htons(ETH_P_IPV6): 148 return check_ipv6_frame(skb); 149 } 150 return false; 151 } 152 153 static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) 154 { 155 vrf_dev->stats.tx_errors++; 156 kfree_skb(skb); 157 } 158 159 /* note: already called with rcu_read_lock */ 160 static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb) 161 { 162 struct sk_buff *skb = *pskb; 163 164 if (is_ip_rx_frame(skb)) { 165 struct net_device *dev = vrf_master_get_rcu(skb->dev); 166 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); 167 168 u64_stats_update_begin(&dstats->syncp); 169 dstats->rx_pkts++; 170 dstats->rx_bytes += skb->len; 171 u64_stats_update_end(&dstats->syncp); 172 173 skb->dev = dev; 174 175 return RX_HANDLER_ANOTHER; 176 } 177 return RX_HANDLER_PASS; 178 } 179 180 static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev, 181 struct rtnl_link_stats64 *stats) 182 { 183 int i; 184 185 for_each_possible_cpu(i) { 186 const struct pcpu_dstats *dstats; 187 u64 tbytes, tpkts, tdrops, rbytes, rpkts; 188 unsigned int start; 189 190 dstats = per_cpu_ptr(dev->dstats, i); 191 do { 192 start = u64_stats_fetch_begin_irq(&dstats->syncp); 193 tbytes = dstats->tx_bytes; 194 tpkts = dstats->tx_pkts; 195 tdrops = dstats->tx_drps; 196 rbytes = dstats->rx_bytes; 197 rpkts = dstats->rx_pkts; 198 } while (u64_stats_fetch_retry_irq(&dstats->syncp, start)); 199 stats->tx_bytes += tbytes; 200 stats->tx_packets += tpkts; 201 stats->tx_dropped += tdrops; 202 stats->rx_bytes += rbytes; 203 stats->rx_packets += rpkts; 204 } 205 return stats; 206 } 207 208 #if IS_ENABLED(CONFIG_IPV6) 209 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, 210 struct net_device *dev) 211 { 212 const struct ipv6hdr *iph = ipv6_hdr(skb); 213 struct net *net = dev_net(skb->dev); 214 struct flowi6 fl6 = { 215 /* needed to match OIF rule */ 216 .flowi6_oif = dev->ifindex, 217 .flowi6_iif = LOOPBACK_IFINDEX, 218 .daddr = iph->daddr, 219 .saddr = iph->saddr, 220 .flowlabel = ip6_flowinfo(iph), 221 .flowi6_mark = skb->mark, 222 .flowi6_proto = iph->nexthdr, 223 .flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF, 224 }; 225 int ret = NET_XMIT_DROP; 226 struct dst_entry *dst; 227 struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; 228 229 dst = ip6_route_output(net, NULL, &fl6); 230 if (dst == dst_null) 231 goto err; 232 233 skb_dst_drop(skb); 234 skb_dst_set(skb, dst); 235 236 ret = ip6_local_out(net, skb->sk, skb); 237 if (unlikely(net_xmit_eval(ret))) 238 dev->stats.tx_errors++; 239 else 240 ret = NET_XMIT_SUCCESS; 241 242 return ret; 243 err: 244 vrf_tx_error(dev, skb); 245 return NET_XMIT_DROP; 246 } 247 #else 248 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, 249 struct net_device *dev) 250 { 251 vrf_tx_error(dev, skb); 252 return NET_XMIT_DROP; 253 } 254 #endif 255 256 static int vrf_send_v4_prep(struct sk_buff *skb, struct flowi4 *fl4, 257 struct net_device *vrf_dev) 258 { 259 struct rtable *rt; 260 int err = 1; 261 262 rt = ip_route_output_flow(dev_net(vrf_dev), fl4, NULL); 263 if (IS_ERR(rt)) 264 goto out; 265 266 /* TO-DO: what about broadcast ? */ 267 if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { 268 ip_rt_put(rt); 269 goto out; 270 } 271 272 skb_dst_drop(skb); 273 skb_dst_set(skb, &rt->dst); 274 err = 0; 275 out: 276 return err; 277 } 278 279 static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, 280 struct net_device *vrf_dev) 281 { 282 struct iphdr *ip4h = ip_hdr(skb); 283 int ret = NET_XMIT_DROP; 284 struct flowi4 fl4 = { 285 /* needed to match OIF rule */ 286 .flowi4_oif = vrf_dev->ifindex, 287 .flowi4_iif = LOOPBACK_IFINDEX, 288 .flowi4_tos = RT_TOS(ip4h->tos), 289 .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_L3MDEV_SRC | 290 FLOWI_FLAG_SKIP_NH_OIF, 291 .daddr = ip4h->daddr, 292 }; 293 294 if (vrf_send_v4_prep(skb, &fl4, vrf_dev)) 295 goto err; 296 297 if (!ip4h->saddr) { 298 ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, 299 RT_SCOPE_LINK); 300 } 301 302 ret = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); 303 if (unlikely(net_xmit_eval(ret))) 304 vrf_dev->stats.tx_errors++; 305 else 306 ret = NET_XMIT_SUCCESS; 307 308 out: 309 return ret; 310 err: 311 vrf_tx_error(vrf_dev, skb); 312 goto out; 313 } 314 315 static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) 316 { 317 /* strip the ethernet header added for pass through VRF device */ 318 __skb_pull(skb, skb_network_offset(skb)); 319 320 switch (skb->protocol) { 321 case htons(ETH_P_IP): 322 return vrf_process_v4_outbound(skb, dev); 323 case htons(ETH_P_IPV6): 324 return vrf_process_v6_outbound(skb, dev); 325 default: 326 vrf_tx_error(dev, skb); 327 return NET_XMIT_DROP; 328 } 329 } 330 331 static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) 332 { 333 netdev_tx_t ret = is_ip_tx_frame(skb, dev); 334 335 if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { 336 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); 337 338 u64_stats_update_begin(&dstats->syncp); 339 dstats->tx_pkts++; 340 dstats->tx_bytes += skb->len; 341 u64_stats_update_end(&dstats->syncp); 342 } else { 343 this_cpu_inc(dev->dstats->tx_drps); 344 } 345 346 return ret; 347 } 348 349 #if IS_ENABLED(CONFIG_IPV6) 350 static struct dst_entry *vrf_ip6_check(struct dst_entry *dst, u32 cookie) 351 { 352 return dst; 353 } 354 355 static struct dst_ops vrf_dst_ops6 = { 356 .family = AF_INET6, 357 .local_out = ip6_local_out, 358 .check = vrf_ip6_check, 359 .mtu = vrf_v4_mtu, 360 .destroy = vrf_dst_destroy, 361 .default_advmss = vrf_default_advmss, 362 }; 363 364 static int init_dst_ops6_kmem_cachep(void) 365 { 366 vrf_dst_ops6.kmem_cachep = kmem_cache_create("vrf_ip6_dst_cache", 367 sizeof(struct rt6_info), 368 0, 369 SLAB_HWCACHE_ALIGN, 370 NULL); 371 372 if (!vrf_dst_ops6.kmem_cachep) 373 return -ENOMEM; 374 375 return 0; 376 } 377 378 static void free_dst_ops6_kmem_cachep(void) 379 { 380 kmem_cache_destroy(vrf_dst_ops6.kmem_cachep); 381 } 382 383 static int vrf_input6(struct sk_buff *skb) 384 { 385 skb->dev->stats.rx_errors++; 386 kfree_skb(skb); 387 return 0; 388 } 389 390 /* modelled after ip6_finish_output2 */ 391 static int vrf_finish_output6(struct net *net, struct sock *sk, 392 struct sk_buff *skb) 393 { 394 struct dst_entry *dst = skb_dst(skb); 395 struct net_device *dev = dst->dev; 396 struct neighbour *neigh; 397 struct in6_addr *nexthop; 398 int ret; 399 400 skb->protocol = htons(ETH_P_IPV6); 401 skb->dev = dev; 402 403 rcu_read_lock_bh(); 404 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); 405 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); 406 if (unlikely(!neigh)) 407 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); 408 if (!IS_ERR(neigh)) { 409 ret = dst_neigh_output(dst, neigh, skb); 410 rcu_read_unlock_bh(); 411 return ret; 412 } 413 rcu_read_unlock_bh(); 414 415 IP6_INC_STATS(dev_net(dst->dev), 416 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); 417 kfree_skb(skb); 418 return -EINVAL; 419 } 420 421 /* modelled after ip6_output */ 422 static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) 423 { 424 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 425 net, sk, skb, NULL, skb_dst(skb)->dev, 426 vrf_finish_output6, 427 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 428 } 429 430 static void vrf_rt6_destroy(struct net_vrf *vrf) 431 { 432 dst_destroy(&vrf->rt6->dst); 433 free_percpu(vrf->rt6->rt6i_pcpu); 434 vrf->rt6 = NULL; 435 } 436 437 static int vrf_rt6_create(struct net_device *dev) 438 { 439 struct net_vrf *vrf = netdev_priv(dev); 440 struct dst_entry *dst; 441 struct rt6_info *rt6; 442 int cpu; 443 int rc = -ENOMEM; 444 445 rt6 = dst_alloc(&vrf_dst_ops6, dev, 0, 446 DST_OBSOLETE_NONE, 447 (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); 448 if (!rt6) 449 goto out; 450 451 dst = &rt6->dst; 452 453 rt6->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL); 454 if (!rt6->rt6i_pcpu) { 455 dst_destroy(dst); 456 goto out; 457 } 458 for_each_possible_cpu(cpu) { 459 struct rt6_info **p = per_cpu_ptr(rt6->rt6i_pcpu, cpu); 460 *p = NULL; 461 } 462 463 memset(dst + 1, 0, sizeof(*rt6) - sizeof(*dst)); 464 465 INIT_LIST_HEAD(&rt6->rt6i_siblings); 466 INIT_LIST_HEAD(&rt6->rt6i_uncached); 467 468 rt6->dst.input = vrf_input6; 469 rt6->dst.output = vrf_output6; 470 471 rt6->rt6i_table = fib6_get_table(dev_net(dev), vrf->tb_id); 472 473 atomic_set(&rt6->dst.__refcnt, 2); 474 475 vrf->rt6 = rt6; 476 rc = 0; 477 out: 478 return rc; 479 } 480 #else 481 static int init_dst_ops6_kmem_cachep(void) 482 { 483 return 0; 484 } 485 486 static void free_dst_ops6_kmem_cachep(void) 487 { 488 } 489 490 static void vrf_rt6_destroy(struct net_vrf *vrf) 491 { 492 } 493 494 static int vrf_rt6_create(struct net_device *dev) 495 { 496 return 0; 497 } 498 #endif 499 500 /* modelled after ip_finish_output2 */ 501 static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 502 { 503 struct dst_entry *dst = skb_dst(skb); 504 struct rtable *rt = (struct rtable *)dst; 505 struct net_device *dev = dst->dev; 506 unsigned int hh_len = LL_RESERVED_SPACE(dev); 507 struct neighbour *neigh; 508 u32 nexthop; 509 int ret = -EINVAL; 510 511 /* Be paranoid, rather than too clever. */ 512 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { 513 struct sk_buff *skb2; 514 515 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); 516 if (!skb2) { 517 ret = -ENOMEM; 518 goto err; 519 } 520 if (skb->sk) 521 skb_set_owner_w(skb2, skb->sk); 522 523 consume_skb(skb); 524 skb = skb2; 525 } 526 527 rcu_read_lock_bh(); 528 529 nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr); 530 neigh = __ipv4_neigh_lookup_noref(dev, nexthop); 531 if (unlikely(!neigh)) 532 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); 533 if (!IS_ERR(neigh)) 534 ret = dst_neigh_output(dst, neigh, skb); 535 536 rcu_read_unlock_bh(); 537 err: 538 if (unlikely(ret < 0)) 539 vrf_tx_error(skb->dev, skb); 540 return ret; 541 } 542 543 static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) 544 { 545 struct net_device *dev = skb_dst(skb)->dev; 546 547 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); 548 549 skb->dev = dev; 550 skb->protocol = htons(ETH_P_IP); 551 552 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, 553 net, sk, skb, NULL, dev, 554 vrf_finish_output, 555 !(IPCB(skb)->flags & IPSKB_REROUTED)); 556 } 557 558 static void vrf_rtable_destroy(struct net_vrf *vrf) 559 { 560 struct dst_entry *dst = (struct dst_entry *)vrf->rth; 561 562 dst_destroy(dst); 563 vrf->rth = NULL; 564 } 565 566 static struct rtable *vrf_rtable_create(struct net_device *dev) 567 { 568 struct net_vrf *vrf = netdev_priv(dev); 569 struct rtable *rth; 570 571 rth = dst_alloc(&vrf_dst_ops, dev, 2, 572 DST_OBSOLETE_NONE, 573 (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); 574 if (rth) { 575 rth->dst.output = vrf_output; 576 rth->rt_genid = rt_genid_ipv4(dev_net(dev)); 577 rth->rt_flags = 0; 578 rth->rt_type = RTN_UNICAST; 579 rth->rt_is_input = 0; 580 rth->rt_iif = 0; 581 rth->rt_pmtu = 0; 582 rth->rt_gateway = 0; 583 rth->rt_uses_gateway = 0; 584 rth->rt_table_id = vrf->tb_id; 585 INIT_LIST_HEAD(&rth->rt_uncached); 586 rth->rt_uncached_list = NULL; 587 } 588 589 return rth; 590 } 591 592 /**************************** device handling ********************/ 593 594 /* cycle interface to flush neighbor cache and move routes across tables */ 595 static void cycle_netdev(struct net_device *dev) 596 { 597 unsigned int flags = dev->flags; 598 int ret; 599 600 if (!netif_running(dev)) 601 return; 602 603 ret = dev_change_flags(dev, flags & ~IFF_UP); 604 if (ret >= 0) 605 ret = dev_change_flags(dev, flags); 606 607 if (ret < 0) { 608 netdev_err(dev, 609 "Failed to cycle device %s; route tables might be wrong!\n", 610 dev->name); 611 } 612 } 613 614 static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) 615 { 616 int ret; 617 618 /* register the packet handler for slave ports */ 619 ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev); 620 if (ret) { 621 netdev_err(port_dev, 622 "Device %s failed to register rx_handler\n", 623 port_dev->name); 624 goto out_fail; 625 } 626 627 ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL); 628 if (ret < 0) 629 goto out_unregister; 630 631 port_dev->priv_flags |= IFF_L3MDEV_SLAVE; 632 cycle_netdev(port_dev); 633 634 return 0; 635 636 out_unregister: 637 netdev_rx_handler_unregister(port_dev); 638 out_fail: 639 return ret; 640 } 641 642 static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev) 643 { 644 if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev)) 645 return -EINVAL; 646 647 return do_vrf_add_slave(dev, port_dev); 648 } 649 650 /* inverse of do_vrf_add_slave */ 651 static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) 652 { 653 netdev_upper_dev_unlink(port_dev, dev); 654 port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; 655 656 netdev_rx_handler_unregister(port_dev); 657 658 cycle_netdev(port_dev); 659 660 return 0; 661 } 662 663 static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) 664 { 665 return do_vrf_del_slave(dev, port_dev); 666 } 667 668 static void vrf_dev_uninit(struct net_device *dev) 669 { 670 struct net_vrf *vrf = netdev_priv(dev); 671 struct net_device *port_dev; 672 struct list_head *iter; 673 674 vrf_rtable_destroy(vrf); 675 vrf_rt6_destroy(vrf); 676 677 netdev_for_each_lower_dev(dev, port_dev, iter) 678 vrf_del_slave(dev, port_dev); 679 680 free_percpu(dev->dstats); 681 dev->dstats = NULL; 682 } 683 684 static int vrf_dev_init(struct net_device *dev) 685 { 686 struct net_vrf *vrf = netdev_priv(dev); 687 688 dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); 689 if (!dev->dstats) 690 goto out_nomem; 691 692 /* create the default dst which points back to us */ 693 vrf->rth = vrf_rtable_create(dev); 694 if (!vrf->rth) 695 goto out_stats; 696 697 if (vrf_rt6_create(dev) != 0) 698 goto out_rth; 699 700 dev->flags = IFF_MASTER | IFF_NOARP; 701 702 return 0; 703 704 out_rth: 705 vrf_rtable_destroy(vrf); 706 out_stats: 707 free_percpu(dev->dstats); 708 dev->dstats = NULL; 709 out_nomem: 710 return -ENOMEM; 711 } 712 713 static const struct net_device_ops vrf_netdev_ops = { 714 .ndo_init = vrf_dev_init, 715 .ndo_uninit = vrf_dev_uninit, 716 .ndo_start_xmit = vrf_xmit, 717 .ndo_get_stats64 = vrf_get_stats64, 718 .ndo_add_slave = vrf_add_slave, 719 .ndo_del_slave = vrf_del_slave, 720 }; 721 722 static u32 vrf_fib_table(const struct net_device *dev) 723 { 724 struct net_vrf *vrf = netdev_priv(dev); 725 726 return vrf->tb_id; 727 } 728 729 static struct rtable *vrf_get_rtable(const struct net_device *dev, 730 const struct flowi4 *fl4) 731 { 732 struct rtable *rth = NULL; 733 734 if (!(fl4->flowi4_flags & FLOWI_FLAG_L3MDEV_SRC)) { 735 struct net_vrf *vrf = netdev_priv(dev); 736 737 rth = vrf->rth; 738 atomic_inc(&rth->dst.__refcnt); 739 } 740 741 return rth; 742 } 743 744 /* called under rcu_read_lock */ 745 static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4) 746 { 747 struct fib_result res = { .tclassid = 0 }; 748 struct net *net = dev_net(dev); 749 u32 orig_tos = fl4->flowi4_tos; 750 u8 flags = fl4->flowi4_flags; 751 u8 scope = fl4->flowi4_scope; 752 u8 tos = RT_FL_TOS(fl4); 753 int rc; 754 755 if (unlikely(!fl4->daddr)) 756 return 0; 757 758 fl4->flowi4_flags |= FLOWI_FLAG_SKIP_NH_OIF; 759 fl4->flowi4_iif = LOOPBACK_IFINDEX; 760 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 761 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 762 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 763 764 rc = fib_lookup(net, fl4, &res, 0); 765 if (!rc) { 766 if (res.type == RTN_LOCAL) 767 fl4->saddr = res.fi->fib_prefsrc ? : fl4->daddr; 768 else 769 fib_select_path(net, &res, fl4, -1); 770 } 771 772 fl4->flowi4_flags = flags; 773 fl4->flowi4_tos = orig_tos; 774 fl4->flowi4_scope = scope; 775 776 return rc; 777 } 778 779 #if IS_ENABLED(CONFIG_IPV6) 780 static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, 781 const struct flowi6 *fl6) 782 { 783 struct rt6_info *rt = NULL; 784 785 if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) { 786 struct net_vrf *vrf = netdev_priv(dev); 787 788 rt = vrf->rt6; 789 atomic_inc(&rt->dst.__refcnt); 790 } 791 792 return (struct dst_entry *)rt; 793 } 794 #endif 795 796 static const struct l3mdev_ops vrf_l3mdev_ops = { 797 .l3mdev_fib_table = vrf_fib_table, 798 .l3mdev_get_rtable = vrf_get_rtable, 799 .l3mdev_get_saddr = vrf_get_saddr, 800 #if IS_ENABLED(CONFIG_IPV6) 801 .l3mdev_get_rt6_dst = vrf_get_rt6_dst, 802 #endif 803 }; 804 805 static void vrf_get_drvinfo(struct net_device *dev, 806 struct ethtool_drvinfo *info) 807 { 808 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 809 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 810 } 811 812 static const struct ethtool_ops vrf_ethtool_ops = { 813 .get_drvinfo = vrf_get_drvinfo, 814 }; 815 816 static void vrf_setup(struct net_device *dev) 817 { 818 ether_setup(dev); 819 820 /* Initialize the device structure. */ 821 dev->netdev_ops = &vrf_netdev_ops; 822 dev->l3mdev_ops = &vrf_l3mdev_ops; 823 dev->ethtool_ops = &vrf_ethtool_ops; 824 dev->destructor = free_netdev; 825 826 /* Fill in device structure with ethernet-generic values. */ 827 eth_hw_addr_random(dev); 828 829 /* don't acquire vrf device's netif_tx_lock when transmitting */ 830 dev->features |= NETIF_F_LLTX; 831 832 /* don't allow vrf devices to change network namespaces. */ 833 dev->features |= NETIF_F_NETNS_LOCAL; 834 } 835 836 static int vrf_validate(struct nlattr *tb[], struct nlattr *data[]) 837 { 838 if (tb[IFLA_ADDRESS]) { 839 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 840 return -EINVAL; 841 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 842 return -EADDRNOTAVAIL; 843 } 844 return 0; 845 } 846 847 static void vrf_dellink(struct net_device *dev, struct list_head *head) 848 { 849 unregister_netdevice_queue(dev, head); 850 } 851 852 static int vrf_newlink(struct net *src_net, struct net_device *dev, 853 struct nlattr *tb[], struct nlattr *data[]) 854 { 855 struct net_vrf *vrf = netdev_priv(dev); 856 857 if (!data || !data[IFLA_VRF_TABLE]) 858 return -EINVAL; 859 860 vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]); 861 862 dev->priv_flags |= IFF_L3MDEV_MASTER; 863 864 return register_netdevice(dev); 865 } 866 867 static size_t vrf_nl_getsize(const struct net_device *dev) 868 { 869 return nla_total_size(sizeof(u32)); /* IFLA_VRF_TABLE */ 870 } 871 872 static int vrf_fillinfo(struct sk_buff *skb, 873 const struct net_device *dev) 874 { 875 struct net_vrf *vrf = netdev_priv(dev); 876 877 return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id); 878 } 879 880 static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = { 881 [IFLA_VRF_TABLE] = { .type = NLA_U32 }, 882 }; 883 884 static struct rtnl_link_ops vrf_link_ops __read_mostly = { 885 .kind = DRV_NAME, 886 .priv_size = sizeof(struct net_vrf), 887 888 .get_size = vrf_nl_getsize, 889 .policy = vrf_nl_policy, 890 .validate = vrf_validate, 891 .fill_info = vrf_fillinfo, 892 893 .newlink = vrf_newlink, 894 .dellink = vrf_dellink, 895 .setup = vrf_setup, 896 .maxtype = IFLA_VRF_MAX, 897 }; 898 899 static int vrf_device_event(struct notifier_block *unused, 900 unsigned long event, void *ptr) 901 { 902 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 903 904 /* only care about unregister events to drop slave references */ 905 if (event == NETDEV_UNREGISTER) { 906 struct net_device *vrf_dev; 907 908 if (!netif_is_l3_slave(dev)) 909 goto out; 910 911 vrf_dev = netdev_master_upper_dev_get(dev); 912 vrf_del_slave(vrf_dev, dev); 913 } 914 out: 915 return NOTIFY_DONE; 916 } 917 918 static struct notifier_block vrf_notifier_block __read_mostly = { 919 .notifier_call = vrf_device_event, 920 }; 921 922 static int __init vrf_init_module(void) 923 { 924 int rc; 925 926 vrf_dst_ops.kmem_cachep = 927 kmem_cache_create("vrf_ip_dst_cache", 928 sizeof(struct rtable), 0, 929 SLAB_HWCACHE_ALIGN, 930 NULL); 931 932 if (!vrf_dst_ops.kmem_cachep) 933 return -ENOMEM; 934 935 rc = init_dst_ops6_kmem_cachep(); 936 if (rc != 0) 937 goto error2; 938 939 register_netdevice_notifier(&vrf_notifier_block); 940 941 rc = rtnl_link_register(&vrf_link_ops); 942 if (rc < 0) 943 goto error; 944 945 return 0; 946 947 error: 948 unregister_netdevice_notifier(&vrf_notifier_block); 949 free_dst_ops6_kmem_cachep(); 950 error2: 951 kmem_cache_destroy(vrf_dst_ops.kmem_cachep); 952 return rc; 953 } 954 955 static void __exit vrf_cleanup_module(void) 956 { 957 rtnl_link_unregister(&vrf_link_ops); 958 unregister_netdevice_notifier(&vrf_notifier_block); 959 kmem_cache_destroy(vrf_dst_ops.kmem_cachep); 960 free_dst_ops6_kmem_cachep(); 961 } 962 963 module_init(vrf_init_module); 964 module_exit(vrf_cleanup_module); 965 MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern"); 966 MODULE_DESCRIPTION("Device driver to instantiate VRF domains"); 967 MODULE_LICENSE("GPL"); 968 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 969 MODULE_VERSION(DRV_VERSION); 970