1 /* 2 * vrf.c: device driver to encapsulate a VRF space 3 * 4 * Copyright (c) 2015 Cumulus Networks. All rights reserved. 5 * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> 6 * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> 7 * 8 * Based on dummy, team and ipvlan drivers 9 * 10 * This program is free software; you can redistribute it and/or modify 11 * it under the terms of the GNU General Public License as published by 12 * the Free Software Foundation; either version 2 of the License, or 13 * (at your option) any later version. 14 */ 15 16 #include <linux/module.h> 17 #include <linux/kernel.h> 18 #include <linux/netdevice.h> 19 #include <linux/etherdevice.h> 20 #include <linux/ip.h> 21 #include <linux/init.h> 22 #include <linux/moduleparam.h> 23 #include <linux/netfilter.h> 24 #include <linux/rtnetlink.h> 25 #include <net/rtnetlink.h> 26 #include <linux/u64_stats_sync.h> 27 #include <linux/hashtable.h> 28 29 #include <linux/inetdevice.h> 30 #include <net/arp.h> 31 #include <net/ip.h> 32 #include <net/ip_fib.h> 33 #include <net/ip6_fib.h> 34 #include <net/ip6_route.h> 35 #include <net/rtnetlink.h> 36 #include <net/route.h> 37 #include <net/addrconf.h> 38 #include <net/l3mdev.h> 39 40 #define RT_FL_TOS(oldflp4) \ 41 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) 42 43 #define DRV_NAME "vrf" 44 #define DRV_VERSION "1.0" 45 46 #define vrf_master_get_rcu(dev) \ 47 ((struct net_device *)rcu_dereference(dev->rx_handler_data)) 48 49 struct net_vrf { 50 struct rtable *rth; 51 struct rt6_info *rt6; 52 u32 tb_id; 53 }; 54 55 struct pcpu_dstats { 56 u64 tx_pkts; 57 u64 tx_bytes; 58 u64 tx_drps; 59 u64 rx_pkts; 60 u64 rx_bytes; 61 struct u64_stats_sync syncp; 62 }; 63 64 static struct dst_entry *vrf_ip_check(struct dst_entry *dst, u32 cookie) 65 { 66 return dst; 67 } 68 69 static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) 70 { 71 return ip_local_out(net, sk, skb); 72 } 73 74 static unsigned int vrf_v4_mtu(const struct dst_entry *dst) 75 { 76 /* TO-DO: return max ethernet size? */ 77 return dst->dev->mtu; 78 } 79 80 static void vrf_dst_destroy(struct dst_entry *dst) 81 { 82 /* our dst lives forever - or until the device is closed */ 83 } 84 85 static unsigned int vrf_default_advmss(const struct dst_entry *dst) 86 { 87 return 65535 - 40; 88 } 89 90 static struct dst_ops vrf_dst_ops = { 91 .family = AF_INET, 92 .local_out = vrf_ip_local_out, 93 .check = vrf_ip_check, 94 .mtu = vrf_v4_mtu, 95 .destroy = vrf_dst_destroy, 96 .default_advmss = vrf_default_advmss, 97 }; 98 99 /* neighbor handling is done with actual device; do not want 100 * to flip skb->dev for those ndisc packets. This really fails 101 * for multiple next protocols (e.g., NEXTHDR_HOP). But it is 102 * a start. 103 */ 104 #if IS_ENABLED(CONFIG_IPV6) 105 static bool check_ipv6_frame(const struct sk_buff *skb) 106 { 107 const struct ipv6hdr *ipv6h; 108 struct ipv6hdr _ipv6h; 109 bool rc = true; 110 111 ipv6h = skb_header_pointer(skb, 0, sizeof(_ipv6h), &_ipv6h); 112 if (!ipv6h) 113 goto out; 114 115 if (ipv6h->nexthdr == NEXTHDR_ICMP) { 116 const struct icmp6hdr *icmph; 117 struct icmp6hdr _icmph; 118 119 icmph = skb_header_pointer(skb, sizeof(_ipv6h), 120 sizeof(_icmph), &_icmph); 121 if (!icmph) 122 goto out; 123 124 switch (icmph->icmp6_type) { 125 case NDISC_ROUTER_SOLICITATION: 126 case NDISC_ROUTER_ADVERTISEMENT: 127 case NDISC_NEIGHBOUR_SOLICITATION: 128 case NDISC_NEIGHBOUR_ADVERTISEMENT: 129 case NDISC_REDIRECT: 130 rc = false; 131 break; 132 } 133 } 134 135 out: 136 return rc; 137 } 138 #else 139 static bool check_ipv6_frame(const struct sk_buff *skb) 140 { 141 return false; 142 } 143 #endif 144 145 static bool is_ip_rx_frame(struct sk_buff *skb) 146 { 147 switch (skb->protocol) { 148 case htons(ETH_P_IP): 149 return true; 150 case htons(ETH_P_IPV6): 151 return check_ipv6_frame(skb); 152 } 153 return false; 154 } 155 156 static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) 157 { 158 vrf_dev->stats.tx_errors++; 159 kfree_skb(skb); 160 } 161 162 /* note: already called with rcu_read_lock */ 163 static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb) 164 { 165 struct sk_buff *skb = *pskb; 166 167 if (is_ip_rx_frame(skb)) { 168 struct net_device *dev = vrf_master_get_rcu(skb->dev); 169 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); 170 171 u64_stats_update_begin(&dstats->syncp); 172 dstats->rx_pkts++; 173 dstats->rx_bytes += skb->len; 174 u64_stats_update_end(&dstats->syncp); 175 176 skb->dev = dev; 177 178 return RX_HANDLER_ANOTHER; 179 } 180 return RX_HANDLER_PASS; 181 } 182 183 static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev, 184 struct rtnl_link_stats64 *stats) 185 { 186 int i; 187 188 for_each_possible_cpu(i) { 189 const struct pcpu_dstats *dstats; 190 u64 tbytes, tpkts, tdrops, rbytes, rpkts; 191 unsigned int start; 192 193 dstats = per_cpu_ptr(dev->dstats, i); 194 do { 195 start = u64_stats_fetch_begin_irq(&dstats->syncp); 196 tbytes = dstats->tx_bytes; 197 tpkts = dstats->tx_pkts; 198 tdrops = dstats->tx_drps; 199 rbytes = dstats->rx_bytes; 200 rpkts = dstats->rx_pkts; 201 } while (u64_stats_fetch_retry_irq(&dstats->syncp, start)); 202 stats->tx_bytes += tbytes; 203 stats->tx_packets += tpkts; 204 stats->tx_dropped += tdrops; 205 stats->rx_bytes += rbytes; 206 stats->rx_packets += rpkts; 207 } 208 return stats; 209 } 210 211 #if IS_ENABLED(CONFIG_IPV6) 212 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, 213 struct net_device *dev) 214 { 215 const struct ipv6hdr *iph = ipv6_hdr(skb); 216 struct net *net = dev_net(skb->dev); 217 struct flowi6 fl6 = { 218 /* needed to match OIF rule */ 219 .flowi6_oif = dev->ifindex, 220 .flowi6_iif = LOOPBACK_IFINDEX, 221 .daddr = iph->daddr, 222 .saddr = iph->saddr, 223 .flowlabel = ip6_flowinfo(iph), 224 .flowi6_mark = skb->mark, 225 .flowi6_proto = iph->nexthdr, 226 .flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF, 227 }; 228 int ret = NET_XMIT_DROP; 229 struct dst_entry *dst; 230 struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; 231 232 dst = ip6_route_output(net, NULL, &fl6); 233 if (dst == dst_null) 234 goto err; 235 236 skb_dst_drop(skb); 237 skb_dst_set(skb, dst); 238 239 ret = ip6_local_out(net, skb->sk, skb); 240 if (unlikely(net_xmit_eval(ret))) 241 dev->stats.tx_errors++; 242 else 243 ret = NET_XMIT_SUCCESS; 244 245 return ret; 246 err: 247 vrf_tx_error(dev, skb); 248 return NET_XMIT_DROP; 249 } 250 #else 251 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, 252 struct net_device *dev) 253 { 254 vrf_tx_error(dev, skb); 255 return NET_XMIT_DROP; 256 } 257 #endif 258 259 static int vrf_send_v4_prep(struct sk_buff *skb, struct flowi4 *fl4, 260 struct net_device *vrf_dev) 261 { 262 struct rtable *rt; 263 int err = 1; 264 265 rt = ip_route_output_flow(dev_net(vrf_dev), fl4, NULL); 266 if (IS_ERR(rt)) 267 goto out; 268 269 /* TO-DO: what about broadcast ? */ 270 if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { 271 ip_rt_put(rt); 272 goto out; 273 } 274 275 skb_dst_drop(skb); 276 skb_dst_set(skb, &rt->dst); 277 err = 0; 278 out: 279 return err; 280 } 281 282 static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, 283 struct net_device *vrf_dev) 284 { 285 struct iphdr *ip4h = ip_hdr(skb); 286 int ret = NET_XMIT_DROP; 287 struct flowi4 fl4 = { 288 /* needed to match OIF rule */ 289 .flowi4_oif = vrf_dev->ifindex, 290 .flowi4_iif = LOOPBACK_IFINDEX, 291 .flowi4_tos = RT_TOS(ip4h->tos), 292 .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_L3MDEV_SRC | 293 FLOWI_FLAG_SKIP_NH_OIF, 294 .daddr = ip4h->daddr, 295 }; 296 297 if (vrf_send_v4_prep(skb, &fl4, vrf_dev)) 298 goto err; 299 300 if (!ip4h->saddr) { 301 ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, 302 RT_SCOPE_LINK); 303 } 304 305 ret = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); 306 if (unlikely(net_xmit_eval(ret))) 307 vrf_dev->stats.tx_errors++; 308 else 309 ret = NET_XMIT_SUCCESS; 310 311 out: 312 return ret; 313 err: 314 vrf_tx_error(vrf_dev, skb); 315 goto out; 316 } 317 318 static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) 319 { 320 /* strip the ethernet header added for pass through VRF device */ 321 __skb_pull(skb, skb_network_offset(skb)); 322 323 switch (skb->protocol) { 324 case htons(ETH_P_IP): 325 return vrf_process_v4_outbound(skb, dev); 326 case htons(ETH_P_IPV6): 327 return vrf_process_v6_outbound(skb, dev); 328 default: 329 vrf_tx_error(dev, skb); 330 return NET_XMIT_DROP; 331 } 332 } 333 334 static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) 335 { 336 netdev_tx_t ret = is_ip_tx_frame(skb, dev); 337 338 if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { 339 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); 340 341 u64_stats_update_begin(&dstats->syncp); 342 dstats->tx_pkts++; 343 dstats->tx_bytes += skb->len; 344 u64_stats_update_end(&dstats->syncp); 345 } else { 346 this_cpu_inc(dev->dstats->tx_drps); 347 } 348 349 return ret; 350 } 351 352 #if IS_ENABLED(CONFIG_IPV6) 353 static struct dst_entry *vrf_ip6_check(struct dst_entry *dst, u32 cookie) 354 { 355 return dst; 356 } 357 358 static struct dst_ops vrf_dst_ops6 = { 359 .family = AF_INET6, 360 .local_out = ip6_local_out, 361 .check = vrf_ip6_check, 362 .mtu = vrf_v4_mtu, 363 .destroy = vrf_dst_destroy, 364 .default_advmss = vrf_default_advmss, 365 }; 366 367 static int init_dst_ops6_kmem_cachep(void) 368 { 369 vrf_dst_ops6.kmem_cachep = kmem_cache_create("vrf_ip6_dst_cache", 370 sizeof(struct rt6_info), 371 0, 372 SLAB_HWCACHE_ALIGN, 373 NULL); 374 375 if (!vrf_dst_ops6.kmem_cachep) 376 return -ENOMEM; 377 378 return 0; 379 } 380 381 static void free_dst_ops6_kmem_cachep(void) 382 { 383 kmem_cache_destroy(vrf_dst_ops6.kmem_cachep); 384 } 385 386 static int vrf_input6(struct sk_buff *skb) 387 { 388 skb->dev->stats.rx_errors++; 389 kfree_skb(skb); 390 return 0; 391 } 392 393 /* modelled after ip6_finish_output2 */ 394 static int vrf_finish_output6(struct net *net, struct sock *sk, 395 struct sk_buff *skb) 396 { 397 struct dst_entry *dst = skb_dst(skb); 398 struct net_device *dev = dst->dev; 399 struct neighbour *neigh; 400 struct in6_addr *nexthop; 401 int ret; 402 403 skb->protocol = htons(ETH_P_IPV6); 404 skb->dev = dev; 405 406 rcu_read_lock_bh(); 407 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); 408 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); 409 if (unlikely(!neigh)) 410 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); 411 if (!IS_ERR(neigh)) { 412 ret = dst_neigh_output(dst, neigh, skb); 413 rcu_read_unlock_bh(); 414 return ret; 415 } 416 rcu_read_unlock_bh(); 417 418 IP6_INC_STATS(dev_net(dst->dev), 419 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); 420 kfree_skb(skb); 421 return -EINVAL; 422 } 423 424 /* modelled after ip6_output */ 425 static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) 426 { 427 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 428 net, sk, skb, NULL, skb_dst(skb)->dev, 429 vrf_finish_output6, 430 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 431 } 432 433 static void vrf_rt6_destroy(struct net_vrf *vrf) 434 { 435 dst_destroy(&vrf->rt6->dst); 436 free_percpu(vrf->rt6->rt6i_pcpu); 437 vrf->rt6 = NULL; 438 } 439 440 static int vrf_rt6_create(struct net_device *dev) 441 { 442 struct net_vrf *vrf = netdev_priv(dev); 443 struct dst_entry *dst; 444 struct rt6_info *rt6; 445 int cpu; 446 int rc = -ENOMEM; 447 448 rt6 = dst_alloc(&vrf_dst_ops6, dev, 0, 449 DST_OBSOLETE_NONE, 450 (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); 451 if (!rt6) 452 goto out; 453 454 dst = &rt6->dst; 455 456 rt6->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL); 457 if (!rt6->rt6i_pcpu) { 458 dst_destroy(dst); 459 goto out; 460 } 461 for_each_possible_cpu(cpu) { 462 struct rt6_info **p = per_cpu_ptr(rt6->rt6i_pcpu, cpu); 463 *p = NULL; 464 } 465 466 memset(dst + 1, 0, sizeof(*rt6) - sizeof(*dst)); 467 468 INIT_LIST_HEAD(&rt6->rt6i_siblings); 469 INIT_LIST_HEAD(&rt6->rt6i_uncached); 470 471 rt6->dst.input = vrf_input6; 472 rt6->dst.output = vrf_output6; 473 474 rt6->rt6i_table = fib6_get_table(dev_net(dev), vrf->tb_id); 475 476 atomic_set(&rt6->dst.__refcnt, 2); 477 478 vrf->rt6 = rt6; 479 rc = 0; 480 out: 481 return rc; 482 } 483 #else 484 static int init_dst_ops6_kmem_cachep(void) 485 { 486 return 0; 487 } 488 489 static void free_dst_ops6_kmem_cachep(void) 490 { 491 } 492 493 static void vrf_rt6_destroy(struct net_vrf *vrf) 494 { 495 } 496 497 static int vrf_rt6_create(struct net_device *dev) 498 { 499 return 0; 500 } 501 #endif 502 503 /* modelled after ip_finish_output2 */ 504 static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 505 { 506 struct dst_entry *dst = skb_dst(skb); 507 struct rtable *rt = (struct rtable *)dst; 508 struct net_device *dev = dst->dev; 509 unsigned int hh_len = LL_RESERVED_SPACE(dev); 510 struct neighbour *neigh; 511 u32 nexthop; 512 int ret = -EINVAL; 513 514 /* Be paranoid, rather than too clever. */ 515 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { 516 struct sk_buff *skb2; 517 518 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); 519 if (!skb2) { 520 ret = -ENOMEM; 521 goto err; 522 } 523 if (skb->sk) 524 skb_set_owner_w(skb2, skb->sk); 525 526 consume_skb(skb); 527 skb = skb2; 528 } 529 530 rcu_read_lock_bh(); 531 532 nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr); 533 neigh = __ipv4_neigh_lookup_noref(dev, nexthop); 534 if (unlikely(!neigh)) 535 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); 536 if (!IS_ERR(neigh)) 537 ret = dst_neigh_output(dst, neigh, skb); 538 539 rcu_read_unlock_bh(); 540 err: 541 if (unlikely(ret < 0)) 542 vrf_tx_error(skb->dev, skb); 543 return ret; 544 } 545 546 static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) 547 { 548 struct net_device *dev = skb_dst(skb)->dev; 549 550 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); 551 552 skb->dev = dev; 553 skb->protocol = htons(ETH_P_IP); 554 555 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, 556 net, sk, skb, NULL, dev, 557 vrf_finish_output, 558 !(IPCB(skb)->flags & IPSKB_REROUTED)); 559 } 560 561 static void vrf_rtable_destroy(struct net_vrf *vrf) 562 { 563 struct dst_entry *dst = (struct dst_entry *)vrf->rth; 564 565 dst_destroy(dst); 566 vrf->rth = NULL; 567 } 568 569 static struct rtable *vrf_rtable_create(struct net_device *dev) 570 { 571 struct net_vrf *vrf = netdev_priv(dev); 572 struct rtable *rth; 573 574 rth = dst_alloc(&vrf_dst_ops, dev, 2, 575 DST_OBSOLETE_NONE, 576 (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); 577 if (rth) { 578 rth->dst.output = vrf_output; 579 rth->rt_genid = rt_genid_ipv4(dev_net(dev)); 580 rth->rt_flags = 0; 581 rth->rt_type = RTN_UNICAST; 582 rth->rt_is_input = 0; 583 rth->rt_iif = 0; 584 rth->rt_pmtu = 0; 585 rth->rt_gateway = 0; 586 rth->rt_uses_gateway = 0; 587 rth->rt_table_id = vrf->tb_id; 588 INIT_LIST_HEAD(&rth->rt_uncached); 589 rth->rt_uncached_list = NULL; 590 } 591 592 return rth; 593 } 594 595 /**************************** device handling ********************/ 596 597 /* cycle interface to flush neighbor cache and move routes across tables */ 598 static void cycle_netdev(struct net_device *dev) 599 { 600 unsigned int flags = dev->flags; 601 int ret; 602 603 if (!netif_running(dev)) 604 return; 605 606 ret = dev_change_flags(dev, flags & ~IFF_UP); 607 if (ret >= 0) 608 ret = dev_change_flags(dev, flags); 609 610 if (ret < 0) { 611 netdev_err(dev, 612 "Failed to cycle device %s; route tables might be wrong!\n", 613 dev->name); 614 } 615 } 616 617 static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) 618 { 619 int ret; 620 621 /* register the packet handler for slave ports */ 622 ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev); 623 if (ret) { 624 netdev_err(port_dev, 625 "Device %s failed to register rx_handler\n", 626 port_dev->name); 627 goto out_fail; 628 } 629 630 ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL); 631 if (ret < 0) 632 goto out_unregister; 633 634 port_dev->priv_flags |= IFF_L3MDEV_SLAVE; 635 cycle_netdev(port_dev); 636 637 return 0; 638 639 out_unregister: 640 netdev_rx_handler_unregister(port_dev); 641 out_fail: 642 return ret; 643 } 644 645 static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev) 646 { 647 if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev)) 648 return -EINVAL; 649 650 return do_vrf_add_slave(dev, port_dev); 651 } 652 653 /* inverse of do_vrf_add_slave */ 654 static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) 655 { 656 netdev_upper_dev_unlink(port_dev, dev); 657 port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; 658 659 netdev_rx_handler_unregister(port_dev); 660 661 cycle_netdev(port_dev); 662 663 return 0; 664 } 665 666 static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) 667 { 668 return do_vrf_del_slave(dev, port_dev); 669 } 670 671 static void vrf_dev_uninit(struct net_device *dev) 672 { 673 struct net_vrf *vrf = netdev_priv(dev); 674 struct net_device *port_dev; 675 struct list_head *iter; 676 677 vrf_rtable_destroy(vrf); 678 vrf_rt6_destroy(vrf); 679 680 netdev_for_each_lower_dev(dev, port_dev, iter) 681 vrf_del_slave(dev, port_dev); 682 683 free_percpu(dev->dstats); 684 dev->dstats = NULL; 685 } 686 687 static int vrf_dev_init(struct net_device *dev) 688 { 689 struct net_vrf *vrf = netdev_priv(dev); 690 691 dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); 692 if (!dev->dstats) 693 goto out_nomem; 694 695 /* create the default dst which points back to us */ 696 vrf->rth = vrf_rtable_create(dev); 697 if (!vrf->rth) 698 goto out_stats; 699 700 if (vrf_rt6_create(dev) != 0) 701 goto out_rth; 702 703 dev->flags = IFF_MASTER | IFF_NOARP; 704 705 return 0; 706 707 out_rth: 708 vrf_rtable_destroy(vrf); 709 out_stats: 710 free_percpu(dev->dstats); 711 dev->dstats = NULL; 712 out_nomem: 713 return -ENOMEM; 714 } 715 716 static const struct net_device_ops vrf_netdev_ops = { 717 .ndo_init = vrf_dev_init, 718 .ndo_uninit = vrf_dev_uninit, 719 .ndo_start_xmit = vrf_xmit, 720 .ndo_get_stats64 = vrf_get_stats64, 721 .ndo_add_slave = vrf_add_slave, 722 .ndo_del_slave = vrf_del_slave, 723 }; 724 725 static u32 vrf_fib_table(const struct net_device *dev) 726 { 727 struct net_vrf *vrf = netdev_priv(dev); 728 729 return vrf->tb_id; 730 } 731 732 static struct rtable *vrf_get_rtable(const struct net_device *dev, 733 const struct flowi4 *fl4) 734 { 735 struct rtable *rth = NULL; 736 737 if (!(fl4->flowi4_flags & FLOWI_FLAG_L3MDEV_SRC)) { 738 struct net_vrf *vrf = netdev_priv(dev); 739 740 rth = vrf->rth; 741 atomic_inc(&rth->dst.__refcnt); 742 } 743 744 return rth; 745 } 746 747 /* called under rcu_read_lock */ 748 static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4) 749 { 750 struct fib_result res = { .tclassid = 0 }; 751 struct net *net = dev_net(dev); 752 u32 orig_tos = fl4->flowi4_tos; 753 u8 flags = fl4->flowi4_flags; 754 u8 scope = fl4->flowi4_scope; 755 u8 tos = RT_FL_TOS(fl4); 756 int rc; 757 758 if (unlikely(!fl4->daddr)) 759 return 0; 760 761 fl4->flowi4_flags |= FLOWI_FLAG_SKIP_NH_OIF; 762 fl4->flowi4_iif = LOOPBACK_IFINDEX; 763 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 764 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 765 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 766 767 rc = fib_lookup(net, fl4, &res, 0); 768 if (!rc) { 769 if (res.type == RTN_LOCAL) 770 fl4->saddr = res.fi->fib_prefsrc ? : fl4->daddr; 771 else 772 fib_select_path(net, &res, fl4, -1); 773 } 774 775 fl4->flowi4_flags = flags; 776 fl4->flowi4_tos = orig_tos; 777 fl4->flowi4_scope = scope; 778 779 return rc; 780 } 781 782 #if IS_ENABLED(CONFIG_IPV6) 783 static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, 784 const struct flowi6 *fl6) 785 { 786 struct rt6_info *rt = NULL; 787 788 if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) { 789 struct net_vrf *vrf = netdev_priv(dev); 790 791 rt = vrf->rt6; 792 atomic_inc(&rt->dst.__refcnt); 793 } 794 795 return (struct dst_entry *)rt; 796 } 797 #endif 798 799 static const struct l3mdev_ops vrf_l3mdev_ops = { 800 .l3mdev_fib_table = vrf_fib_table, 801 .l3mdev_get_rtable = vrf_get_rtable, 802 .l3mdev_get_saddr = vrf_get_saddr, 803 #if IS_ENABLED(CONFIG_IPV6) 804 .l3mdev_get_rt6_dst = vrf_get_rt6_dst, 805 #endif 806 }; 807 808 static void vrf_get_drvinfo(struct net_device *dev, 809 struct ethtool_drvinfo *info) 810 { 811 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 812 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 813 } 814 815 static const struct ethtool_ops vrf_ethtool_ops = { 816 .get_drvinfo = vrf_get_drvinfo, 817 }; 818 819 static void vrf_setup(struct net_device *dev) 820 { 821 ether_setup(dev); 822 823 /* Initialize the device structure. */ 824 dev->netdev_ops = &vrf_netdev_ops; 825 dev->l3mdev_ops = &vrf_l3mdev_ops; 826 dev->ethtool_ops = &vrf_ethtool_ops; 827 dev->destructor = free_netdev; 828 829 /* Fill in device structure with ethernet-generic values. */ 830 eth_hw_addr_random(dev); 831 832 /* don't acquire vrf device's netif_tx_lock when transmitting */ 833 dev->features |= NETIF_F_LLTX; 834 835 /* don't allow vrf devices to change network namespaces. */ 836 dev->features |= NETIF_F_NETNS_LOCAL; 837 } 838 839 static int vrf_validate(struct nlattr *tb[], struct nlattr *data[]) 840 { 841 if (tb[IFLA_ADDRESS]) { 842 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 843 return -EINVAL; 844 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 845 return -EADDRNOTAVAIL; 846 } 847 return 0; 848 } 849 850 static void vrf_dellink(struct net_device *dev, struct list_head *head) 851 { 852 unregister_netdevice_queue(dev, head); 853 } 854 855 static int vrf_newlink(struct net *src_net, struct net_device *dev, 856 struct nlattr *tb[], struct nlattr *data[]) 857 { 858 struct net_vrf *vrf = netdev_priv(dev); 859 860 if (!data || !data[IFLA_VRF_TABLE]) 861 return -EINVAL; 862 863 vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]); 864 865 dev->priv_flags |= IFF_L3MDEV_MASTER; 866 867 return register_netdevice(dev); 868 } 869 870 static size_t vrf_nl_getsize(const struct net_device *dev) 871 { 872 return nla_total_size(sizeof(u32)); /* IFLA_VRF_TABLE */ 873 } 874 875 static int vrf_fillinfo(struct sk_buff *skb, 876 const struct net_device *dev) 877 { 878 struct net_vrf *vrf = netdev_priv(dev); 879 880 return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id); 881 } 882 883 static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = { 884 [IFLA_VRF_TABLE] = { .type = NLA_U32 }, 885 }; 886 887 static struct rtnl_link_ops vrf_link_ops __read_mostly = { 888 .kind = DRV_NAME, 889 .priv_size = sizeof(struct net_vrf), 890 891 .get_size = vrf_nl_getsize, 892 .policy = vrf_nl_policy, 893 .validate = vrf_validate, 894 .fill_info = vrf_fillinfo, 895 896 .newlink = vrf_newlink, 897 .dellink = vrf_dellink, 898 .setup = vrf_setup, 899 .maxtype = IFLA_VRF_MAX, 900 }; 901 902 static int vrf_device_event(struct notifier_block *unused, 903 unsigned long event, void *ptr) 904 { 905 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 906 907 /* only care about unregister events to drop slave references */ 908 if (event == NETDEV_UNREGISTER) { 909 struct net_device *vrf_dev; 910 911 if (!netif_is_l3_slave(dev)) 912 goto out; 913 914 vrf_dev = netdev_master_upper_dev_get(dev); 915 vrf_del_slave(vrf_dev, dev); 916 } 917 out: 918 return NOTIFY_DONE; 919 } 920 921 static struct notifier_block vrf_notifier_block __read_mostly = { 922 .notifier_call = vrf_device_event, 923 }; 924 925 static int __init vrf_init_module(void) 926 { 927 int rc; 928 929 vrf_dst_ops.kmem_cachep = 930 kmem_cache_create("vrf_ip_dst_cache", 931 sizeof(struct rtable), 0, 932 SLAB_HWCACHE_ALIGN, 933 NULL); 934 935 if (!vrf_dst_ops.kmem_cachep) 936 return -ENOMEM; 937 938 rc = init_dst_ops6_kmem_cachep(); 939 if (rc != 0) 940 goto error2; 941 942 register_netdevice_notifier(&vrf_notifier_block); 943 944 rc = rtnl_link_register(&vrf_link_ops); 945 if (rc < 0) 946 goto error; 947 948 return 0; 949 950 error: 951 unregister_netdevice_notifier(&vrf_notifier_block); 952 free_dst_ops6_kmem_cachep(); 953 error2: 954 kmem_cache_destroy(vrf_dst_ops.kmem_cachep); 955 return rc; 956 } 957 958 static void __exit vrf_cleanup_module(void) 959 { 960 rtnl_link_unregister(&vrf_link_ops); 961 unregister_netdevice_notifier(&vrf_notifier_block); 962 kmem_cache_destroy(vrf_dst_ops.kmem_cachep); 963 free_dst_ops6_kmem_cachep(); 964 } 965 966 module_init(vrf_init_module); 967 module_exit(vrf_cleanup_module); 968 MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern"); 969 MODULE_DESCRIPTION("Device driver to instantiate VRF domains"); 970 MODULE_LICENSE("GPL"); 971 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 972 MODULE_VERSION(DRV_VERSION); 973