1 /* 2 * vrf.c: device driver to encapsulate a VRF space 3 * 4 * Copyright (c) 2015 Cumulus Networks. All rights reserved. 5 * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> 6 * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> 7 * 8 * Based on dummy, team and ipvlan drivers 9 * 10 * This program is free software; you can redistribute it and/or modify 11 * it under the terms of the GNU General Public License as published by 12 * the Free Software Foundation; either version 2 of the License, or 13 * (at your option) any later version. 14 */ 15 16 #include <linux/module.h> 17 #include <linux/kernel.h> 18 #include <linux/netdevice.h> 19 #include <linux/etherdevice.h> 20 #include <linux/ip.h> 21 #include <linux/init.h> 22 #include <linux/moduleparam.h> 23 #include <linux/netfilter.h> 24 #include <linux/rtnetlink.h> 25 #include <net/rtnetlink.h> 26 #include <linux/u64_stats_sync.h> 27 #include <linux/hashtable.h> 28 29 #include <linux/inetdevice.h> 30 #include <net/arp.h> 31 #include <net/ip.h> 32 #include <net/ip_fib.h> 33 #include <net/ip6_fib.h> 34 #include <net/ip6_route.h> 35 #include <net/route.h> 36 #include <net/addrconf.h> 37 #include <net/l3mdev.h> 38 39 #define RT_FL_TOS(oldflp4) \ 40 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) 41 42 #define DRV_NAME "vrf" 43 #define DRV_VERSION "1.0" 44 45 #define vrf_master_get_rcu(dev) \ 46 ((struct net_device *)rcu_dereference(dev->rx_handler_data)) 47 48 struct net_vrf { 49 struct rtable *rth; 50 struct rt6_info *rt6; 51 u32 tb_id; 52 }; 53 54 struct pcpu_dstats { 55 u64 tx_pkts; 56 u64 tx_bytes; 57 u64 tx_drps; 58 u64 rx_pkts; 59 u64 rx_bytes; 60 struct u64_stats_sync syncp; 61 }; 62 63 static struct dst_entry *vrf_ip_check(struct dst_entry *dst, u32 cookie) 64 { 65 return dst; 66 } 67 68 static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) 69 { 70 return ip_local_out(net, sk, skb); 71 } 72 73 static unsigned int vrf_v4_mtu(const struct dst_entry *dst) 74 { 75 /* TO-DO: return max ethernet size? */ 76 return dst->dev->mtu; 77 } 78 79 static void vrf_dst_destroy(struct dst_entry *dst) 80 { 81 /* our dst lives forever - or until the device is closed */ 82 } 83 84 static unsigned int vrf_default_advmss(const struct dst_entry *dst) 85 { 86 return 65535 - 40; 87 } 88 89 static struct dst_ops vrf_dst_ops = { 90 .family = AF_INET, 91 .local_out = vrf_ip_local_out, 92 .check = vrf_ip_check, 93 .mtu = vrf_v4_mtu, 94 .destroy = vrf_dst_destroy, 95 .default_advmss = vrf_default_advmss, 96 }; 97 98 /* neighbor handling is done with actual device; do not want 99 * to flip skb->dev for those ndisc packets. This really fails 100 * for multiple next protocols (e.g., NEXTHDR_HOP). But it is 101 * a start. 102 */ 103 #if IS_ENABLED(CONFIG_IPV6) 104 static bool check_ipv6_frame(const struct sk_buff *skb) 105 { 106 const struct ipv6hdr *ipv6h; 107 struct ipv6hdr _ipv6h; 108 bool rc = true; 109 110 ipv6h = skb_header_pointer(skb, 0, sizeof(_ipv6h), &_ipv6h); 111 if (!ipv6h) 112 goto out; 113 114 if (ipv6h->nexthdr == NEXTHDR_ICMP) { 115 const struct icmp6hdr *icmph; 116 struct icmp6hdr _icmph; 117 118 icmph = skb_header_pointer(skb, sizeof(_ipv6h), 119 sizeof(_icmph), &_icmph); 120 if (!icmph) 121 goto out; 122 123 switch (icmph->icmp6_type) { 124 case NDISC_ROUTER_SOLICITATION: 125 case NDISC_ROUTER_ADVERTISEMENT: 126 case NDISC_NEIGHBOUR_SOLICITATION: 127 case NDISC_NEIGHBOUR_ADVERTISEMENT: 128 case NDISC_REDIRECT: 129 rc = false; 130 break; 131 } 132 } 133 134 out: 135 return rc; 136 } 137 #else 138 static bool check_ipv6_frame(const struct sk_buff *skb) 139 { 140 return false; 141 } 142 #endif 143 144 static bool is_ip_rx_frame(struct sk_buff *skb) 145 { 146 switch (skb->protocol) { 147 case htons(ETH_P_IP): 148 return true; 149 case htons(ETH_P_IPV6): 150 return check_ipv6_frame(skb); 151 } 152 return false; 153 } 154 155 static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) 156 { 157 vrf_dev->stats.tx_errors++; 158 kfree_skb(skb); 159 } 160 161 /* note: already called with rcu_read_lock */ 162 static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb) 163 { 164 struct sk_buff *skb = *pskb; 165 166 if (is_ip_rx_frame(skb)) { 167 struct net_device *dev = vrf_master_get_rcu(skb->dev); 168 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); 169 170 u64_stats_update_begin(&dstats->syncp); 171 dstats->rx_pkts++; 172 dstats->rx_bytes += skb->len; 173 u64_stats_update_end(&dstats->syncp); 174 175 skb->dev = dev; 176 177 return RX_HANDLER_ANOTHER; 178 } 179 return RX_HANDLER_PASS; 180 } 181 182 static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev, 183 struct rtnl_link_stats64 *stats) 184 { 185 int i; 186 187 for_each_possible_cpu(i) { 188 const struct pcpu_dstats *dstats; 189 u64 tbytes, tpkts, tdrops, rbytes, rpkts; 190 unsigned int start; 191 192 dstats = per_cpu_ptr(dev->dstats, i); 193 do { 194 start = u64_stats_fetch_begin_irq(&dstats->syncp); 195 tbytes = dstats->tx_bytes; 196 tpkts = dstats->tx_pkts; 197 tdrops = dstats->tx_drps; 198 rbytes = dstats->rx_bytes; 199 rpkts = dstats->rx_pkts; 200 } while (u64_stats_fetch_retry_irq(&dstats->syncp, start)); 201 stats->tx_bytes += tbytes; 202 stats->tx_packets += tpkts; 203 stats->tx_dropped += tdrops; 204 stats->rx_bytes += rbytes; 205 stats->rx_packets += rpkts; 206 } 207 return stats; 208 } 209 210 #if IS_ENABLED(CONFIG_IPV6) 211 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, 212 struct net_device *dev) 213 { 214 const struct ipv6hdr *iph = ipv6_hdr(skb); 215 struct net *net = dev_net(skb->dev); 216 struct flowi6 fl6 = { 217 /* needed to match OIF rule */ 218 .flowi6_oif = dev->ifindex, 219 .flowi6_iif = LOOPBACK_IFINDEX, 220 .daddr = iph->daddr, 221 .saddr = iph->saddr, 222 .flowlabel = ip6_flowinfo(iph), 223 .flowi6_mark = skb->mark, 224 .flowi6_proto = iph->nexthdr, 225 .flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF, 226 }; 227 int ret = NET_XMIT_DROP; 228 struct dst_entry *dst; 229 struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; 230 231 dst = ip6_route_output(net, NULL, &fl6); 232 if (dst == dst_null) 233 goto err; 234 235 skb_dst_drop(skb); 236 skb_dst_set(skb, dst); 237 238 ret = ip6_local_out(net, skb->sk, skb); 239 if (unlikely(net_xmit_eval(ret))) 240 dev->stats.tx_errors++; 241 else 242 ret = NET_XMIT_SUCCESS; 243 244 return ret; 245 err: 246 vrf_tx_error(dev, skb); 247 return NET_XMIT_DROP; 248 } 249 #else 250 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, 251 struct net_device *dev) 252 { 253 vrf_tx_error(dev, skb); 254 return NET_XMIT_DROP; 255 } 256 #endif 257 258 static int vrf_send_v4_prep(struct sk_buff *skb, struct flowi4 *fl4, 259 struct net_device *vrf_dev) 260 { 261 struct rtable *rt; 262 int err = 1; 263 264 rt = ip_route_output_flow(dev_net(vrf_dev), fl4, NULL); 265 if (IS_ERR(rt)) 266 goto out; 267 268 /* TO-DO: what about broadcast ? */ 269 if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { 270 ip_rt_put(rt); 271 goto out; 272 } 273 274 skb_dst_drop(skb); 275 skb_dst_set(skb, &rt->dst); 276 err = 0; 277 out: 278 return err; 279 } 280 281 static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, 282 struct net_device *vrf_dev) 283 { 284 struct iphdr *ip4h = ip_hdr(skb); 285 int ret = NET_XMIT_DROP; 286 struct flowi4 fl4 = { 287 /* needed to match OIF rule */ 288 .flowi4_oif = vrf_dev->ifindex, 289 .flowi4_iif = LOOPBACK_IFINDEX, 290 .flowi4_tos = RT_TOS(ip4h->tos), 291 .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_L3MDEV_SRC | 292 FLOWI_FLAG_SKIP_NH_OIF, 293 .daddr = ip4h->daddr, 294 }; 295 296 if (vrf_send_v4_prep(skb, &fl4, vrf_dev)) 297 goto err; 298 299 if (!ip4h->saddr) { 300 ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, 301 RT_SCOPE_LINK); 302 } 303 304 ret = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); 305 if (unlikely(net_xmit_eval(ret))) 306 vrf_dev->stats.tx_errors++; 307 else 308 ret = NET_XMIT_SUCCESS; 309 310 out: 311 return ret; 312 err: 313 vrf_tx_error(vrf_dev, skb); 314 goto out; 315 } 316 317 static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) 318 { 319 /* strip the ethernet header added for pass through VRF device */ 320 __skb_pull(skb, skb_network_offset(skb)); 321 322 switch (skb->protocol) { 323 case htons(ETH_P_IP): 324 return vrf_process_v4_outbound(skb, dev); 325 case htons(ETH_P_IPV6): 326 return vrf_process_v6_outbound(skb, dev); 327 default: 328 vrf_tx_error(dev, skb); 329 return NET_XMIT_DROP; 330 } 331 } 332 333 static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) 334 { 335 netdev_tx_t ret = is_ip_tx_frame(skb, dev); 336 337 if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { 338 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); 339 340 u64_stats_update_begin(&dstats->syncp); 341 dstats->tx_pkts++; 342 dstats->tx_bytes += skb->len; 343 u64_stats_update_end(&dstats->syncp); 344 } else { 345 this_cpu_inc(dev->dstats->tx_drps); 346 } 347 348 return ret; 349 } 350 351 #if IS_ENABLED(CONFIG_IPV6) 352 static struct dst_entry *vrf_ip6_check(struct dst_entry *dst, u32 cookie) 353 { 354 return dst; 355 } 356 357 static struct dst_ops vrf_dst_ops6 = { 358 .family = AF_INET6, 359 .local_out = ip6_local_out, 360 .check = vrf_ip6_check, 361 .mtu = vrf_v4_mtu, 362 .destroy = vrf_dst_destroy, 363 .default_advmss = vrf_default_advmss, 364 }; 365 366 static int init_dst_ops6_kmem_cachep(void) 367 { 368 vrf_dst_ops6.kmem_cachep = kmem_cache_create("vrf_ip6_dst_cache", 369 sizeof(struct rt6_info), 370 0, 371 SLAB_HWCACHE_ALIGN, 372 NULL); 373 374 if (!vrf_dst_ops6.kmem_cachep) 375 return -ENOMEM; 376 377 return 0; 378 } 379 380 static void free_dst_ops6_kmem_cachep(void) 381 { 382 kmem_cache_destroy(vrf_dst_ops6.kmem_cachep); 383 } 384 385 static int vrf_input6(struct sk_buff *skb) 386 { 387 skb->dev->stats.rx_errors++; 388 kfree_skb(skb); 389 return 0; 390 } 391 392 /* modelled after ip6_finish_output2 */ 393 static int vrf_finish_output6(struct net *net, struct sock *sk, 394 struct sk_buff *skb) 395 { 396 struct dst_entry *dst = skb_dst(skb); 397 struct net_device *dev = dst->dev; 398 struct neighbour *neigh; 399 struct in6_addr *nexthop; 400 int ret; 401 402 skb->protocol = htons(ETH_P_IPV6); 403 skb->dev = dev; 404 405 rcu_read_lock_bh(); 406 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); 407 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); 408 if (unlikely(!neigh)) 409 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); 410 if (!IS_ERR(neigh)) { 411 ret = dst_neigh_output(dst, neigh, skb); 412 rcu_read_unlock_bh(); 413 return ret; 414 } 415 rcu_read_unlock_bh(); 416 417 IP6_INC_STATS(dev_net(dst->dev), 418 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); 419 kfree_skb(skb); 420 return -EINVAL; 421 } 422 423 /* modelled after ip6_output */ 424 static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) 425 { 426 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 427 net, sk, skb, NULL, skb_dst(skb)->dev, 428 vrf_finish_output6, 429 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 430 } 431 432 static void vrf_rt6_destroy(struct net_vrf *vrf) 433 { 434 dst_destroy(&vrf->rt6->dst); 435 free_percpu(vrf->rt6->rt6i_pcpu); 436 vrf->rt6 = NULL; 437 } 438 439 static int vrf_rt6_create(struct net_device *dev) 440 { 441 struct net_vrf *vrf = netdev_priv(dev); 442 struct dst_entry *dst; 443 struct rt6_info *rt6; 444 int cpu; 445 int rc = -ENOMEM; 446 447 rt6 = dst_alloc(&vrf_dst_ops6, dev, 0, 448 DST_OBSOLETE_NONE, 449 (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); 450 if (!rt6) 451 goto out; 452 453 dst = &rt6->dst; 454 455 rt6->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL); 456 if (!rt6->rt6i_pcpu) { 457 dst_destroy(dst); 458 goto out; 459 } 460 for_each_possible_cpu(cpu) { 461 struct rt6_info **p = per_cpu_ptr(rt6->rt6i_pcpu, cpu); 462 *p = NULL; 463 } 464 465 memset(dst + 1, 0, sizeof(*rt6) - sizeof(*dst)); 466 467 INIT_LIST_HEAD(&rt6->rt6i_siblings); 468 INIT_LIST_HEAD(&rt6->rt6i_uncached); 469 470 rt6->dst.input = vrf_input6; 471 rt6->dst.output = vrf_output6; 472 473 rt6->rt6i_table = fib6_get_table(dev_net(dev), vrf->tb_id); 474 475 atomic_set(&rt6->dst.__refcnt, 2); 476 477 vrf->rt6 = rt6; 478 rc = 0; 479 out: 480 return rc; 481 } 482 #else 483 static int init_dst_ops6_kmem_cachep(void) 484 { 485 return 0; 486 } 487 488 static void free_dst_ops6_kmem_cachep(void) 489 { 490 } 491 492 static void vrf_rt6_destroy(struct net_vrf *vrf) 493 { 494 } 495 496 static int vrf_rt6_create(struct net_device *dev) 497 { 498 return 0; 499 } 500 #endif 501 502 /* modelled after ip_finish_output2 */ 503 static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 504 { 505 struct dst_entry *dst = skb_dst(skb); 506 struct rtable *rt = (struct rtable *)dst; 507 struct net_device *dev = dst->dev; 508 unsigned int hh_len = LL_RESERVED_SPACE(dev); 509 struct neighbour *neigh; 510 u32 nexthop; 511 int ret = -EINVAL; 512 513 /* Be paranoid, rather than too clever. */ 514 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { 515 struct sk_buff *skb2; 516 517 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); 518 if (!skb2) { 519 ret = -ENOMEM; 520 goto err; 521 } 522 if (skb->sk) 523 skb_set_owner_w(skb2, skb->sk); 524 525 consume_skb(skb); 526 skb = skb2; 527 } 528 529 rcu_read_lock_bh(); 530 531 nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr); 532 neigh = __ipv4_neigh_lookup_noref(dev, nexthop); 533 if (unlikely(!neigh)) 534 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); 535 if (!IS_ERR(neigh)) 536 ret = dst_neigh_output(dst, neigh, skb); 537 538 rcu_read_unlock_bh(); 539 err: 540 if (unlikely(ret < 0)) 541 vrf_tx_error(skb->dev, skb); 542 return ret; 543 } 544 545 static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) 546 { 547 struct net_device *dev = skb_dst(skb)->dev; 548 549 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); 550 551 skb->dev = dev; 552 skb->protocol = htons(ETH_P_IP); 553 554 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, 555 net, sk, skb, NULL, dev, 556 vrf_finish_output, 557 !(IPCB(skb)->flags & IPSKB_REROUTED)); 558 } 559 560 static void vrf_rtable_destroy(struct net_vrf *vrf) 561 { 562 struct dst_entry *dst = (struct dst_entry *)vrf->rth; 563 564 dst_destroy(dst); 565 vrf->rth = NULL; 566 } 567 568 static struct rtable *vrf_rtable_create(struct net_device *dev) 569 { 570 struct net_vrf *vrf = netdev_priv(dev); 571 struct rtable *rth; 572 573 rth = dst_alloc(&vrf_dst_ops, dev, 2, 574 DST_OBSOLETE_NONE, 575 (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); 576 if (rth) { 577 rth->dst.output = vrf_output; 578 rth->rt_genid = rt_genid_ipv4(dev_net(dev)); 579 rth->rt_flags = 0; 580 rth->rt_type = RTN_UNICAST; 581 rth->rt_is_input = 0; 582 rth->rt_iif = 0; 583 rth->rt_pmtu = 0; 584 rth->rt_gateway = 0; 585 rth->rt_uses_gateway = 0; 586 rth->rt_table_id = vrf->tb_id; 587 INIT_LIST_HEAD(&rth->rt_uncached); 588 rth->rt_uncached_list = NULL; 589 } 590 591 return rth; 592 } 593 594 /**************************** device handling ********************/ 595 596 /* cycle interface to flush neighbor cache and move routes across tables */ 597 static void cycle_netdev(struct net_device *dev) 598 { 599 unsigned int flags = dev->flags; 600 int ret; 601 602 if (!netif_running(dev)) 603 return; 604 605 ret = dev_change_flags(dev, flags & ~IFF_UP); 606 if (ret >= 0) 607 ret = dev_change_flags(dev, flags); 608 609 if (ret < 0) { 610 netdev_err(dev, 611 "Failed to cycle device %s; route tables might be wrong!\n", 612 dev->name); 613 } 614 } 615 616 static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) 617 { 618 int ret; 619 620 /* register the packet handler for slave ports */ 621 ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev); 622 if (ret) { 623 netdev_err(port_dev, 624 "Device %s failed to register rx_handler\n", 625 port_dev->name); 626 goto out_fail; 627 } 628 629 ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL); 630 if (ret < 0) 631 goto out_unregister; 632 633 port_dev->priv_flags |= IFF_L3MDEV_SLAVE; 634 cycle_netdev(port_dev); 635 636 return 0; 637 638 out_unregister: 639 netdev_rx_handler_unregister(port_dev); 640 out_fail: 641 return ret; 642 } 643 644 static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev) 645 { 646 if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev)) 647 return -EINVAL; 648 649 return do_vrf_add_slave(dev, port_dev); 650 } 651 652 /* inverse of do_vrf_add_slave */ 653 static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) 654 { 655 netdev_upper_dev_unlink(port_dev, dev); 656 port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; 657 658 netdev_rx_handler_unregister(port_dev); 659 660 cycle_netdev(port_dev); 661 662 return 0; 663 } 664 665 static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) 666 { 667 return do_vrf_del_slave(dev, port_dev); 668 } 669 670 static void vrf_dev_uninit(struct net_device *dev) 671 { 672 struct net_vrf *vrf = netdev_priv(dev); 673 struct net_device *port_dev; 674 struct list_head *iter; 675 676 vrf_rtable_destroy(vrf); 677 vrf_rt6_destroy(vrf); 678 679 netdev_for_each_lower_dev(dev, port_dev, iter) 680 vrf_del_slave(dev, port_dev); 681 682 free_percpu(dev->dstats); 683 dev->dstats = NULL; 684 } 685 686 static int vrf_dev_init(struct net_device *dev) 687 { 688 struct net_vrf *vrf = netdev_priv(dev); 689 690 dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); 691 if (!dev->dstats) 692 goto out_nomem; 693 694 /* create the default dst which points back to us */ 695 vrf->rth = vrf_rtable_create(dev); 696 if (!vrf->rth) 697 goto out_stats; 698 699 if (vrf_rt6_create(dev) != 0) 700 goto out_rth; 701 702 dev->flags = IFF_MASTER | IFF_NOARP; 703 704 return 0; 705 706 out_rth: 707 vrf_rtable_destroy(vrf); 708 out_stats: 709 free_percpu(dev->dstats); 710 dev->dstats = NULL; 711 out_nomem: 712 return -ENOMEM; 713 } 714 715 static const struct net_device_ops vrf_netdev_ops = { 716 .ndo_init = vrf_dev_init, 717 .ndo_uninit = vrf_dev_uninit, 718 .ndo_start_xmit = vrf_xmit, 719 .ndo_get_stats64 = vrf_get_stats64, 720 .ndo_add_slave = vrf_add_slave, 721 .ndo_del_slave = vrf_del_slave, 722 }; 723 724 static u32 vrf_fib_table(const struct net_device *dev) 725 { 726 struct net_vrf *vrf = netdev_priv(dev); 727 728 return vrf->tb_id; 729 } 730 731 static struct rtable *vrf_get_rtable(const struct net_device *dev, 732 const struct flowi4 *fl4) 733 { 734 struct rtable *rth = NULL; 735 736 if (!(fl4->flowi4_flags & FLOWI_FLAG_L3MDEV_SRC)) { 737 struct net_vrf *vrf = netdev_priv(dev); 738 739 rth = vrf->rth; 740 atomic_inc(&rth->dst.__refcnt); 741 } 742 743 return rth; 744 } 745 746 /* called under rcu_read_lock */ 747 static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4) 748 { 749 struct fib_result res = { .tclassid = 0 }; 750 struct net *net = dev_net(dev); 751 u32 orig_tos = fl4->flowi4_tos; 752 u8 flags = fl4->flowi4_flags; 753 u8 scope = fl4->flowi4_scope; 754 u8 tos = RT_FL_TOS(fl4); 755 int rc; 756 757 if (unlikely(!fl4->daddr)) 758 return 0; 759 760 fl4->flowi4_flags |= FLOWI_FLAG_SKIP_NH_OIF; 761 fl4->flowi4_iif = LOOPBACK_IFINDEX; 762 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 763 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 764 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 765 766 rc = fib_lookup(net, fl4, &res, 0); 767 if (!rc) { 768 if (res.type == RTN_LOCAL) 769 fl4->saddr = res.fi->fib_prefsrc ? : fl4->daddr; 770 else 771 fib_select_path(net, &res, fl4, -1); 772 } 773 774 fl4->flowi4_flags = flags; 775 fl4->flowi4_tos = orig_tos; 776 fl4->flowi4_scope = scope; 777 778 return rc; 779 } 780 781 #if IS_ENABLED(CONFIG_IPV6) 782 static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, 783 const struct flowi6 *fl6) 784 { 785 struct rt6_info *rt = NULL; 786 787 if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) { 788 struct net_vrf *vrf = netdev_priv(dev); 789 790 rt = vrf->rt6; 791 atomic_inc(&rt->dst.__refcnt); 792 } 793 794 return (struct dst_entry *)rt; 795 } 796 #endif 797 798 static const struct l3mdev_ops vrf_l3mdev_ops = { 799 .l3mdev_fib_table = vrf_fib_table, 800 .l3mdev_get_rtable = vrf_get_rtable, 801 .l3mdev_get_saddr = vrf_get_saddr, 802 #if IS_ENABLED(CONFIG_IPV6) 803 .l3mdev_get_rt6_dst = vrf_get_rt6_dst, 804 #endif 805 }; 806 807 static void vrf_get_drvinfo(struct net_device *dev, 808 struct ethtool_drvinfo *info) 809 { 810 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 811 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 812 } 813 814 static const struct ethtool_ops vrf_ethtool_ops = { 815 .get_drvinfo = vrf_get_drvinfo, 816 }; 817 818 static void vrf_setup(struct net_device *dev) 819 { 820 ether_setup(dev); 821 822 /* Initialize the device structure. */ 823 dev->netdev_ops = &vrf_netdev_ops; 824 dev->l3mdev_ops = &vrf_l3mdev_ops; 825 dev->ethtool_ops = &vrf_ethtool_ops; 826 dev->destructor = free_netdev; 827 828 /* Fill in device structure with ethernet-generic values. */ 829 eth_hw_addr_random(dev); 830 831 /* don't acquire vrf device's netif_tx_lock when transmitting */ 832 dev->features |= NETIF_F_LLTX; 833 834 /* don't allow vrf devices to change network namespaces. */ 835 dev->features |= NETIF_F_NETNS_LOCAL; 836 } 837 838 static int vrf_validate(struct nlattr *tb[], struct nlattr *data[]) 839 { 840 if (tb[IFLA_ADDRESS]) { 841 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 842 return -EINVAL; 843 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 844 return -EADDRNOTAVAIL; 845 } 846 return 0; 847 } 848 849 static void vrf_dellink(struct net_device *dev, struct list_head *head) 850 { 851 unregister_netdevice_queue(dev, head); 852 } 853 854 static int vrf_newlink(struct net *src_net, struct net_device *dev, 855 struct nlattr *tb[], struct nlattr *data[]) 856 { 857 struct net_vrf *vrf = netdev_priv(dev); 858 859 if (!data || !data[IFLA_VRF_TABLE]) 860 return -EINVAL; 861 862 vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]); 863 864 dev->priv_flags |= IFF_L3MDEV_MASTER; 865 866 return register_netdevice(dev); 867 } 868 869 static size_t vrf_nl_getsize(const struct net_device *dev) 870 { 871 return nla_total_size(sizeof(u32)); /* IFLA_VRF_TABLE */ 872 } 873 874 static int vrf_fillinfo(struct sk_buff *skb, 875 const struct net_device *dev) 876 { 877 struct net_vrf *vrf = netdev_priv(dev); 878 879 return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id); 880 } 881 882 static size_t vrf_get_slave_size(const struct net_device *bond_dev, 883 const struct net_device *slave_dev) 884 { 885 return nla_total_size(sizeof(u32)); /* IFLA_VRF_PORT_TABLE */ 886 } 887 888 static int vrf_fill_slave_info(struct sk_buff *skb, 889 const struct net_device *vrf_dev, 890 const struct net_device *slave_dev) 891 { 892 struct net_vrf *vrf = netdev_priv(vrf_dev); 893 894 if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id)) 895 return -EMSGSIZE; 896 897 return 0; 898 } 899 900 static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = { 901 [IFLA_VRF_TABLE] = { .type = NLA_U32 }, 902 }; 903 904 static struct rtnl_link_ops vrf_link_ops __read_mostly = { 905 .kind = DRV_NAME, 906 .priv_size = sizeof(struct net_vrf), 907 908 .get_size = vrf_nl_getsize, 909 .policy = vrf_nl_policy, 910 .validate = vrf_validate, 911 .fill_info = vrf_fillinfo, 912 913 .get_slave_size = vrf_get_slave_size, 914 .fill_slave_info = vrf_fill_slave_info, 915 916 .newlink = vrf_newlink, 917 .dellink = vrf_dellink, 918 .setup = vrf_setup, 919 .maxtype = IFLA_VRF_MAX, 920 }; 921 922 static int vrf_device_event(struct notifier_block *unused, 923 unsigned long event, void *ptr) 924 { 925 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 926 927 /* only care about unregister events to drop slave references */ 928 if (event == NETDEV_UNREGISTER) { 929 struct net_device *vrf_dev; 930 931 if (!netif_is_l3_slave(dev)) 932 goto out; 933 934 vrf_dev = netdev_master_upper_dev_get(dev); 935 vrf_del_slave(vrf_dev, dev); 936 } 937 out: 938 return NOTIFY_DONE; 939 } 940 941 static struct notifier_block vrf_notifier_block __read_mostly = { 942 .notifier_call = vrf_device_event, 943 }; 944 945 static int __init vrf_init_module(void) 946 { 947 int rc; 948 949 vrf_dst_ops.kmem_cachep = 950 kmem_cache_create("vrf_ip_dst_cache", 951 sizeof(struct rtable), 0, 952 SLAB_HWCACHE_ALIGN, 953 NULL); 954 955 if (!vrf_dst_ops.kmem_cachep) 956 return -ENOMEM; 957 958 rc = init_dst_ops6_kmem_cachep(); 959 if (rc != 0) 960 goto error2; 961 962 register_netdevice_notifier(&vrf_notifier_block); 963 964 rc = rtnl_link_register(&vrf_link_ops); 965 if (rc < 0) 966 goto error; 967 968 return 0; 969 970 error: 971 unregister_netdevice_notifier(&vrf_notifier_block); 972 free_dst_ops6_kmem_cachep(); 973 error2: 974 kmem_cache_destroy(vrf_dst_ops.kmem_cachep); 975 return rc; 976 } 977 978 static void __exit vrf_cleanup_module(void) 979 { 980 rtnl_link_unregister(&vrf_link_ops); 981 unregister_netdevice_notifier(&vrf_notifier_block); 982 kmem_cache_destroy(vrf_dst_ops.kmem_cachep); 983 free_dst_ops6_kmem_cachep(); 984 } 985 986 module_init(vrf_init_module); 987 module_exit(vrf_cleanup_module); 988 MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern"); 989 MODULE_DESCRIPTION("Device driver to instantiate VRF domains"); 990 MODULE_LICENSE("GPL"); 991 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 992 MODULE_VERSION(DRV_VERSION); 993