1 /* 2 * Linux NET3: GRE over IP protocol decoder. 3 * 4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 9 * 2 of the License, or (at your option) any later version. 10 * 11 */ 12 13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 14 15 #include <linux/capability.h> 16 #include <linux/module.h> 17 #include <linux/types.h> 18 #include <linux/kernel.h> 19 #include <linux/slab.h> 20 #include <asm/uaccess.h> 21 #include <linux/skbuff.h> 22 #include <linux/netdevice.h> 23 #include <linux/in.h> 24 #include <linux/tcp.h> 25 #include <linux/udp.h> 26 #include <linux/if_arp.h> 27 #include <linux/if_vlan.h> 28 #include <linux/init.h> 29 #include <linux/in6.h> 30 #include <linux/inetdevice.h> 31 #include <linux/igmp.h> 32 #include <linux/netfilter_ipv4.h> 33 #include <linux/etherdevice.h> 34 #include <linux/if_ether.h> 35 36 #include <net/sock.h> 37 #include <net/ip.h> 38 #include <net/icmp.h> 39 #include <net/protocol.h> 40 #include <net/ip_tunnels.h> 41 #include <net/arp.h> 42 #include <net/checksum.h> 43 #include <net/dsfield.h> 44 #include <net/inet_ecn.h> 45 #include <net/xfrm.h> 46 #include <net/net_namespace.h> 47 #include <net/netns/generic.h> 48 #include <net/rtnetlink.h> 49 #include <net/gre.h> 50 #include <net/dst_metadata.h> 51 52 #if IS_ENABLED(CONFIG_IPV6) 53 #include <net/ipv6.h> 54 #include <net/ip6_fib.h> 55 #include <net/ip6_route.h> 56 #endif 57 58 /* 59 Problems & solutions 60 -------------------- 61 62 1. The most important issue is detecting local dead loops. 63 They would cause complete host lockup in transmit, which 64 would be "resolved" by stack overflow or, if queueing is enabled, 65 with infinite looping in net_bh. 66 67 We cannot track such dead loops during route installation, 68 it is infeasible task. 
   The most general solutions would be
   to keep skb->encapsulation counter (sort of local ttl),
   and silently drop packet when it expires. It is a good
   solution, but it supposes maintaining new variable in ALL
   skb, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT

   2. Networking dead loops would not kill routers, but would really
   kill network. IP hop limit plays role of "t->recursion" in this case,
   if we copy it from packet being encapsulated to upper header.
   It is very good solution, but it introduces two problems:

   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)), all routers (at least, in neighbourhood of mine)
     return only 8 bytes of payload. It is the end.

   Hence, if we want that OSPF worked or traceroute said something reasonable,
   we should search for another solution.

   One of them is to parse packet trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, ttl is not solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force DF flag on tunnels with preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches, that exceed pmtu are pruned) and tunnel mtu
   rapidly degrades to value <68, where looping stops.
105 Yes, it is not good if there exists a router in the loop, 106 which does not force DF, even when encapsulating packets have DF set. 107 But it is not our problem! Nobody could accuse us, we made 108 all that we could make. Even if it is your gated who injected 109 fatal route to network, even if it were you who configured 110 fatal static route: you are innocent. :-) 111 112 Alexey Kuznetsov. 113 */ 114 115 static bool log_ecn_error = true; 116 module_param(log_ecn_error, bool, 0644); 117 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); 118 119 static struct rtnl_link_ops ipgre_link_ops __read_mostly; 120 static int ipgre_tunnel_init(struct net_device *dev); 121 122 static int ipgre_net_id __read_mostly; 123 static int gre_tap_net_id __read_mostly; 124 125 static void ipgre_err(struct sk_buff *skb, u32 info, 126 const struct tnl_ptk_info *tpi) 127 { 128 129 /* All the routers (except for Linux) return only 130 8 bytes of packet payload. It means, that precise relaying of 131 ICMP in the real Internet is absolutely infeasible. 132 133 Moreover, Cisco "wise men" put GRE key to the third word 134 in GRE header. It makes impossible maintaining even soft 135 state for keyed GRE tunnels with enabled checksum. Tell 136 them "thank you". 137 138 Well, I wonder, rfc1812 was written by Cisco employee, 139 what the hell these idiots break standards established 140 by themselves??? 141 */ 142 struct net *net = dev_net(skb->dev); 143 struct ip_tunnel_net *itn; 144 const struct iphdr *iph; 145 const int type = icmp_hdr(skb)->type; 146 const int code = icmp_hdr(skb)->code; 147 struct ip_tunnel *t; 148 149 switch (type) { 150 default: 151 case ICMP_PARAMETERPROB: 152 return; 153 154 case ICMP_DEST_UNREACH: 155 switch (code) { 156 case ICMP_SR_FAILED: 157 case ICMP_PORT_UNREACH: 158 /* Impossible event. */ 159 return; 160 default: 161 /* All others are translated to HOST_UNREACH. 
162 rfc2003 contains "deep thoughts" about NET_UNREACH, 163 I believe they are just ether pollution. --ANK 164 */ 165 break; 166 } 167 break; 168 169 case ICMP_TIME_EXCEEDED: 170 if (code != ICMP_EXC_TTL) 171 return; 172 break; 173 174 case ICMP_REDIRECT: 175 break; 176 } 177 178 if (tpi->proto == htons(ETH_P_TEB)) 179 itn = net_generic(net, gre_tap_net_id); 180 else 181 itn = net_generic(net, ipgre_net_id); 182 183 iph = (const struct iphdr *)(icmp_hdr(skb) + 1); 184 t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, 185 iph->daddr, iph->saddr, tpi->key); 186 187 if (!t) 188 return; 189 190 if (t->parms.iph.daddr == 0 || 191 ipv4_is_multicast(t->parms.iph.daddr)) 192 return; 193 194 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) 195 return; 196 197 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) 198 t->err_count++; 199 else 200 t->err_count = 1; 201 t->err_time = jiffies; 202 } 203 204 static void gre_err(struct sk_buff *skb, u32 info) 205 { 206 /* All the routers (except for Linux) return only 207 * 8 bytes of packet payload. It means, that precise relaying of 208 * ICMP in the real Internet is absolutely infeasible. 209 * 210 * Moreover, Cisco "wise men" put GRE key to the third word 211 * in GRE header. It makes impossible maintaining even soft 212 * state for keyed 213 * GRE tunnels with enabled checksum. Tell them "thank you". 214 * 215 * Well, I wonder, rfc1812 was written by Cisco employee, 216 * what the hell these idiots break standards established 217 * by themselves??? 218 */ 219 220 const int type = icmp_hdr(skb)->type; 221 const int code = icmp_hdr(skb)->code; 222 struct tnl_ptk_info tpi; 223 bool csum_err = false; 224 225 if (gre_parse_header(skb, &tpi, &csum_err) < 0) { 226 if (!csum_err) /* ignore csum errors. 
*/ 227 return; 228 } 229 230 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { 231 ipv4_update_pmtu(skb, dev_net(skb->dev), info, 232 skb->dev->ifindex, 0, IPPROTO_GRE, 0); 233 return; 234 } 235 if (type == ICMP_REDIRECT) { 236 ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0, 237 IPPROTO_GRE, 0); 238 return; 239 } 240 241 ipgre_err(skb, info, &tpi); 242 } 243 244 static __be64 key_to_tunnel_id(__be32 key) 245 { 246 #ifdef __BIG_ENDIAN 247 return (__force __be64)((__force u32)key); 248 #else 249 return (__force __be64)((__force u64)key << 32); 250 #endif 251 } 252 253 /* Returns the least-significant 32 bits of a __be64. */ 254 static __be32 tunnel_id_to_key(__be64 x) 255 { 256 #ifdef __BIG_ENDIAN 257 return (__force __be32)x; 258 #else 259 return (__force __be32)((__force u64)x >> 32); 260 #endif 261 } 262 263 static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, 264 struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) 265 { 266 struct metadata_dst *tun_dst = NULL; 267 const struct iphdr *iph; 268 struct ip_tunnel *tunnel; 269 270 iph = ip_hdr(skb); 271 tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, 272 iph->saddr, iph->daddr, tpi->key); 273 274 if (tunnel) { 275 if (__iptunnel_pull_header(skb, hdr_len, tpi->proto, 276 raw_proto, false) < 0) 277 goto drop; 278 279 skb_pop_mac_header(skb); 280 if (tunnel->collect_md) { 281 __be16 flags; 282 __be64 tun_id; 283 284 flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); 285 tun_id = key_to_tunnel_id(tpi->key); 286 tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); 287 if (!tun_dst) 288 return PACKET_REJECT; 289 } 290 291 ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); 292 return PACKET_RCVD; 293 } 294 return PACKET_NEXT; 295 296 drop: 297 kfree_skb(skb); 298 return PACKET_RCVD; 299 } 300 301 static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, 302 int hdr_len) 303 { 304 struct net *net = dev_net(skb->dev); 305 struct ip_tunnel_net *itn; 
306 int res; 307 308 if (tpi->proto == htons(ETH_P_TEB)) 309 itn = net_generic(net, gre_tap_net_id); 310 else 311 itn = net_generic(net, ipgre_net_id); 312 313 res = __ipgre_rcv(skb, tpi, itn, hdr_len, false); 314 if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) { 315 /* ipgre tunnels in collect metadata mode should receive 316 * also ETH_P_TEB traffic. 317 */ 318 itn = net_generic(net, ipgre_net_id); 319 res = __ipgre_rcv(skb, tpi, itn, hdr_len, true); 320 } 321 return res; 322 } 323 324 static int gre_rcv(struct sk_buff *skb) 325 { 326 struct tnl_ptk_info tpi; 327 bool csum_err = false; 328 int hdr_len; 329 330 #ifdef CONFIG_NET_IPGRE_BROADCAST 331 if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { 332 /* Looped back packet, drop it! */ 333 if (rt_is_output_route(skb_rtable(skb))) 334 goto drop; 335 } 336 #endif 337 338 hdr_len = gre_parse_header(skb, &tpi, &csum_err); 339 if (hdr_len < 0) 340 goto drop; 341 342 if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) 343 return 0; 344 345 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 346 drop: 347 kfree_skb(skb); 348 return 0; 349 } 350 351 static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, 352 const struct iphdr *tnl_params, 353 __be16 proto) 354 { 355 struct ip_tunnel *tunnel = netdev_priv(dev); 356 357 if (tunnel->parms.o_flags & TUNNEL_SEQ) 358 tunnel->o_seqno++; 359 360 /* Push GRE header. */ 361 gre_build_header(skb, tunnel->tun_hlen, 362 tunnel->parms.o_flags, proto, tunnel->parms.o_key, 363 htonl(tunnel->o_seqno)); 364 365 skb_set_inner_protocol(skb, proto); 366 ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); 367 } 368 369 static int gre_handle_offloads(struct sk_buff *skb, bool csum) 370 { 371 return iptunnel_handle_offloads(skb, csum ? 
SKB_GSO_GRE_CSUM : SKB_GSO_GRE); 372 } 373 374 static struct rtable *gre_get_rt(struct sk_buff *skb, 375 struct net_device *dev, 376 struct flowi4 *fl, 377 const struct ip_tunnel_key *key) 378 { 379 struct net *net = dev_net(dev); 380 381 memset(fl, 0, sizeof(*fl)); 382 fl->daddr = key->u.ipv4.dst; 383 fl->saddr = key->u.ipv4.src; 384 fl->flowi4_tos = RT_TOS(key->tos); 385 fl->flowi4_mark = skb->mark; 386 fl->flowi4_proto = IPPROTO_GRE; 387 388 return ip_route_output_key(net, fl); 389 } 390 391 static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, 392 __be16 proto) 393 { 394 struct ip_tunnel_info *tun_info; 395 const struct ip_tunnel_key *key; 396 struct rtable *rt = NULL; 397 struct flowi4 fl; 398 int min_headroom; 399 int tunnel_hlen; 400 __be16 df, flags; 401 bool use_cache; 402 int err; 403 404 tun_info = skb_tunnel_info(skb); 405 if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || 406 ip_tunnel_info_af(tun_info) != AF_INET)) 407 goto err_free_skb; 408 409 key = &tun_info->key; 410 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); 411 if (use_cache) 412 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr); 413 if (!rt) { 414 rt = gre_get_rt(skb, dev, &fl, key); 415 if (IS_ERR(rt)) 416 goto err_free_skb; 417 if (use_cache) 418 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, 419 fl.saddr); 420 } 421 422 tunnel_hlen = gre_calc_hlen(key->tun_flags); 423 424 min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len 425 + tunnel_hlen + sizeof(struct iphdr); 426 if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { 427 int head_delta = SKB_DATA_ALIGN(min_headroom - 428 skb_headroom(skb) + 429 16); 430 err = pskb_expand_head(skb, max_t(int, head_delta, 0), 431 0, GFP_ATOMIC); 432 if (unlikely(err)) 433 goto err_free_rt; 434 } 435 436 /* Push Tunnel header. 
*/ 437 if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) 438 goto err_free_rt; 439 440 flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); 441 gre_build_header(skb, tunnel_hlen, flags, proto, 442 tunnel_id_to_key(tun_info->key.tun_id), 0); 443 444 df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; 445 446 iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, 447 key->tos, key->ttl, df, false); 448 return; 449 450 err_free_rt: 451 ip_rt_put(rt); 452 err_free_skb: 453 kfree_skb(skb); 454 dev->stats.tx_dropped++; 455 } 456 457 static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) 458 { 459 struct ip_tunnel_info *info = skb_tunnel_info(skb); 460 struct rtable *rt; 461 struct flowi4 fl4; 462 463 if (ip_tunnel_info_af(info) != AF_INET) 464 return -EINVAL; 465 466 rt = gre_get_rt(skb, dev, &fl4, &info->key); 467 if (IS_ERR(rt)) 468 return PTR_ERR(rt); 469 470 ip_rt_put(rt); 471 info->key.u.ipv4.src = fl4.saddr; 472 return 0; 473 } 474 475 static netdev_tx_t ipgre_xmit(struct sk_buff *skb, 476 struct net_device *dev) 477 { 478 struct ip_tunnel *tunnel = netdev_priv(dev); 479 const struct iphdr *tnl_params; 480 481 if (tunnel->collect_md) { 482 gre_fb_xmit(skb, dev, skb->protocol); 483 return NETDEV_TX_OK; 484 } 485 486 if (dev->header_ops) { 487 /* Need space for new headers */ 488 if (skb_cow_head(skb, dev->needed_headroom - 489 (tunnel->hlen + sizeof(struct iphdr)))) 490 goto free_skb; 491 492 tnl_params = (const struct iphdr *)skb->data; 493 494 /* Pull skb since ip_tunnel_xmit() needs skb->data pointing 495 * to gre header. 
496 */ 497 skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); 498 skb_reset_mac_header(skb); 499 } else { 500 if (skb_cow_head(skb, dev->needed_headroom)) 501 goto free_skb; 502 503 tnl_params = &tunnel->parms.iph; 504 } 505 506 if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM))) 507 goto free_skb; 508 509 __gre_xmit(skb, dev, tnl_params, skb->protocol); 510 return NETDEV_TX_OK; 511 512 free_skb: 513 kfree_skb(skb); 514 dev->stats.tx_dropped++; 515 return NETDEV_TX_OK; 516 } 517 518 static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, 519 struct net_device *dev) 520 { 521 struct ip_tunnel *tunnel = netdev_priv(dev); 522 523 if (tunnel->collect_md) { 524 gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); 525 return NETDEV_TX_OK; 526 } 527 528 if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM))) 529 goto free_skb; 530 531 if (skb_cow_head(skb, dev->needed_headroom)) 532 goto free_skb; 533 534 __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); 535 return NETDEV_TX_OK; 536 537 free_skb: 538 kfree_skb(skb); 539 dev->stats.tx_dropped++; 540 return NETDEV_TX_OK; 541 } 542 543 static int ipgre_tunnel_ioctl(struct net_device *dev, 544 struct ifreq *ifr, int cmd) 545 { 546 int err; 547 struct ip_tunnel_parm p; 548 549 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) 550 return -EFAULT; 551 if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { 552 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || 553 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || 554 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) 555 return -EINVAL; 556 } 557 p.i_flags = gre_flags_to_tnl_flags(p.i_flags); 558 p.o_flags = gre_flags_to_tnl_flags(p.o_flags); 559 560 err = ip_tunnel_ioctl(dev, &p, cmd); 561 if (err) 562 return err; 563 564 p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags); 565 p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags); 566 567 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) 568 return -EFAULT; 569 return 0; 570 } 
571 572 /* Nice toy. Unfortunately, useless in real life :-) 573 It allows to construct virtual multiprotocol broadcast "LAN" 574 over the Internet, provided multicast routing is tuned. 575 576 577 I have no idea was this bicycle invented before me, 578 so that I had to set ARPHRD_IPGRE to a random value. 579 I have an impression, that Cisco could make something similar, 580 but this feature is apparently missing in IOS<=11.2(8). 581 582 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks 583 with broadcast 224.66.66.66. If you have access to mbone, play with me :-) 584 585 ping -t 255 224.66.66.66 586 587 If nobody answers, mbone does not work. 588 589 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255 590 ip addr add 10.66.66.<somewhat>/24 dev Universe 591 ifconfig Universe up 592 ifconfig Universe add fe80::<Your_real_addr>/10 593 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96 594 ftp 10.66.66.66 595 ... 596 ftp fec0:6666:6666::193.233.7.65 597 ... 598 */ 599 static int ipgre_header(struct sk_buff *skb, struct net_device *dev, 600 unsigned short type, 601 const void *daddr, const void *saddr, unsigned int len) 602 { 603 struct ip_tunnel *t = netdev_priv(dev); 604 struct iphdr *iph; 605 struct gre_base_hdr *greh; 606 607 iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph)); 608 greh = (struct gre_base_hdr *)(iph+1); 609 greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags); 610 greh->protocol = htons(type); 611 612 memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); 613 614 /* Set the source hardware address. 
*/ 615 if (saddr) 616 memcpy(&iph->saddr, saddr, 4); 617 if (daddr) 618 memcpy(&iph->daddr, daddr, 4); 619 if (iph->daddr) 620 return t->hlen + sizeof(*iph); 621 622 return -(t->hlen + sizeof(*iph)); 623 } 624 625 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) 626 { 627 const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb); 628 memcpy(haddr, &iph->saddr, 4); 629 return 4; 630 } 631 632 static const struct header_ops ipgre_header_ops = { 633 .create = ipgre_header, 634 .parse = ipgre_header_parse, 635 }; 636 637 #ifdef CONFIG_NET_IPGRE_BROADCAST 638 static int ipgre_open(struct net_device *dev) 639 { 640 struct ip_tunnel *t = netdev_priv(dev); 641 642 if (ipv4_is_multicast(t->parms.iph.daddr)) { 643 struct flowi4 fl4; 644 struct rtable *rt; 645 646 rt = ip_route_output_gre(t->net, &fl4, 647 t->parms.iph.daddr, 648 t->parms.iph.saddr, 649 t->parms.o_key, 650 RT_TOS(t->parms.iph.tos), 651 t->parms.link); 652 if (IS_ERR(rt)) 653 return -EADDRNOTAVAIL; 654 dev = rt->dst.dev; 655 ip_rt_put(rt); 656 if (!__in_dev_get_rtnl(dev)) 657 return -EADDRNOTAVAIL; 658 t->mlink = dev->ifindex; 659 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr); 660 } 661 return 0; 662 } 663 664 static int ipgre_close(struct net_device *dev) 665 { 666 struct ip_tunnel *t = netdev_priv(dev); 667 668 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { 669 struct in_device *in_dev; 670 in_dev = inetdev_by_index(t->net, t->mlink); 671 if (in_dev) 672 ip_mc_dec_group(in_dev, t->parms.iph.daddr); 673 } 674 return 0; 675 } 676 #endif 677 678 static const struct net_device_ops ipgre_netdev_ops = { 679 .ndo_init = ipgre_tunnel_init, 680 .ndo_uninit = ip_tunnel_uninit, 681 #ifdef CONFIG_NET_IPGRE_BROADCAST 682 .ndo_open = ipgre_open, 683 .ndo_stop = ipgre_close, 684 #endif 685 .ndo_start_xmit = ipgre_xmit, 686 .ndo_do_ioctl = ipgre_tunnel_ioctl, 687 .ndo_change_mtu = ip_tunnel_change_mtu, 688 .ndo_get_stats64 = ip_tunnel_get_stats64, 689 
.ndo_get_iflink = ip_tunnel_get_iflink, 690 }; 691 692 #define GRE_FEATURES (NETIF_F_SG | \ 693 NETIF_F_FRAGLIST | \ 694 NETIF_F_HIGHDMA | \ 695 NETIF_F_HW_CSUM) 696 697 static void ipgre_tunnel_setup(struct net_device *dev) 698 { 699 dev->netdev_ops = &ipgre_netdev_ops; 700 dev->type = ARPHRD_IPGRE; 701 ip_tunnel_setup(dev, ipgre_net_id); 702 } 703 704 static void __gre_tunnel_init(struct net_device *dev) 705 { 706 struct ip_tunnel *tunnel; 707 int t_hlen; 708 709 tunnel = netdev_priv(dev); 710 tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); 711 tunnel->parms.iph.protocol = IPPROTO_GRE; 712 713 tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; 714 715 t_hlen = tunnel->hlen + sizeof(struct iphdr); 716 717 dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; 718 dev->mtu = ETH_DATA_LEN - t_hlen - 4; 719 720 dev->features |= GRE_FEATURES; 721 dev->hw_features |= GRE_FEATURES; 722 723 if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { 724 /* TCP offload with GRE SEQ is not supported, nor 725 * can we support 2 levels of outer headers requiring 726 * an update. 
727 */ 728 if (!(tunnel->parms.o_flags & TUNNEL_CSUM) || 729 (tunnel->encap.type == TUNNEL_ENCAP_NONE)) { 730 dev->features |= NETIF_F_GSO_SOFTWARE; 731 dev->hw_features |= NETIF_F_GSO_SOFTWARE; 732 } 733 734 /* Can use a lockless transmit, unless we generate 735 * output sequences 736 */ 737 dev->features |= NETIF_F_LLTX; 738 } 739 } 740 741 static int ipgre_tunnel_init(struct net_device *dev) 742 { 743 struct ip_tunnel *tunnel = netdev_priv(dev); 744 struct iphdr *iph = &tunnel->parms.iph; 745 746 __gre_tunnel_init(dev); 747 748 memcpy(dev->dev_addr, &iph->saddr, 4); 749 memcpy(dev->broadcast, &iph->daddr, 4); 750 751 dev->flags = IFF_NOARP; 752 netif_keep_dst(dev); 753 dev->addr_len = 4; 754 755 if (iph->daddr && !tunnel->collect_md) { 756 #ifdef CONFIG_NET_IPGRE_BROADCAST 757 if (ipv4_is_multicast(iph->daddr)) { 758 if (!iph->saddr) 759 return -EINVAL; 760 dev->flags = IFF_BROADCAST; 761 dev->header_ops = &ipgre_header_ops; 762 } 763 #endif 764 } else if (!tunnel->collect_md) { 765 dev->header_ops = &ipgre_header_ops; 766 } 767 768 return ip_tunnel_init(dev); 769 } 770 771 static const struct gre_protocol ipgre_protocol = { 772 .handler = gre_rcv, 773 .err_handler = gre_err, 774 }; 775 776 static int __net_init ipgre_init_net(struct net *net) 777 { 778 return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); 779 } 780 781 static void __net_exit ipgre_exit_net(struct net *net) 782 { 783 struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); 784 ip_tunnel_delete_net(itn, &ipgre_link_ops); 785 } 786 787 static struct pernet_operations ipgre_net_ops = { 788 .init = ipgre_init_net, 789 .exit = ipgre_exit_net, 790 .id = &ipgre_net_id, 791 .size = sizeof(struct ip_tunnel_net), 792 }; 793 794 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) 795 { 796 __be16 flags; 797 798 if (!data) 799 return 0; 800 801 flags = 0; 802 if (data[IFLA_GRE_IFLAGS]) 803 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); 804 if (data[IFLA_GRE_OFLAGS]) 
805 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); 806 if (flags & (GRE_VERSION|GRE_ROUTING)) 807 return -EINVAL; 808 809 if (data[IFLA_GRE_COLLECT_METADATA] && 810 data[IFLA_GRE_ENCAP_TYPE] && 811 nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE) 812 return -EINVAL; 813 814 return 0; 815 } 816 817 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) 818 { 819 __be32 daddr; 820 821 if (tb[IFLA_ADDRESS]) { 822 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 823 return -EINVAL; 824 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 825 return -EADDRNOTAVAIL; 826 } 827 828 if (!data) 829 goto out; 830 831 if (data[IFLA_GRE_REMOTE]) { 832 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4); 833 if (!daddr) 834 return -EINVAL; 835 } 836 837 out: 838 return ipgre_tunnel_validate(tb, data); 839 } 840 841 static void ipgre_netlink_parms(struct net_device *dev, 842 struct nlattr *data[], 843 struct nlattr *tb[], 844 struct ip_tunnel_parm *parms) 845 { 846 memset(parms, 0, sizeof(*parms)); 847 848 parms->iph.protocol = IPPROTO_GRE; 849 850 if (!data) 851 return; 852 853 if (data[IFLA_GRE_LINK]) 854 parms->link = nla_get_u32(data[IFLA_GRE_LINK]); 855 856 if (data[IFLA_GRE_IFLAGS]) 857 parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS])); 858 859 if (data[IFLA_GRE_OFLAGS]) 860 parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS])); 861 862 if (data[IFLA_GRE_IKEY]) 863 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); 864 865 if (data[IFLA_GRE_OKEY]) 866 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); 867 868 if (data[IFLA_GRE_LOCAL]) 869 parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]); 870 871 if (data[IFLA_GRE_REMOTE]) 872 parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]); 873 874 if (data[IFLA_GRE_TTL]) 875 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]); 876 877 if (data[IFLA_GRE_TOS]) 878 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); 879 880 if (!data[IFLA_GRE_PMTUDISC] || 
nla_get_u8(data[IFLA_GRE_PMTUDISC])) 881 parms->iph.frag_off = htons(IP_DF); 882 883 if (data[IFLA_GRE_COLLECT_METADATA]) { 884 struct ip_tunnel *t = netdev_priv(dev); 885 886 t->collect_md = true; 887 } 888 } 889 890 /* This function returns true when ENCAP attributes are present in the nl msg */ 891 static bool ipgre_netlink_encap_parms(struct nlattr *data[], 892 struct ip_tunnel_encap *ipencap) 893 { 894 bool ret = false; 895 896 memset(ipencap, 0, sizeof(*ipencap)); 897 898 if (!data) 899 return ret; 900 901 if (data[IFLA_GRE_ENCAP_TYPE]) { 902 ret = true; 903 ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); 904 } 905 906 if (data[IFLA_GRE_ENCAP_FLAGS]) { 907 ret = true; 908 ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); 909 } 910 911 if (data[IFLA_GRE_ENCAP_SPORT]) { 912 ret = true; 913 ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]); 914 } 915 916 if (data[IFLA_GRE_ENCAP_DPORT]) { 917 ret = true; 918 ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]); 919 } 920 921 return ret; 922 } 923 924 static int gre_tap_init(struct net_device *dev) 925 { 926 __gre_tunnel_init(dev); 927 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 928 929 return ip_tunnel_init(dev); 930 } 931 932 static const struct net_device_ops gre_tap_netdev_ops = { 933 .ndo_init = gre_tap_init, 934 .ndo_uninit = ip_tunnel_uninit, 935 .ndo_start_xmit = gre_tap_xmit, 936 .ndo_set_mac_address = eth_mac_addr, 937 .ndo_validate_addr = eth_validate_addr, 938 .ndo_change_mtu = ip_tunnel_change_mtu, 939 .ndo_get_stats64 = ip_tunnel_get_stats64, 940 .ndo_get_iflink = ip_tunnel_get_iflink, 941 .ndo_fill_metadata_dst = gre_fill_metadata_dst, 942 }; 943 944 static void ipgre_tap_setup(struct net_device *dev) 945 { 946 ether_setup(dev); 947 dev->netdev_ops = &gre_tap_netdev_ops; 948 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 949 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 950 ip_tunnel_setup(dev, gre_tap_net_id); 951 } 952 953 static int ipgre_newlink(struct net *src_net, struct 
net_device *dev, 954 struct nlattr *tb[], struct nlattr *data[]) 955 { 956 struct ip_tunnel_parm p; 957 struct ip_tunnel_encap ipencap; 958 959 if (ipgre_netlink_encap_parms(data, &ipencap)) { 960 struct ip_tunnel *t = netdev_priv(dev); 961 int err = ip_tunnel_encap_setup(t, &ipencap); 962 963 if (err < 0) 964 return err; 965 } 966 967 ipgre_netlink_parms(dev, data, tb, &p); 968 return ip_tunnel_newlink(dev, tb, &p); 969 } 970 971 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], 972 struct nlattr *data[]) 973 { 974 struct ip_tunnel_parm p; 975 struct ip_tunnel_encap ipencap; 976 977 if (ipgre_netlink_encap_parms(data, &ipencap)) { 978 struct ip_tunnel *t = netdev_priv(dev); 979 int err = ip_tunnel_encap_setup(t, &ipencap); 980 981 if (err < 0) 982 return err; 983 } 984 985 ipgre_netlink_parms(dev, data, tb, &p); 986 return ip_tunnel_changelink(dev, tb, &p); 987 } 988 989 static size_t ipgre_get_size(const struct net_device *dev) 990 { 991 return 992 /* IFLA_GRE_LINK */ 993 nla_total_size(4) + 994 /* IFLA_GRE_IFLAGS */ 995 nla_total_size(2) + 996 /* IFLA_GRE_OFLAGS */ 997 nla_total_size(2) + 998 /* IFLA_GRE_IKEY */ 999 nla_total_size(4) + 1000 /* IFLA_GRE_OKEY */ 1001 nla_total_size(4) + 1002 /* IFLA_GRE_LOCAL */ 1003 nla_total_size(4) + 1004 /* IFLA_GRE_REMOTE */ 1005 nla_total_size(4) + 1006 /* IFLA_GRE_TTL */ 1007 nla_total_size(1) + 1008 /* IFLA_GRE_TOS */ 1009 nla_total_size(1) + 1010 /* IFLA_GRE_PMTUDISC */ 1011 nla_total_size(1) + 1012 /* IFLA_GRE_ENCAP_TYPE */ 1013 nla_total_size(2) + 1014 /* IFLA_GRE_ENCAP_FLAGS */ 1015 nla_total_size(2) + 1016 /* IFLA_GRE_ENCAP_SPORT */ 1017 nla_total_size(2) + 1018 /* IFLA_GRE_ENCAP_DPORT */ 1019 nla_total_size(2) + 1020 /* IFLA_GRE_COLLECT_METADATA */ 1021 nla_total_size(0) + 1022 0; 1023 } 1024 1025 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) 1026 { 1027 struct ip_tunnel *t = netdev_priv(dev); 1028 struct ip_tunnel_parm *p = &t->parms; 1029 1030 if 
(nla_put_u32(skb, IFLA_GRE_LINK, p->link) || 1031 nla_put_be16(skb, IFLA_GRE_IFLAGS, 1032 gre_tnl_flags_to_gre_flags(p->i_flags)) || 1033 nla_put_be16(skb, IFLA_GRE_OFLAGS, 1034 gre_tnl_flags_to_gre_flags(p->o_flags)) || 1035 nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || 1036 nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || 1037 nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) || 1038 nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) || 1039 nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || 1040 nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || 1041 nla_put_u8(skb, IFLA_GRE_PMTUDISC, 1042 !!(p->iph.frag_off & htons(IP_DF)))) 1043 goto nla_put_failure; 1044 1045 if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, 1046 t->encap.type) || 1047 nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT, 1048 t->encap.sport) || 1049 nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT, 1050 t->encap.dport) || 1051 nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, 1052 t->encap.flags)) 1053 goto nla_put_failure; 1054 1055 if (t->collect_md) { 1056 if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) 1057 goto nla_put_failure; 1058 } 1059 1060 return 0; 1061 1062 nla_put_failure: 1063 return -EMSGSIZE; 1064 } 1065 1066 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { 1067 [IFLA_GRE_LINK] = { .type = NLA_U32 }, 1068 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, 1069 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, 1070 [IFLA_GRE_IKEY] = { .type = NLA_U32 }, 1071 [IFLA_GRE_OKEY] = { .type = NLA_U32 }, 1072 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, 1073 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, 1074 [IFLA_GRE_TTL] = { .type = NLA_U8 }, 1075 [IFLA_GRE_TOS] = { .type = NLA_U8 }, 1076 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, 1077 [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, 1078 [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, 1079 [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, 1080 [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, 1081 [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, 1082 }; 1083 
1084 static struct rtnl_link_ops ipgre_link_ops __read_mostly = { 1085 .kind = "gre", 1086 .maxtype = IFLA_GRE_MAX, 1087 .policy = ipgre_policy, 1088 .priv_size = sizeof(struct ip_tunnel), 1089 .setup = ipgre_tunnel_setup, 1090 .validate = ipgre_tunnel_validate, 1091 .newlink = ipgre_newlink, 1092 .changelink = ipgre_changelink, 1093 .dellink = ip_tunnel_dellink, 1094 .get_size = ipgre_get_size, 1095 .fill_info = ipgre_fill_info, 1096 .get_link_net = ip_tunnel_get_link_net, 1097 }; 1098 1099 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { 1100 .kind = "gretap", 1101 .maxtype = IFLA_GRE_MAX, 1102 .policy = ipgre_policy, 1103 .priv_size = sizeof(struct ip_tunnel), 1104 .setup = ipgre_tap_setup, 1105 .validate = ipgre_tap_validate, 1106 .newlink = ipgre_newlink, 1107 .changelink = ipgre_changelink, 1108 .dellink = ip_tunnel_dellink, 1109 .get_size = ipgre_get_size, 1110 .fill_info = ipgre_fill_info, 1111 .get_link_net = ip_tunnel_get_link_net, 1112 }; 1113 1114 struct net_device *gretap_fb_dev_create(struct net *net, const char *name, 1115 u8 name_assign_type) 1116 { 1117 struct nlattr *tb[IFLA_MAX + 1]; 1118 struct net_device *dev; 1119 struct ip_tunnel *t; 1120 int err; 1121 1122 memset(&tb, 0, sizeof(tb)); 1123 1124 dev = rtnl_create_link(net, name, name_assign_type, 1125 &ipgre_tap_ops, tb); 1126 if (IS_ERR(dev)) 1127 return dev; 1128 1129 /* Configure flow based GRE device. */ 1130 t = netdev_priv(dev); 1131 t->collect_md = true; 1132 1133 err = ipgre_newlink(net, dev, tb, NULL); 1134 if (err < 0) 1135 goto out; 1136 1137 /* openvswitch users expect packet sizes to be unrestricted, 1138 * so set the largest MTU we can. 
1139 */ 1140 err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false); 1141 if (err) 1142 goto out; 1143 1144 return dev; 1145 out: 1146 free_netdev(dev); 1147 return ERR_PTR(err); 1148 } 1149 EXPORT_SYMBOL_GPL(gretap_fb_dev_create); 1150 1151 static int __net_init ipgre_tap_init_net(struct net *net) 1152 { 1153 return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); 1154 } 1155 1156 static void __net_exit ipgre_tap_exit_net(struct net *net) 1157 { 1158 struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); 1159 ip_tunnel_delete_net(itn, &ipgre_tap_ops); 1160 } 1161 1162 static struct pernet_operations ipgre_tap_net_ops = { 1163 .init = ipgre_tap_init_net, 1164 .exit = ipgre_tap_exit_net, 1165 .id = &gre_tap_net_id, 1166 .size = sizeof(struct ip_tunnel_net), 1167 }; 1168 1169 static int __init ipgre_init(void) 1170 { 1171 int err; 1172 1173 pr_info("GRE over IPv4 tunneling driver\n"); 1174 1175 err = register_pernet_device(&ipgre_net_ops); 1176 if (err < 0) 1177 return err; 1178 1179 err = register_pernet_device(&ipgre_tap_net_ops); 1180 if (err < 0) 1181 goto pnet_tap_faied; 1182 1183 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); 1184 if (err < 0) { 1185 pr_info("%s: can't add protocol\n", __func__); 1186 goto add_proto_failed; 1187 } 1188 1189 err = rtnl_link_register(&ipgre_link_ops); 1190 if (err < 0) 1191 goto rtnl_link_failed; 1192 1193 err = rtnl_link_register(&ipgre_tap_ops); 1194 if (err < 0) 1195 goto tap_ops_failed; 1196 1197 return 0; 1198 1199 tap_ops_failed: 1200 rtnl_link_unregister(&ipgre_link_ops); 1201 rtnl_link_failed: 1202 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); 1203 add_proto_failed: 1204 unregister_pernet_device(&ipgre_tap_net_ops); 1205 pnet_tap_faied: 1206 unregister_pernet_device(&ipgre_net_ops); 1207 return err; 1208 } 1209 1210 static void __exit ipgre_fini(void) 1211 { 1212 rtnl_link_unregister(&ipgre_tap_ops); 1213 rtnl_link_unregister(&ipgre_link_ops); 1214 gre_del_protocol(&ipgre_protocol, 
GREPROTO_CISCO); 1215 unregister_pernet_device(&ipgre_tap_net_ops); 1216 unregister_pernet_device(&ipgre_net_ops); 1217 } 1218 1219 module_init(ipgre_init); 1220 module_exit(ipgre_fini); 1221 MODULE_LICENSE("GPL"); 1222 MODULE_ALIAS_RTNL_LINK("gre"); 1223 MODULE_ALIAS_RTNL_LINK("gretap"); 1224 MODULE_ALIAS_NETDEV("gre0"); 1225 MODULE_ALIAS_NETDEV("gretap0"); 1226