/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation,
   it is infeasible task. The most general solutions would be
   to keep skb->encapsulation counter (sort of local ttl),
   and silently drop packet when it expires. It is a good
   solution, but it supposes maintaining new variable in ALL
   skb, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT

   2. Networking dead loops would not kill routers, but would really
   kill network. IP hop limit plays role of "t->recursion" in this case,
   if we copy it from packet being encapsulated to upper header.
   It is very good solution, but it introduces two problems:

   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies to rfc1812 now (yes, guys, Linux is the only
     true router now :-)), all routers (at least, in neighbourhood of mine)
     return only 8 bytes of payload. It is the end.

   Hence, if we want that OSPF worked or traceroute said something reasonable,
   we should search for another solution.

   One of them is to parse packet trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, it is not a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force DF flag on tunnels with preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches, that exceed pmtu are pruned) and tunnel mtu
   rapidly degrades to value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop,
   which does not force DF, even when encapsulating packets have DF set.
   But it is not our problem! Nobody could accuse us, we made
   all that we could make. Even if it is your gated who injected
   fatal route to network, even if it were you who configured
   fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not very evident, how to make them modular.
   sit is integral part of IPv6, ipip and gre are naturally modular.
   We could extract common parts (hash table, ioctl etc)
   to a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */

/* rtnl_link ops and netdev callbacks defined later in this file. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

/* Per-network-namespace state: the tunnel hash tables plus the
 * always-present fallback device ("gre0").
 */
static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require exact key match i.e. if a key is present in packet
   it will match only tunnel with the same key; if it is not present,
   it will match only keyless tunnel.

   All keyless packets, if not matched to configured keyless tunnels,
   will match fallback tunnel.
 */

/* Fold an IPv4 address (or key) into a 4-bit hash bucket index. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/* NOTE: relies on a variable named 't' being in scope at the call site. */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))

/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
	unsigned long	rx_packets;
	unsigned long	rx_bytes;
	unsigned long	tx_packets;
	unsigned long	tx_bytes;
} __attribute__((aligned(4*sizeof(unsigned long))));

/* Sum the per-cpu counters into dev->stats and return it.
 * Readers may race with concurrent per-cpu updates, so the totals are
 * only approximately consistent — acceptable for stats reporting.
 */
static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
{
	struct pcpu_tstats sum = { 0 };
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);

		sum.rx_packets += tstats->rx_packets;
		sum.rx_bytes   += tstats->rx_bytes;
		sum.tx_packets += tstats->tx_packets;
		sum.tx_bytes   += tstats->tx_bytes;
	}
	dev->stats.rx_packets = sum.rx_packets;
	dev->stats.rx_bytes   = sum.rx_bytes;
	dev->stats.tx_packets = sum.tx_packets;
	dev->stats.tx_bytes   = sum.tx_bytes;
	return &dev->stats;
}

/* Given src, dst and key, find appropriate for input tunnel.
 *
 * The four tables are scanned from most to least specific match
 * ((remote,local) -> (remote,*) -> (*,local) -> (*,*)).  Within a table,
 * candidates are ranked by a 2-bit score: bit 0 set if the packet's
 * input link differs from the tunnel's configured link, bit 1 set if the
 * device type (ARPHRD_ETHER for ETH_P_TEB vs ARPHRD_IPGRE) differs.
 * Score 0 is a perfect match and returns immediately; otherwise the
 * lowest-scoring candidate across all tables wins.  Falls back to the
 * namespace's fallback device if it is up.  Caller must hold RCU.
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
					      __be32 remote, __be32 local,
					      __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;

	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		/* A local-only tunnel also matches when its *destination*
		 * is multicast and equals the packet's local address
		 * (broadcast-mode GRE).
		 */
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	/* No configured tunnel matched: fall back to the catch-all device. */
	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}

/* Pick the hash chain for a tunnel's parameters: table index (prio) is
 * derived from which of local/remote are set, bucket from key (and
 * remote, when remote is a unicast address).
 */
static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned int h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}

static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

/* Insert tunnel at the head of its hash chain.  Caller holds RTNL;
 * rcu_assign_pointer publishes the fully-initialised node to readers.
 */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}

/* Remove tunnel from its hash chain.  Caller holds RTNL. */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

/* Find a tunnel exactly matching parms (addresses, key, link and device
 * type), or NULL.  Caller holds RTNL.
 */
static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t;
	struct ip_tunnel __rcu **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote ==
		               t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}

/* Find a tunnel matching parms, creating a new netdevice for it when
 * 'create' is set and no match exists.  Returns NULL on allocation or
 * registration failure.  Caller holds RTNL.
 */
static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Extra hold balanced by dev_put() in ipgre_tunnel_uninit(). */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}


/* ICMP error handler: an ICMP error arrived for an outer GRE packet we
 * sent.  Locate the originating tunnel and record the error so the next
 * transmit can report link failure.
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	/* p points at the embedded GRE header following the inner IP header. */
	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	rcu_read_lock();
	/* The key, when present, is the last 32-bit word before payload. */
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}

/* Propagate Congestion Experienced from the outer header to the inner
 * (IPv4 or IPv6) packet after decapsulation.
 */
static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));
		}
	}
}

/* Compute the outer TOS byte for encapsulation, folding the inner
 * packet's ECN bits into the configured outer TOS.
 */
static inline u8
ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}

/* Receive path: parse the GRE header, find the matching tunnel,
 * validate checksum/sequence, strip the encapsulation and re-inject the
 * inner packet.  Always consumes skb; returns 0.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;
	__be16 gre_proto;

	/* 16 = base GRE header (4) + worst-case csum/key/seq words (12). */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* 0x4 high nibble == IPv4 header => WCCPv1, no
			 * redirect header to skip.
			 */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Checksum must validate when present, and must be present
		 * when the tunnel is configured to require it.
		 */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			/* Drop out-of-order packets (signed wraparound-safe
			 * comparison).
			 */
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	/* No tunnel matched: tell the sender per RFC 1812 semantics. */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}

/* Transmit path: route to the tunnel endpoint, handle PMTU, and wrap
 * the payload in outer IP + GRE headers.  Always returns NETDEV_TX_OK
 * (the skb is consumed on every path).
 */
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr  *old_iph = ip_hdr(skb);
	const struct iphdr  *tiph;
	struct flowi4 fl4;
	u8     tos;
	__be16 df;
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	/* With header_ops active (broadcast mode), ipgre_header() already
	 * prepended the outer IP + GRE headers; skb->data points at them.
	 */
	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (const struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel: derive the endpoint from the route/neighbour
		 * of the packet being encapsulated.
		 */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt->rt_gateway;
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible IPv6 addresses carry an
			 * embedded IPv4 endpoint we can tunnel to.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;
	}

	tos = tiph->tos;
	if (tos == 1) {
		/* tos value 1 acts as an "inherit from inner packet"
		 * marker here (presumably matching ipip convention —
		 * NOTE(review): confirm against tunnel configuration docs).
		 */
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	}

	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->dst.dev;

	/* Routing back onto ourselves would be a local dead loop. */
	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	/* Replay link failures recorded by ipgre_err() within the
	 * timeout window back to the sender.
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the IPIP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	fl4.daddr;
	iph->saddr		=	fl4.saddr;

	if ((iph->ttl = tiph->ttl) == 0) {
		/* ttl 0 means "inherit": copy from the inner packet, or
		 * use the route's default hop limit for other protocols.
		 */
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	}

	/* GRE header: flags word, then protocol word. */
	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		/* Optional fields are written back-to-front from the end
		 * of the GRE header: seq (last), then key, then csum.
		 */
		__be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

/* (Re)compute the tunnel's MTU and needed headroom from the underlying
 * device the route to the endpoint resolves to.  Returns the MTU to
 * install (clamped to a minimum of 68).
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if
		   (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	/* 68 is the minimum IPv4 MTU (RFC 791). */
	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}

/* ioctl handler for SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/
 * SIOCDELTUNNEL: query, create, reconfigure or delete GRE tunnels.
 * Add/change/delete require CAP_NET_ADMIN.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Sanity-check user-supplied parameters: must describe a
		 * plain IPv4/GRE header, no GRE version/routing bits.
		 */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* Cannot change p-t-p/broadcast mode of an
				 * existing device via SIOCCHGTUNNEL.
				 */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-hash under new addresses/keys; wait for
				 * RCU readers before mutating the entry.
				 */
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			/* The fallback device itself may not be deleted. */
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

/* MTU limits: 68 is the IPv4 minimum; the upper bound leaves room for
 * the link header and GRE encapsulation within the 0xFFF8 IP length cap.
 */
static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

/* Nice toy.
   Unfortunately, useless in real life :-)
   It allows to construct virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea was this bicycle invented before me,
   so that I had to set ARPHRD_IPGRE to a random value.
   I have an impression, that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

/*
 * ipgre_header - header_ops->create for broadcast-style GRE devices.
 *
 * Pushes the tunnel's precomputed outer IP header plus the 4-byte GRE
 * base header (o_flags, protocol type) onto @skb.  @saddr/@daddr, when
 * non-NULL, are 4-byte IPv4 addresses overriding the configured endpoints.
 *
 * Returns the header length when the destination is known, or minus the
 * header length when the destination is still unset.
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	/* The GRE base header immediately follows the outer IP header. */
	__be16 *p = (__be16*)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0] = t->parms.o_flags;
	p[1] = htons(type);

	/*
	 * Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	/* Destination not yet known: signal an incomplete header. */
	return -t->hlen;
}

/*
 * ipgre_header_parse - header_ops->parse.  The "hardware address" of a
 * GRE device is the 4-byte outer IPv4 source address.
 */
static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ipgre_open - ndo_open.  For a multicast destination, resolve the route
 * to it and join the group on the device the route actually uses.
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;	/* underlying output device */
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;	/* remembered for ipgre_close() */
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

/*
 * ipgre_close - ndo_stop.  Leave the multicast group joined in
 * ipgre_open(), if any.
 */
static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}

#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init	= ipgre_tunnel_init,
	.ndo_uninit	= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open	= ipgre_open,
	.ndo_stop	= ipgre_close,
#endif
	.ndo_start_xmit	= ipgre_tunnel_xmit,
	.ndo_do_ioctl	= ipgre_tunnel_ioctl,
	.ndo_change_mtu	= ipgre_tunnel_change_mtu,
	.ndo_get_stats	= ipgre_get_stats,
};

/* Destructor: release the per-cpu stats together with the device. */
static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}

/*
 * ipgre_tunnel_setup - base netdev configuration shared by all plain
 * (non-ethernet) GRE tunnel devices.
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	/* Room for the outer IP header plus the 4-byte GRE base header. */
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* IPv4 endpoint address */
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}

/*
 * ipgre_tunnel_init - ndo_init for plain GRE devices: choose broadcast
 * vs. NBMA-style behaviour from the configured endpoints and allocate
 * per-cpu stats.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* Device "hardware" addresses mirror the tunnel endpoints. */
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* A multicast destination needs a fixed local
			 * address to join the group with. */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

/*
 * ipgre_fb_tunnel_init - initialize the per-namespace fallback "gre0"
 * device.  The extra dev_hold() keeps it pinned.
 */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	/* Outer IP header plus the 4-byte GRE base header. */
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
}


static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};

/*
 * ipgre_destroy_tunnels - queue every tunnel of a namespace for
 * unregistration: all four hash tables, every bucket.  Called under
 * RTNL; devices are collected on @head for a batched
 * unregister_netdevice_many().
 */
static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ign->tunnels[prio][h]);

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}

/*
 * ipgre_init_net - pernet init: create and register the fallback "gre0"
 * device and publish it on the wildcard hash chain.
 */
static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					  ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	/* Publish for RCU readers only after registration succeeded. */
	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}

/* ipgre_exit_net - pernet exit: batch-unregister all tunnels under RTNL. */
static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;
	LIST_HEAD(list);

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};

/*
 * ipgre_tunnel_validate - netlink validation for "gre" links.  Rejects
 * GRE flag bits (version, source routing) this driver does not support.
 */
static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |=
			nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	/* Non-zero GRE version or routing present: unsupported. */
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}

/*
 * ipgre_tap_validate - netlink validation for "gretap" links: optional
 * ethernet MAC address checks plus the common GRE checks; a zero remote
 * address is rejected when one is supplied.
 */
static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}

/*
 * ipgre_netlink_parms - translate netlink attributes into tunnel
 * parameters.  Missing attributes keep the zeroed defaults, except PMTU
 * discovery which defaults to enabled (IP_DF set).
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* PMTU discovery is on unless explicitly disabled. */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}

/* ndo_init for ethernet-over-GRE ("gretap") devices. */
static int ipgre_tap_init(struct net_device
			  *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};

/* Netdev setup for "gretap": an ethernet device carried over GRE. */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

/*
 * ipgre_newlink - rtnl_link_ops->newlink: build a tunnel from netlink
 * attributes, register the device, and insert it into the hash table.
 */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	/* A tunnel with identical parameters must not already exist. */
	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}

/*
 * ipgre_changelink - rtnl_link_ops->changelink: update an existing
 * tunnel's parameters, re-hashing it when addresses/keys change.  The
 * fallback device may not be modified.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* The requested parameters already belong to another device. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* Refuse changes that would flip the device's
			 * broadcast/point-to-point flavour. */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* Re-address/re-key under the new hash chain. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	/* These parameters never affect hashing and are always applied. */
	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}

/* Worst-case netlink attribute payload for ipgre_fill_info(). */
static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}

/*
 * ipgre_fill_info - dump the tunnel's current parameters as netlink
 * attributes; the NLA_PUT_* macros jump to nla_put_failure when the
 * message runs out of room.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* Report PMTU discovery as a boolean derived from IP_DF. */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

/*
 *	And now the modules code and kernel interface.
 */

/*
 * ipgre_init - module init.  Registration order: pernet state first,
 * then the GRE protocol handler, then both rtnl link ops; failures
 * unwind in reverse via the labels below.
 */
static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}

/* Module exit: tear down in reverse order of ipgre_init(). */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
	unregister_pernet_device(&ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");