1 /* 2 * Linux NET3: GRE over IP protocol decoder. 3 * 4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 9 * 2 of the License, or (at your option) any later version. 10 * 11 */ 12 13 #include <linux/capability.h> 14 #include <linux/module.h> 15 #include <linux/types.h> 16 #include <linux/kernel.h> 17 #include <linux/slab.h> 18 #include <asm/uaccess.h> 19 #include <linux/skbuff.h> 20 #include <linux/netdevice.h> 21 #include <linux/in.h> 22 #include <linux/tcp.h> 23 #include <linux/udp.h> 24 #include <linux/if_arp.h> 25 #include <linux/mroute.h> 26 #include <linux/init.h> 27 #include <linux/in6.h> 28 #include <linux/inetdevice.h> 29 #include <linux/igmp.h> 30 #include <linux/netfilter_ipv4.h> 31 #include <linux/etherdevice.h> 32 #include <linux/if_ether.h> 33 34 #include <net/sock.h> 35 #include <net/ip.h> 36 #include <net/icmp.h> 37 #include <net/protocol.h> 38 #include <net/ipip.h> 39 #include <net/arp.h> 40 #include <net/checksum.h> 41 #include <net/dsfield.h> 42 #include <net/inet_ecn.h> 43 #include <net/xfrm.h> 44 #include <net/net_namespace.h> 45 #include <net/netns/generic.h> 46 #include <net/rtnetlink.h> 47 48 #ifdef CONFIG_IPV6 49 #include <net/ipv6.h> 50 #include <net/ip6_fib.h> 51 #include <net/ip6_route.h> 52 #endif 53 54 /* 55 Problems & solutions 56 -------------------- 57 58 1. The most important issue is detecting local dead loops. 59 They would cause complete host lockup in transmit, which 60 would be "resolved" by stack overflow or, if queueing is enabled, 61 with infinite looping in net_bh. 62 63 We cannot track such dead loops during route installation, 64 it is infeasible task. The most general solutions would be 65 to keep skb->encapsulation counter (sort of local ttl), 66 and silently drop packet when it expires. 
   It is the best
   solution, but it supposes maintaining new variable in ALL
   skb, even if no tunneling is used.

   Current solution: HARD_TX_LOCK lock breaks dead loops.



2. Networking dead loops would not kill routers, but would really
   kill network. IP hop limit plays role of "t->recursion" in this case,
   if we copy it from packet being encapsulated to upper header.
   It is very good solution, but it introduces two problems:

   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from tunnel,
     so that this problem would be solved and traceroute output
     would even more informative. This idea appeared to be wrong:
     only Linux complies to rfc1812 now (yes, guys, Linux is the only
     true router now :-)), all routers (at least, in neighbourhood of mine)
     return only 8 bytes of payload. It is the end.

   Hence, if we want that OSPF worked or traceroute said something reasonable,
   we should search for another solution.

   One of them is to parse packet trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, it is not a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force DF flag on tunnels with preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches, that exceed pmtu are pruned) and tunnel mtu
   fastly degrades to value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop,
   which does not force DF, even when encapsulating packets have DF set.
   But it is not our problem! Nobody could accuse us, we made
   all that we could make.
Even if it is your gated who injected 105 fatal route to network, even if it were you who configured 106 fatal static route: you are innocent. :-) 107 108 109 110 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain 111 practically identical code. It would be good to glue them 112 together, but it is not very evident, how to make them modular. 113 sit is integral part of IPv6, ipip and gre are naturally modular. 114 We could extract common parts (hash table, ioctl etc) 115 to a separate module (ip_tunnel.c). 116 117 Alexey Kuznetsov. 118 */ 119 120 static struct rtnl_link_ops ipgre_link_ops __read_mostly; 121 static int ipgre_tunnel_init(struct net_device *dev); 122 static void ipgre_tunnel_setup(struct net_device *dev); 123 static int ipgre_tunnel_bind_dev(struct net_device *dev); 124 125 /* Fallback tunnel: no source, no destination, no key, no options */ 126 127 #define HASH_SIZE 16 128 129 static int ipgre_net_id __read_mostly; 130 struct ipgre_net { 131 struct ip_tunnel *tunnels[4][HASH_SIZE]; 132 133 struct net_device *fb_tunnel_dev; 134 }; 135 136 /* Tunnel hash table */ 137 138 /* 139 4 hash tables: 140 141 3: (remote,local) 142 2: (remote,*) 143 1: (*,local) 144 0: (*,*) 145 146 We require exact key match i.e. if a key is present in packet 147 it will match only tunnel with the same key; if it is not present, 148 it will match only keyless tunnel. 149 150 All keysless packets, if not matched configured keyless tunnels 151 will match fallback tunnel. 
152 */ 153 154 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) 155 156 #define tunnels_r_l tunnels[3] 157 #define tunnels_r tunnels[2] 158 #define tunnels_l tunnels[1] 159 #define tunnels_wc tunnels[0] 160 /* 161 * Locking : hash tables are protected by RCU and a spinlock 162 */ 163 static DEFINE_SPINLOCK(ipgre_lock); 164 165 #define for_each_ip_tunnel_rcu(start) \ 166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) 167 168 /* Given src, dst and key, find appropriate for input tunnel. */ 169 170 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, 171 __be32 remote, __be32 local, 172 __be32 key, __be16 gre_proto) 173 { 174 struct net *net = dev_net(dev); 175 int link = dev->ifindex; 176 unsigned h0 = HASH(remote); 177 unsigned h1 = HASH(key); 178 struct ip_tunnel *t, *cand = NULL; 179 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 180 int dev_type = (gre_proto == htons(ETH_P_TEB)) ? 181 ARPHRD_ETHER : ARPHRD_IPGRE; 182 int score, cand_score = 4; 183 184 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) { 185 if (local != t->parms.iph.saddr || 186 remote != t->parms.iph.daddr || 187 key != t->parms.i_key || 188 !(t->dev->flags & IFF_UP)) 189 continue; 190 191 if (t->dev->type != ARPHRD_IPGRE && 192 t->dev->type != dev_type) 193 continue; 194 195 score = 0; 196 if (t->parms.link != link) 197 score |= 1; 198 if (t->dev->type != dev_type) 199 score |= 2; 200 if (score == 0) 201 return t; 202 203 if (score < cand_score) { 204 cand = t; 205 cand_score = score; 206 } 207 } 208 209 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) { 210 if (remote != t->parms.iph.daddr || 211 key != t->parms.i_key || 212 !(t->dev->flags & IFF_UP)) 213 continue; 214 215 if (t->dev->type != ARPHRD_IPGRE && 216 t->dev->type != dev_type) 217 continue; 218 219 score = 0; 220 if (t->parms.link != link) 221 score |= 1; 222 if (t->dev->type != dev_type) 223 score |= 2; 224 if (score == 0) 225 return t; 226 227 if (score < cand_score) { 
228 cand = t; 229 cand_score = score; 230 } 231 } 232 233 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) { 234 if ((local != t->parms.iph.saddr && 235 (local != t->parms.iph.daddr || 236 !ipv4_is_multicast(local))) || 237 key != t->parms.i_key || 238 !(t->dev->flags & IFF_UP)) 239 continue; 240 241 if (t->dev->type != ARPHRD_IPGRE && 242 t->dev->type != dev_type) 243 continue; 244 245 score = 0; 246 if (t->parms.link != link) 247 score |= 1; 248 if (t->dev->type != dev_type) 249 score |= 2; 250 if (score == 0) 251 return t; 252 253 if (score < cand_score) { 254 cand = t; 255 cand_score = score; 256 } 257 } 258 259 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) { 260 if (t->parms.i_key != key || 261 !(t->dev->flags & IFF_UP)) 262 continue; 263 264 if (t->dev->type != ARPHRD_IPGRE && 265 t->dev->type != dev_type) 266 continue; 267 268 score = 0; 269 if (t->parms.link != link) 270 score |= 1; 271 if (t->dev->type != dev_type) 272 score |= 2; 273 if (score == 0) 274 return t; 275 276 if (score < cand_score) { 277 cand = t; 278 cand_score = score; 279 } 280 } 281 282 if (cand != NULL) 283 return cand; 284 285 dev = ign->fb_tunnel_dev; 286 if (dev->flags & IFF_UP) 287 return netdev_priv(dev); 288 289 return NULL; 290 } 291 292 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, 293 struct ip_tunnel_parm *parms) 294 { 295 __be32 remote = parms->iph.daddr; 296 __be32 local = parms->iph.saddr; 297 __be32 key = parms->i_key; 298 unsigned h = HASH(key); 299 int prio = 0; 300 301 if (local) 302 prio |= 1; 303 if (remote && !ipv4_is_multicast(remote)) { 304 prio |= 2; 305 h ^= HASH(remote); 306 } 307 308 return &ign->tunnels[prio][h]; 309 } 310 311 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, 312 struct ip_tunnel *t) 313 { 314 return __ipgre_bucket(ign, &t->parms); 315 } 316 317 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) 318 { 319 struct ip_tunnel **tp = ipgre_bucket(ign, t); 320 321 spin_lock_bh(&ipgre_lock); 322 
t->next = *tp; 323 rcu_assign_pointer(*tp, t); 324 spin_unlock_bh(&ipgre_lock); 325 } 326 327 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) 328 { 329 struct ip_tunnel **tp; 330 331 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { 332 if (t == *tp) { 333 spin_lock_bh(&ipgre_lock); 334 *tp = t->next; 335 spin_unlock_bh(&ipgre_lock); 336 break; 337 } 338 } 339 } 340 341 static struct ip_tunnel *ipgre_tunnel_find(struct net *net, 342 struct ip_tunnel_parm *parms, 343 int type) 344 { 345 __be32 remote = parms->iph.daddr; 346 __be32 local = parms->iph.saddr; 347 __be32 key = parms->i_key; 348 int link = parms->link; 349 struct ip_tunnel *t, **tp; 350 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 351 352 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) 353 if (local == t->parms.iph.saddr && 354 remote == t->parms.iph.daddr && 355 key == t->parms.i_key && 356 link == t->parms.link && 357 type == t->dev->type) 358 break; 359 360 return t; 361 } 362 363 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, 364 struct ip_tunnel_parm *parms, int create) 365 { 366 struct ip_tunnel *t, *nt; 367 struct net_device *dev; 368 char name[IFNAMSIZ]; 369 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 370 371 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE); 372 if (t || !create) 373 return t; 374 375 if (parms->name[0]) 376 strlcpy(name, parms->name, IFNAMSIZ); 377 else 378 sprintf(name, "gre%%d"); 379 380 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); 381 if (!dev) 382 return NULL; 383 384 dev_net_set(dev, net); 385 386 if (strchr(name, '%')) { 387 if (dev_alloc_name(dev, name) < 0) 388 goto failed_free; 389 } 390 391 nt = netdev_priv(dev); 392 nt->parms = *parms; 393 dev->rtnl_link_ops = &ipgre_link_ops; 394 395 dev->mtu = ipgre_tunnel_bind_dev(dev); 396 397 if (register_netdevice(dev) < 0) 398 goto failed_free; 399 400 dev_hold(dev); 401 ipgre_tunnel_link(ign, nt); 402 return nt; 403 
404 failed_free: 405 free_netdev(dev); 406 return NULL; 407 } 408 409 static void ipgre_tunnel_uninit(struct net_device *dev) 410 { 411 struct net *net = dev_net(dev); 412 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 413 414 ipgre_tunnel_unlink(ign, netdev_priv(dev)); 415 dev_put(dev); 416 } 417 418 419 static void ipgre_err(struct sk_buff *skb, u32 info) 420 { 421 422 /* All the routers (except for Linux) return only 423 8 bytes of packet payload. It means, that precise relaying of 424 ICMP in the real Internet is absolutely infeasible. 425 426 Moreover, Cisco "wise men" put GRE key to the third word 427 in GRE header. It makes impossible maintaining even soft state for keyed 428 GRE tunnels with enabled checksum. Tell them "thank you". 429 430 Well, I wonder, rfc1812 was written by Cisco employee, 431 what the hell these idiots break standrads established 432 by themself??? 433 */ 434 435 struct iphdr *iph = (struct iphdr *)skb->data; 436 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); 437 int grehlen = (iph->ihl<<2) + 4; 438 const int type = icmp_hdr(skb)->type; 439 const int code = icmp_hdr(skb)->code; 440 struct ip_tunnel *t; 441 __be16 flags; 442 443 flags = p[0]; 444 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { 445 if (flags&(GRE_VERSION|GRE_ROUTING)) 446 return; 447 if (flags&GRE_KEY) { 448 grehlen += 4; 449 if (flags&GRE_CSUM) 450 grehlen += 4; 451 } 452 } 453 454 /* If only 8 bytes returned, keyed message will be dropped here */ 455 if (skb_headlen(skb) < grehlen) 456 return; 457 458 switch (type) { 459 default: 460 case ICMP_PARAMETERPROB: 461 return; 462 463 case ICMP_DEST_UNREACH: 464 switch (code) { 465 case ICMP_SR_FAILED: 466 case ICMP_PORT_UNREACH: 467 /* Impossible event. */ 468 return; 469 case ICMP_FRAG_NEEDED: 470 /* Soft state for pmtu is maintained by IP core. */ 471 return; 472 default: 473 /* All others are translated to HOST_UNREACH. 
474 rfc2003 contains "deep thoughts" about NET_UNREACH, 475 I believe they are just ether pollution. --ANK 476 */ 477 break; 478 } 479 break; 480 case ICMP_TIME_EXCEEDED: 481 if (code != ICMP_EXC_TTL) 482 return; 483 break; 484 } 485 486 rcu_read_lock(); 487 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, 488 flags & GRE_KEY ? 489 *(((__be32 *)p) + (grehlen / 4) - 1) : 0, 490 p[1]); 491 if (t == NULL || t->parms.iph.daddr == 0 || 492 ipv4_is_multicast(t->parms.iph.daddr)) 493 goto out; 494 495 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) 496 goto out; 497 498 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) 499 t->err_count++; 500 else 501 t->err_count = 1; 502 t->err_time = jiffies; 503 out: 504 rcu_read_unlock(); 505 } 506 507 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) 508 { 509 if (INET_ECN_is_ce(iph->tos)) { 510 if (skb->protocol == htons(ETH_P_IP)) { 511 IP_ECN_set_ce(ip_hdr(skb)); 512 } else if (skb->protocol == htons(ETH_P_IPV6)) { 513 IP6_ECN_set_ce(ipv6_hdr(skb)); 514 } 515 } 516 } 517 518 static inline u8 519 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb) 520 { 521 u8 inner = 0; 522 if (skb->protocol == htons(ETH_P_IP)) 523 inner = old_iph->tos; 524 else if (skb->protocol == htons(ETH_P_IPV6)) 525 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph); 526 return INET_ECN_encapsulate(tos, inner); 527 } 528 529 static int ipgre_rcv(struct sk_buff *skb) 530 { 531 struct iphdr *iph; 532 u8 *h; 533 __be16 flags; 534 __sum16 csum = 0; 535 __be32 key = 0; 536 u32 seqno = 0; 537 struct ip_tunnel *tunnel; 538 int offset = 4; 539 __be16 gre_proto; 540 541 if (!pskb_may_pull(skb, 16)) 542 goto drop_nolock; 543 544 iph = ip_hdr(skb); 545 h = skb->data; 546 flags = *(__be16*)h; 547 548 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { 549 /* - Version must be 0. 550 - We do not support routing headers. 
551 */ 552 if (flags&(GRE_VERSION|GRE_ROUTING)) 553 goto drop_nolock; 554 555 if (flags&GRE_CSUM) { 556 switch (skb->ip_summed) { 557 case CHECKSUM_COMPLETE: 558 csum = csum_fold(skb->csum); 559 if (!csum) 560 break; 561 /* fall through */ 562 case CHECKSUM_NONE: 563 skb->csum = 0; 564 csum = __skb_checksum_complete(skb); 565 skb->ip_summed = CHECKSUM_COMPLETE; 566 } 567 offset += 4; 568 } 569 if (flags&GRE_KEY) { 570 key = *(__be32*)(h + offset); 571 offset += 4; 572 } 573 if (flags&GRE_SEQ) { 574 seqno = ntohl(*(__be32*)(h + offset)); 575 offset += 4; 576 } 577 } 578 579 gre_proto = *(__be16 *)(h + 2); 580 581 rcu_read_lock(); 582 if ((tunnel = ipgre_tunnel_lookup(skb->dev, 583 iph->saddr, iph->daddr, key, 584 gre_proto))) { 585 struct net_device_stats *stats = &tunnel->dev->stats; 586 587 secpath_reset(skb); 588 589 skb->protocol = gre_proto; 590 /* WCCP version 1 and 2 protocol decoding. 591 * - Change protocol to IP 592 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header 593 */ 594 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) { 595 skb->protocol = htons(ETH_P_IP); 596 if ((*(h + offset) & 0xF0) != 0x40) 597 offset += 4; 598 } 599 600 skb->mac_header = skb->network_header; 601 __pskb_pull(skb, offset); 602 skb_postpull_rcsum(skb, skb_transport_header(skb), offset); 603 skb->pkt_type = PACKET_HOST; 604 #ifdef CONFIG_NET_IPGRE_BROADCAST 605 if (ipv4_is_multicast(iph->daddr)) { 606 /* Looped back packet, drop it! 
*/ 607 if (skb_rtable(skb)->fl.iif == 0) 608 goto drop; 609 stats->multicast++; 610 skb->pkt_type = PACKET_BROADCAST; 611 } 612 #endif 613 614 if (((flags&GRE_CSUM) && csum) || 615 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { 616 stats->rx_crc_errors++; 617 stats->rx_errors++; 618 goto drop; 619 } 620 if (tunnel->parms.i_flags&GRE_SEQ) { 621 if (!(flags&GRE_SEQ) || 622 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { 623 stats->rx_fifo_errors++; 624 stats->rx_errors++; 625 goto drop; 626 } 627 tunnel->i_seqno = seqno + 1; 628 } 629 630 /* Warning: All skb pointers will be invalidated! */ 631 if (tunnel->dev->type == ARPHRD_ETHER) { 632 if (!pskb_may_pull(skb, ETH_HLEN)) { 633 stats->rx_length_errors++; 634 stats->rx_errors++; 635 goto drop; 636 } 637 638 iph = ip_hdr(skb); 639 skb->protocol = eth_type_trans(skb, tunnel->dev); 640 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 641 } 642 643 skb_tunnel_rx(skb, tunnel->dev); 644 645 skb_reset_network_header(skb); 646 ipgre_ecn_decapsulate(iph, skb); 647 648 netif_rx(skb); 649 rcu_read_unlock(); 650 return(0); 651 } 652 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 653 654 drop: 655 rcu_read_unlock(); 656 drop_nolock: 657 kfree_skb(skb); 658 return(0); 659 } 660 661 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 662 { 663 struct ip_tunnel *tunnel = netdev_priv(dev); 664 struct net_device_stats *stats = &dev->stats; 665 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); 666 struct iphdr *old_iph = ip_hdr(skb); 667 struct iphdr *tiph; 668 u8 tos; 669 __be16 df; 670 struct rtable *rt; /* Route to the other host */ 671 struct net_device *tdev; /* Device to other host */ 672 struct iphdr *iph; /* Our new IP header */ 673 unsigned int max_headroom; /* The extra header space needed */ 674 int gre_hlen; 675 __be32 dst; 676 int mtu; 677 678 if (dev->type == ARPHRD_ETHER) 679 IPCB(skb)->flags = 0; 680 681 if (dev->header_ops && dev->type == ARPHRD_IPGRE) 
{ 682 gre_hlen = 0; 683 tiph = (struct iphdr *)skb->data; 684 } else { 685 gre_hlen = tunnel->hlen; 686 tiph = &tunnel->parms.iph; 687 } 688 689 if ((dst = tiph->daddr) == 0) { 690 /* NBMA tunnel */ 691 692 if (skb_dst(skb) == NULL) { 693 stats->tx_fifo_errors++; 694 goto tx_error; 695 } 696 697 if (skb->protocol == htons(ETH_P_IP)) { 698 rt = skb_rtable(skb); 699 if ((dst = rt->rt_gateway) == 0) 700 goto tx_error_icmp; 701 } 702 #ifdef CONFIG_IPV6 703 else if (skb->protocol == htons(ETH_P_IPV6)) { 704 struct in6_addr *addr6; 705 int addr_type; 706 struct neighbour *neigh = skb_dst(skb)->neighbour; 707 708 if (neigh == NULL) 709 goto tx_error; 710 711 addr6 = (struct in6_addr *)&neigh->primary_key; 712 addr_type = ipv6_addr_type(addr6); 713 714 if (addr_type == IPV6_ADDR_ANY) { 715 addr6 = &ipv6_hdr(skb)->daddr; 716 addr_type = ipv6_addr_type(addr6); 717 } 718 719 if ((addr_type & IPV6_ADDR_COMPATv4) == 0) 720 goto tx_error_icmp; 721 722 dst = addr6->s6_addr32[3]; 723 } 724 #endif 725 else 726 goto tx_error; 727 } 728 729 tos = tiph->tos; 730 if (tos == 1) { 731 tos = 0; 732 if (skb->protocol == htons(ETH_P_IP)) 733 tos = old_iph->tos; 734 else if (skb->protocol == htons(ETH_P_IPV6)) 735 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph); 736 } 737 738 { 739 struct flowi fl = { .oif = tunnel->parms.link, 740 .nl_u = { .ip4_u = 741 { .daddr = dst, 742 .saddr = tiph->saddr, 743 .tos = RT_TOS(tos) } }, 744 .proto = IPPROTO_GRE }; 745 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 746 stats->tx_carrier_errors++; 747 goto tx_error; 748 } 749 } 750 tdev = rt->dst.dev; 751 752 if (tdev == dev) { 753 ip_rt_put(rt); 754 stats->collisions++; 755 goto tx_error; 756 } 757 758 df = tiph->frag_off; 759 if (df) 760 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen; 761 else 762 mtu = skb_dst(skb) ? 
dst_mtu(skb_dst(skb)) : dev->mtu; 763 764 if (skb_dst(skb)) 765 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); 766 767 if (skb->protocol == htons(ETH_P_IP)) { 768 df |= (old_iph->frag_off&htons(IP_DF)); 769 770 if ((old_iph->frag_off&htons(IP_DF)) && 771 mtu < ntohs(old_iph->tot_len)) { 772 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 773 ip_rt_put(rt); 774 goto tx_error; 775 } 776 } 777 #ifdef CONFIG_IPV6 778 else if (skb->protocol == htons(ETH_P_IPV6)) { 779 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); 780 781 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { 782 if ((tunnel->parms.iph.daddr && 783 !ipv4_is_multicast(tunnel->parms.iph.daddr)) || 784 rt6->rt6i_dst.plen == 128) { 785 rt6->rt6i_flags |= RTF_MODIFIED; 786 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu; 787 } 788 } 789 790 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { 791 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 792 ip_rt_put(rt); 793 goto tx_error; 794 } 795 } 796 #endif 797 798 if (tunnel->err_count > 0) { 799 if (time_before(jiffies, 800 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { 801 tunnel->err_count--; 802 803 dst_link_failure(skb); 804 } else 805 tunnel->err_count = 0; 806 } 807 808 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len; 809 810 if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| 811 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { 812 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 813 if (max_headroom > dev->needed_headroom) 814 dev->needed_headroom = max_headroom; 815 if (!new_skb) { 816 ip_rt_put(rt); 817 txq->tx_dropped++; 818 dev_kfree_skb(skb); 819 return NETDEV_TX_OK; 820 } 821 if (skb->sk) 822 skb_set_owner_w(new_skb, skb->sk); 823 dev_kfree_skb(skb); 824 skb = new_skb; 825 old_iph = ip_hdr(skb); 826 } 827 828 skb_reset_transport_header(skb); 829 skb_push(skb, gre_hlen); 830 skb_reset_network_header(skb); 831 memset(&(IPCB(skb)->opt), 0, 
sizeof(IPCB(skb)->opt)); 832 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | 833 IPSKB_REROUTED); 834 skb_dst_drop(skb); 835 skb_dst_set(skb, &rt->dst); 836 837 /* 838 * Push down and install the IPIP header. 839 */ 840 841 iph = ip_hdr(skb); 842 iph->version = 4; 843 iph->ihl = sizeof(struct iphdr) >> 2; 844 iph->frag_off = df; 845 iph->protocol = IPPROTO_GRE; 846 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); 847 iph->daddr = rt->rt_dst; 848 iph->saddr = rt->rt_src; 849 850 if ((iph->ttl = tiph->ttl) == 0) { 851 if (skb->protocol == htons(ETH_P_IP)) 852 iph->ttl = old_iph->ttl; 853 #ifdef CONFIG_IPV6 854 else if (skb->protocol == htons(ETH_P_IPV6)) 855 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; 856 #endif 857 else 858 iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT); 859 } 860 861 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; 862 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ? 863 htons(ETH_P_TEB) : skb->protocol; 864 865 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { 866 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4); 867 868 if (tunnel->parms.o_flags&GRE_SEQ) { 869 ++tunnel->o_seqno; 870 *ptr = htonl(tunnel->o_seqno); 871 ptr--; 872 } 873 if (tunnel->parms.o_flags&GRE_KEY) { 874 *ptr = tunnel->parms.o_key; 875 ptr--; 876 } 877 if (tunnel->parms.o_flags&GRE_CSUM) { 878 *ptr = 0; 879 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr)); 880 } 881 } 882 883 nf_reset(skb); 884 885 IPTUNNEL_XMIT(); 886 return NETDEV_TX_OK; 887 888 tx_error_icmp: 889 dst_link_failure(skb); 890 891 tx_error: 892 stats->tx_errors++; 893 dev_kfree_skb(skb); 894 return NETDEV_TX_OK; 895 } 896 897 static int ipgre_tunnel_bind_dev(struct net_device *dev) 898 { 899 struct net_device *tdev = NULL; 900 struct ip_tunnel *tunnel; 901 struct iphdr *iph; 902 int hlen = LL_MAX_HEADER; 903 int mtu = ETH_DATA_LEN; 904 int addend = sizeof(struct iphdr) + 4; 905 906 tunnel = netdev_priv(dev); 907 iph = 
&tunnel->parms.iph; 908 909 /* Guess output device to choose reasonable mtu and needed_headroom */ 910 911 if (iph->daddr) { 912 struct flowi fl = { .oif = tunnel->parms.link, 913 .nl_u = { .ip4_u = 914 { .daddr = iph->daddr, 915 .saddr = iph->saddr, 916 .tos = RT_TOS(iph->tos) } }, 917 .proto = IPPROTO_GRE }; 918 struct rtable *rt; 919 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 920 tdev = rt->dst.dev; 921 ip_rt_put(rt); 922 } 923 924 if (dev->type != ARPHRD_ETHER) 925 dev->flags |= IFF_POINTOPOINT; 926 } 927 928 if (!tdev && tunnel->parms.link) 929 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); 930 931 if (tdev) { 932 hlen = tdev->hard_header_len + tdev->needed_headroom; 933 mtu = tdev->mtu; 934 } 935 dev->iflink = tunnel->parms.link; 936 937 /* Precalculate GRE options length */ 938 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { 939 if (tunnel->parms.o_flags&GRE_CSUM) 940 addend += 4; 941 if (tunnel->parms.o_flags&GRE_KEY) 942 addend += 4; 943 if (tunnel->parms.o_flags&GRE_SEQ) 944 addend += 4; 945 } 946 dev->needed_headroom = addend + hlen; 947 mtu -= dev->hard_header_len + addend; 948 949 if (mtu < 68) 950 mtu = 68; 951 952 tunnel->hlen = addend; 953 954 return mtu; 955 } 956 957 static int 958 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) 959 { 960 int err = 0; 961 struct ip_tunnel_parm p; 962 struct ip_tunnel *t; 963 struct net *net = dev_net(dev); 964 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 965 966 switch (cmd) { 967 case SIOCGETTUNNEL: 968 t = NULL; 969 if (dev == ign->fb_tunnel_dev) { 970 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { 971 err = -EFAULT; 972 break; 973 } 974 t = ipgre_tunnel_locate(net, &p, 0); 975 } 976 if (t == NULL) 977 t = netdev_priv(dev); 978 memcpy(&p, &t->parms, sizeof(p)); 979 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) 980 err = -EFAULT; 981 break; 982 983 case SIOCADDTUNNEL: 984 case SIOCCHGTUNNEL: 985 err = -EPERM; 986 if 
(!capable(CAP_NET_ADMIN)) 987 goto done; 988 989 err = -EFAULT; 990 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) 991 goto done; 992 993 err = -EINVAL; 994 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || 995 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || 996 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) 997 goto done; 998 if (p.iph.ttl) 999 p.iph.frag_off |= htons(IP_DF); 1000 1001 if (!(p.i_flags&GRE_KEY)) 1002 p.i_key = 0; 1003 if (!(p.o_flags&GRE_KEY)) 1004 p.o_key = 0; 1005 1006 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); 1007 1008 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { 1009 if (t != NULL) { 1010 if (t->dev != dev) { 1011 err = -EEXIST; 1012 break; 1013 } 1014 } else { 1015 unsigned nflags = 0; 1016 1017 t = netdev_priv(dev); 1018 1019 if (ipv4_is_multicast(p.iph.daddr)) 1020 nflags = IFF_BROADCAST; 1021 else if (p.iph.daddr) 1022 nflags = IFF_POINTOPOINT; 1023 1024 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { 1025 err = -EINVAL; 1026 break; 1027 } 1028 ipgre_tunnel_unlink(ign, t); 1029 t->parms.iph.saddr = p.iph.saddr; 1030 t->parms.iph.daddr = p.iph.daddr; 1031 t->parms.i_key = p.i_key; 1032 t->parms.o_key = p.o_key; 1033 memcpy(dev->dev_addr, &p.iph.saddr, 4); 1034 memcpy(dev->broadcast, &p.iph.daddr, 4); 1035 ipgre_tunnel_link(ign, t); 1036 netdev_state_change(dev); 1037 } 1038 } 1039 1040 if (t) { 1041 err = 0; 1042 if (cmd == SIOCCHGTUNNEL) { 1043 t->parms.iph.ttl = p.iph.ttl; 1044 t->parms.iph.tos = p.iph.tos; 1045 t->parms.iph.frag_off = p.iph.frag_off; 1046 if (t->parms.link != p.link) { 1047 t->parms.link = p.link; 1048 dev->mtu = ipgre_tunnel_bind_dev(dev); 1049 netdev_state_change(dev); 1050 } 1051 } 1052 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) 1053 err = -EFAULT; 1054 } else 1055 err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); 1056 break; 1057 1058 case SIOCDELTUNNEL: 1059 err = -EPERM; 1060 if (!capable(CAP_NET_ADMIN)) 1061 goto done; 1062 1063 if (dev == ign->fb_tunnel_dev) { 1064 err = -EFAULT; 1065 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) 1066 goto done; 1067 err = -ENOENT; 1068 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL) 1069 goto done; 1070 err = -EPERM; 1071 if (t == netdev_priv(ign->fb_tunnel_dev)) 1072 goto done; 1073 dev = t->dev; 1074 } 1075 unregister_netdevice(dev); 1076 err = 0; 1077 break; 1078 1079 default: 1080 err = -EINVAL; 1081 } 1082 1083 done: 1084 return err; 1085 } 1086 1087 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) 1088 { 1089 struct ip_tunnel *tunnel = netdev_priv(dev); 1090 if (new_mtu < 68 || 1091 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen) 1092 return -EINVAL; 1093 dev->mtu = new_mtu; 1094 return 0; 1095 } 1096 1097 /* Nice toy. Unfortunately, useless in real life :-) 1098 It allows to construct virtual multiprotocol broadcast "LAN" 1099 over the Internet, provided multicast routing is tuned. 1100 1101 1102 I have no idea was this bicycle invented before me, 1103 so that I had to set ARPHRD_IPGRE to a random value. 1104 I have an impression, that Cisco could make something similar, 1105 but this feature is apparently missing in IOS<=11.2(8). 1106 1107 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks 1108 with broadcast 224.66.66.66. If you have access to mbone, play with me :-) 1109 1110 ping -t 255 224.66.66.66 1111 1112 If nobody answers, mbone does not work. 1113 1114 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255 1115 ip addr add 10.66.66.<somewhat>/24 dev Universe 1116 ifconfig Universe up 1117 ifconfig Universe add fe80::<Your_real_addr>/10 1118 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96 1119 ftp 10.66.66.66 1120 ... 1121 ftp fec0:6666:6666::193.233.7.65 1122 ... 
1123 1124 */ 1125 1126 static int ipgre_header(struct sk_buff *skb, struct net_device *dev, 1127 unsigned short type, 1128 const void *daddr, const void *saddr, unsigned len) 1129 { 1130 struct ip_tunnel *t = netdev_priv(dev); 1131 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); 1132 __be16 *p = (__be16*)(iph+1); 1133 1134 memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); 1135 p[0] = t->parms.o_flags; 1136 p[1] = htons(type); 1137 1138 /* 1139 * Set the source hardware address. 1140 */ 1141 1142 if (saddr) 1143 memcpy(&iph->saddr, saddr, 4); 1144 if (daddr) 1145 memcpy(&iph->daddr, daddr, 4); 1146 if (iph->daddr) 1147 return t->hlen; 1148 1149 return -t->hlen; 1150 } 1151 1152 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) 1153 { 1154 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb); 1155 memcpy(haddr, &iph->saddr, 4); 1156 return 4; 1157 } 1158 1159 static const struct header_ops ipgre_header_ops = { 1160 .create = ipgre_header, 1161 .parse = ipgre_header_parse, 1162 }; 1163 1164 #ifdef CONFIG_NET_IPGRE_BROADCAST 1165 static int ipgre_open(struct net_device *dev) 1166 { 1167 struct ip_tunnel *t = netdev_priv(dev); 1168 1169 if (ipv4_is_multicast(t->parms.iph.daddr)) { 1170 struct flowi fl = { .oif = t->parms.link, 1171 .nl_u = { .ip4_u = 1172 { .daddr = t->parms.iph.daddr, 1173 .saddr = t->parms.iph.saddr, 1174 .tos = RT_TOS(t->parms.iph.tos) } }, 1175 .proto = IPPROTO_GRE }; 1176 struct rtable *rt; 1177 if (ip_route_output_key(dev_net(dev), &rt, &fl)) 1178 return -EADDRNOTAVAIL; 1179 dev = rt->dst.dev; 1180 ip_rt_put(rt); 1181 if (__in_dev_get_rtnl(dev) == NULL) 1182 return -EADDRNOTAVAIL; 1183 t->mlink = dev->ifindex; 1184 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr); 1185 } 1186 return 0; 1187 } 1188 1189 static int ipgre_close(struct net_device *dev) 1190 { 1191 struct ip_tunnel *t = netdev_priv(dev); 1192 1193 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { 1194 struct in_device 
*in_dev; 1195 in_dev = inetdev_by_index(dev_net(dev), t->mlink); 1196 if (in_dev) { 1197 ip_mc_dec_group(in_dev, t->parms.iph.daddr); 1198 in_dev_put(in_dev); 1199 } 1200 } 1201 return 0; 1202 } 1203 1204 #endif 1205 1206 static const struct net_device_ops ipgre_netdev_ops = { 1207 .ndo_init = ipgre_tunnel_init, 1208 .ndo_uninit = ipgre_tunnel_uninit, 1209 #ifdef CONFIG_NET_IPGRE_BROADCAST 1210 .ndo_open = ipgre_open, 1211 .ndo_stop = ipgre_close, 1212 #endif 1213 .ndo_start_xmit = ipgre_tunnel_xmit, 1214 .ndo_do_ioctl = ipgre_tunnel_ioctl, 1215 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1216 }; 1217 1218 static void ipgre_tunnel_setup(struct net_device *dev) 1219 { 1220 dev->netdev_ops = &ipgre_netdev_ops; 1221 dev->destructor = free_netdev; 1222 1223 dev->type = ARPHRD_IPGRE; 1224 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; 1225 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; 1226 dev->flags = IFF_NOARP; 1227 dev->iflink = 0; 1228 dev->addr_len = 4; 1229 dev->features |= NETIF_F_NETNS_LOCAL; 1230 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 1231 } 1232 1233 static int ipgre_tunnel_init(struct net_device *dev) 1234 { 1235 struct ip_tunnel *tunnel; 1236 struct iphdr *iph; 1237 1238 tunnel = netdev_priv(dev); 1239 iph = &tunnel->parms.iph; 1240 1241 tunnel->dev = dev; 1242 strcpy(tunnel->parms.name, dev->name); 1243 1244 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); 1245 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); 1246 1247 if (iph->daddr) { 1248 #ifdef CONFIG_NET_IPGRE_BROADCAST 1249 if (ipv4_is_multicast(iph->daddr)) { 1250 if (!iph->saddr) 1251 return -EINVAL; 1252 dev->flags = IFF_BROADCAST; 1253 dev->header_ops = &ipgre_header_ops; 1254 } 1255 #endif 1256 } else 1257 dev->header_ops = &ipgre_header_ops; 1258 1259 return 0; 1260 } 1261 1262 static void ipgre_fb_tunnel_init(struct net_device *dev) 1263 { 1264 struct ip_tunnel *tunnel = netdev_priv(dev); 1265 struct iphdr *iph = &tunnel->parms.iph; 1266 struct ipgre_net *ign = 
net_generic(dev_net(dev), ipgre_net_id); 1267 1268 tunnel->dev = dev; 1269 strcpy(tunnel->parms.name, dev->name); 1270 1271 iph->version = 4; 1272 iph->protocol = IPPROTO_GRE; 1273 iph->ihl = 5; 1274 tunnel->hlen = sizeof(struct iphdr) + 4; 1275 1276 dev_hold(dev); 1277 ign->tunnels_wc[0] = tunnel; 1278 } 1279 1280 1281 static const struct net_protocol ipgre_protocol = { 1282 .handler = ipgre_rcv, 1283 .err_handler = ipgre_err, 1284 .netns_ok = 1, 1285 }; 1286 1287 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) 1288 { 1289 int prio; 1290 1291 for (prio = 0; prio < 4; prio++) { 1292 int h; 1293 for (h = 0; h < HASH_SIZE; h++) { 1294 struct ip_tunnel *t = ign->tunnels[prio][h]; 1295 1296 while (t != NULL) { 1297 unregister_netdevice_queue(t->dev, head); 1298 t = t->next; 1299 } 1300 } 1301 } 1302 } 1303 1304 static int __net_init ipgre_init_net(struct net *net) 1305 { 1306 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 1307 int err; 1308 1309 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", 1310 ipgre_tunnel_setup); 1311 if (!ign->fb_tunnel_dev) { 1312 err = -ENOMEM; 1313 goto err_alloc_dev; 1314 } 1315 dev_net_set(ign->fb_tunnel_dev, net); 1316 1317 ipgre_fb_tunnel_init(ign->fb_tunnel_dev); 1318 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops; 1319 1320 if ((err = register_netdev(ign->fb_tunnel_dev))) 1321 goto err_reg_dev; 1322 1323 return 0; 1324 1325 err_reg_dev: 1326 free_netdev(ign->fb_tunnel_dev); 1327 err_alloc_dev: 1328 return err; 1329 } 1330 1331 static void __net_exit ipgre_exit_net(struct net *net) 1332 { 1333 struct ipgre_net *ign; 1334 LIST_HEAD(list); 1335 1336 ign = net_generic(net, ipgre_net_id); 1337 rtnl_lock(); 1338 ipgre_destroy_tunnels(ign, &list); 1339 unregister_netdevice_many(&list); 1340 rtnl_unlock(); 1341 } 1342 1343 static struct pernet_operations ipgre_net_ops = { 1344 .init = ipgre_init_net, 1345 .exit = ipgre_exit_net, 1346 .id = &ipgre_net_id, 1347 .size = 
sizeof(struct ipgre_net), 1348 }; 1349 1350 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) 1351 { 1352 __be16 flags; 1353 1354 if (!data) 1355 return 0; 1356 1357 flags = 0; 1358 if (data[IFLA_GRE_IFLAGS]) 1359 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); 1360 if (data[IFLA_GRE_OFLAGS]) 1361 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); 1362 if (flags & (GRE_VERSION|GRE_ROUTING)) 1363 return -EINVAL; 1364 1365 return 0; 1366 } 1367 1368 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) 1369 { 1370 __be32 daddr; 1371 1372 if (tb[IFLA_ADDRESS]) { 1373 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1374 return -EINVAL; 1375 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1376 return -EADDRNOTAVAIL; 1377 } 1378 1379 if (!data) 1380 goto out; 1381 1382 if (data[IFLA_GRE_REMOTE]) { 1383 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4); 1384 if (!daddr) 1385 return -EINVAL; 1386 } 1387 1388 out: 1389 return ipgre_tunnel_validate(tb, data); 1390 } 1391 1392 static void ipgre_netlink_parms(struct nlattr *data[], 1393 struct ip_tunnel_parm *parms) 1394 { 1395 memset(parms, 0, sizeof(*parms)); 1396 1397 parms->iph.protocol = IPPROTO_GRE; 1398 1399 if (!data) 1400 return; 1401 1402 if (data[IFLA_GRE_LINK]) 1403 parms->link = nla_get_u32(data[IFLA_GRE_LINK]); 1404 1405 if (data[IFLA_GRE_IFLAGS]) 1406 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); 1407 1408 if (data[IFLA_GRE_OFLAGS]) 1409 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); 1410 1411 if (data[IFLA_GRE_IKEY]) 1412 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); 1413 1414 if (data[IFLA_GRE_OKEY]) 1415 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); 1416 1417 if (data[IFLA_GRE_LOCAL]) 1418 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]); 1419 1420 if (data[IFLA_GRE_REMOTE]) 1421 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]); 1422 1423 if (data[IFLA_GRE_TTL]) 1424 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]); 1425 1426 if 
(data[IFLA_GRE_TOS]) 1427 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); 1428 1429 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) 1430 parms->iph.frag_off = htons(IP_DF); 1431 } 1432 1433 static int ipgre_tap_init(struct net_device *dev) 1434 { 1435 struct ip_tunnel *tunnel; 1436 1437 tunnel = netdev_priv(dev); 1438 1439 tunnel->dev = dev; 1440 strcpy(tunnel->parms.name, dev->name); 1441 1442 ipgre_tunnel_bind_dev(dev); 1443 1444 return 0; 1445 } 1446 1447 static const struct net_device_ops ipgre_tap_netdev_ops = { 1448 .ndo_init = ipgre_tap_init, 1449 .ndo_uninit = ipgre_tunnel_uninit, 1450 .ndo_start_xmit = ipgre_tunnel_xmit, 1451 .ndo_set_mac_address = eth_mac_addr, 1452 .ndo_validate_addr = eth_validate_addr, 1453 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1454 }; 1455 1456 static void ipgre_tap_setup(struct net_device *dev) 1457 { 1458 1459 ether_setup(dev); 1460 1461 dev->netdev_ops = &ipgre_tap_netdev_ops; 1462 dev->destructor = free_netdev; 1463 1464 dev->iflink = 0; 1465 dev->features |= NETIF_F_NETNS_LOCAL; 1466 } 1467 1468 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], 1469 struct nlattr *data[]) 1470 { 1471 struct ip_tunnel *nt; 1472 struct net *net = dev_net(dev); 1473 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 1474 int mtu; 1475 int err; 1476 1477 nt = netdev_priv(dev); 1478 ipgre_netlink_parms(data, &nt->parms); 1479 1480 if (ipgre_tunnel_find(net, &nt->parms, dev->type)) 1481 return -EEXIST; 1482 1483 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) 1484 random_ether_addr(dev->dev_addr); 1485 1486 mtu = ipgre_tunnel_bind_dev(dev); 1487 if (!tb[IFLA_MTU]) 1488 dev->mtu = mtu; 1489 1490 err = register_netdevice(dev); 1491 if (err) 1492 goto out; 1493 1494 dev_hold(dev); 1495 ipgre_tunnel_link(ign, nt); 1496 1497 out: 1498 return err; 1499 } 1500 1501 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], 1502 struct nlattr *data[]) 1503 { 1504 struct 
ip_tunnel *t, *nt; 1505 struct net *net = dev_net(dev); 1506 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 1507 struct ip_tunnel_parm p; 1508 int mtu; 1509 1510 if (dev == ign->fb_tunnel_dev) 1511 return -EINVAL; 1512 1513 nt = netdev_priv(dev); 1514 ipgre_netlink_parms(data, &p); 1515 1516 t = ipgre_tunnel_locate(net, &p, 0); 1517 1518 if (t) { 1519 if (t->dev != dev) 1520 return -EEXIST; 1521 } else { 1522 t = nt; 1523 1524 if (dev->type != ARPHRD_ETHER) { 1525 unsigned nflags = 0; 1526 1527 if (ipv4_is_multicast(p.iph.daddr)) 1528 nflags = IFF_BROADCAST; 1529 else if (p.iph.daddr) 1530 nflags = IFF_POINTOPOINT; 1531 1532 if ((dev->flags ^ nflags) & 1533 (IFF_POINTOPOINT | IFF_BROADCAST)) 1534 return -EINVAL; 1535 } 1536 1537 ipgre_tunnel_unlink(ign, t); 1538 t->parms.iph.saddr = p.iph.saddr; 1539 t->parms.iph.daddr = p.iph.daddr; 1540 t->parms.i_key = p.i_key; 1541 if (dev->type != ARPHRD_ETHER) { 1542 memcpy(dev->dev_addr, &p.iph.saddr, 4); 1543 memcpy(dev->broadcast, &p.iph.daddr, 4); 1544 } 1545 ipgre_tunnel_link(ign, t); 1546 netdev_state_change(dev); 1547 } 1548 1549 t->parms.o_key = p.o_key; 1550 t->parms.iph.ttl = p.iph.ttl; 1551 t->parms.iph.tos = p.iph.tos; 1552 t->parms.iph.frag_off = p.iph.frag_off; 1553 1554 if (t->parms.link != p.link) { 1555 t->parms.link = p.link; 1556 mtu = ipgre_tunnel_bind_dev(dev); 1557 if (!tb[IFLA_MTU]) 1558 dev->mtu = mtu; 1559 netdev_state_change(dev); 1560 } 1561 1562 return 0; 1563 } 1564 1565 static size_t ipgre_get_size(const struct net_device *dev) 1566 { 1567 return 1568 /* IFLA_GRE_LINK */ 1569 nla_total_size(4) + 1570 /* IFLA_GRE_IFLAGS */ 1571 nla_total_size(2) + 1572 /* IFLA_GRE_OFLAGS */ 1573 nla_total_size(2) + 1574 /* IFLA_GRE_IKEY */ 1575 nla_total_size(4) + 1576 /* IFLA_GRE_OKEY */ 1577 nla_total_size(4) + 1578 /* IFLA_GRE_LOCAL */ 1579 nla_total_size(4) + 1580 /* IFLA_GRE_REMOTE */ 1581 nla_total_size(4) + 1582 /* IFLA_GRE_TTL */ 1583 nla_total_size(1) + 1584 /* IFLA_GRE_TOS */ 1585 
nla_total_size(1) + 1586 /* IFLA_GRE_PMTUDISC */ 1587 nla_total_size(1) + 1588 0; 1589 } 1590 1591 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) 1592 { 1593 struct ip_tunnel *t = netdev_priv(dev); 1594 struct ip_tunnel_parm *p = &t->parms; 1595 1596 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link); 1597 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags); 1598 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags); 1599 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key); 1600 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key); 1601 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr); 1602 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr); 1603 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl); 1604 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos); 1605 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF))); 1606 1607 return 0; 1608 1609 nla_put_failure: 1610 return -EMSGSIZE; 1611 } 1612 1613 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { 1614 [IFLA_GRE_LINK] = { .type = NLA_U32 }, 1615 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, 1616 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, 1617 [IFLA_GRE_IKEY] = { .type = NLA_U32 }, 1618 [IFLA_GRE_OKEY] = { .type = NLA_U32 }, 1619 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, 1620 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, 1621 [IFLA_GRE_TTL] = { .type = NLA_U8 }, 1622 [IFLA_GRE_TOS] = { .type = NLA_U8 }, 1623 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, 1624 }; 1625 1626 static struct rtnl_link_ops ipgre_link_ops __read_mostly = { 1627 .kind = "gre", 1628 .maxtype = IFLA_GRE_MAX, 1629 .policy = ipgre_policy, 1630 .priv_size = sizeof(struct ip_tunnel), 1631 .setup = ipgre_tunnel_setup, 1632 .validate = ipgre_tunnel_validate, 1633 .newlink = ipgre_newlink, 1634 .changelink = ipgre_changelink, 1635 .get_size = ipgre_get_size, 1636 .fill_info = ipgre_fill_info, 1637 }; 1638 1639 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { 1640 .kind = "gretap", 1641 .maxtype = 
IFLA_GRE_MAX, 1642 .policy = ipgre_policy, 1643 .priv_size = sizeof(struct ip_tunnel), 1644 .setup = ipgre_tap_setup, 1645 .validate = ipgre_tap_validate, 1646 .newlink = ipgre_newlink, 1647 .changelink = ipgre_changelink, 1648 .get_size = ipgre_get_size, 1649 .fill_info = ipgre_fill_info, 1650 }; 1651 1652 /* 1653 * And now the modules code and kernel interface. 1654 */ 1655 1656 static int __init ipgre_init(void) 1657 { 1658 int err; 1659 1660 printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); 1661 1662 err = register_pernet_device(&ipgre_net_ops); 1663 if (err < 0) 1664 return err; 1665 1666 err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); 1667 if (err < 0) { 1668 printk(KERN_INFO "ipgre init: can't add protocol\n"); 1669 goto add_proto_failed; 1670 } 1671 1672 err = rtnl_link_register(&ipgre_link_ops); 1673 if (err < 0) 1674 goto rtnl_link_failed; 1675 1676 err = rtnl_link_register(&ipgre_tap_ops); 1677 if (err < 0) 1678 goto tap_ops_failed; 1679 1680 out: 1681 return err; 1682 1683 tap_ops_failed: 1684 rtnl_link_unregister(&ipgre_link_ops); 1685 rtnl_link_failed: 1686 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); 1687 add_proto_failed: 1688 unregister_pernet_device(&ipgre_net_ops); 1689 goto out; 1690 } 1691 1692 static void __exit ipgre_fini(void) 1693 { 1694 rtnl_link_unregister(&ipgre_tap_ops); 1695 rtnl_link_unregister(&ipgre_link_ops); 1696 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) 1697 printk(KERN_INFO "ipgre close: can't remove protocol\n"); 1698 unregister_pernet_device(&ipgre_net_ops); 1699 } 1700 1701 module_init(ipgre_init); 1702 module_exit(ipgre_fini); 1703 MODULE_LICENSE("GPL"); 1704 MODULE_ALIAS_RTNL_LINK("gre"); 1705 MODULE_ALIAS_RTNL_LINK("gretap"); 1706