xref: /linux/net/ipv4/ip_gre.c (revision 765532c8aaac624b5f8687af6d319c6a1138a257)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33 
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47 #include <net/gre.h>
48 
49 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #endif
54 
55 /*
56    Problems & solutions
57    --------------------
58 
59    1. The most important issue is detecting local dead loops.
60    They would cause complete host lockup in transmit, which
61    would be "resolved" by stack overflow or, if queueing is enabled,
62    by infinite looping in net_bh.
63 
64    We cannot track such dead loops during route installation,
65    it is an infeasible task. The most general solution would be
66    to keep an skb->encapsulation counter (a sort of local ttl),
67    and silently drop the packet when it expires. It is a good
68    solution, but it supposes maintaining a new variable in ALL
69    skbs, even if no tunneling is used.
70 
71    Current solution: xmit_recursion breaks dead loops. This is a percpu
72    counter, safe because cpu migration is forbidden once we enter the
73    first ndo_start_xmit(). We force an exit if it reaches RECURSION_LIMIT.
74 
75    2. Networking dead loops would not kill routers, but would really
76    kill the network. The IP hop limit plays the role of "t->recursion"
77    here, if we copy it from the packet being encapsulated to the upper
78    header. It is a very good solution, but it introduces two problems:
79 
80    - Routing protocols using packets with ttl=1 (OSPF, RIP2)
81      do not work over tunnels.
82    - traceroute does not work. I planned to relay ICMP from the tunnel,
83      so that this problem would be solved and traceroute output would
84      be even more informative. This idea appeared to be wrong: only
85      Linux complies with rfc1812 now (yes, guys, Linux is the only
86      true router now :-)); all other routers (at least in my
87      neighbourhood) return only 8 bytes of payload. That is the end of it.
88 
89    Hence, if we want OSPF to work or traceroute to say something
90    reasonable, we should search for another solution.
91 
92    One of them is to parse the packet, trying to detect inner encapsulation
93    made by our node. It is difficult or even impossible, especially
94    taking fragmentation into account. To be short, it is not a solution at all.
95 
96    Current solution: the solution was UNEXPECTEDLY SIMPLE.
97    We force the DF flag on tunnels with a preconfigured hop limit,
98    that is ALL. :-) Well, it does not remove the problem completely,
99    but exponential growth of network traffic is changed to linear
100    (branches that exceed the pmtu are pruned) and the tunnel mtu
101    quickly degrades to a value <68, where looping stops.
102    Yes, it is not good if there exists a router in the loop
103    which does not force DF, even when encapsulated packets have DF set.
104    But it is not our problem! Nobody could accuse us; we did
105    all that we could. Even if it is your gated that injected the
106    fatal route into the network, even if it was you who configured the
107    fatal static route: you are innocent. :-)
108 
109 
110 
111    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112    practically identical code. It would be good to glue them
113    together, but it is not very evident how to make them modular.
114    sit is an integral part of IPv6, while ipip and gre are naturally
115    modular. We could extract the common parts (hash table, ioctl etc.)
116    into a separate module (ip_tunnel.c).
117 
118    Alexey Kuznetsov.
119  */
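/*
   For reference, a minimal sketch of the xmit_recursion guard mentioned
   in (1) above, as used on the generic transmit path of this era (names
   as in net/core/dev.c; the exact limit value is illustrative):

	static DEFINE_PER_CPU(int, xmit_recursion);

	if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
		goto recursion_alert;
	__this_cpu_inc(xmit_recursion);
	rc = ops->ndo_start_xmit(skb, dev);
	__this_cpu_dec(xmit_recursion);

   The percpu counter needs no locking because cpu migration is
   forbidden inside ndo_start_xmit(), as noted above.
 */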
120 
121 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
122 static int ipgre_tunnel_init(struct net_device *dev);
123 static void ipgre_tunnel_setup(struct net_device *dev);
124 static int ipgre_tunnel_bind_dev(struct net_device *dev);
125 
126 /* Fallback tunnel: no source, no destination, no key, no options */
127 
128 #define HASH_SIZE  16
129 
130 static int ipgre_net_id __read_mostly;
131 struct ipgre_net {
132 	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
133 
134 	struct net_device *fb_tunnel_dev;
135 };
136 
137 /* Tunnel hash table */
138 
139 /*
140    4 hash tables:
141 
142    3: (remote,local)
143    2: (remote,*)
144    1: (*,local)
145    0: (*,*)
146 
147    We require an exact key match, i.e. if a key is present in the packet
148    it will match only a tunnel with the same key; if it is not present,
149    it will match only a keyless tunnel.
150 
151    All keyless packets, if they do not match a configured keyless
152    tunnel, will match the fallback tunnel.
153  */
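/*
   For example, given

	ip tunnel add t1 mode gre remote 10.1.1.1 local 10.2.2.2 key 7
	ip tunnel add t2 mode gre remote 10.1.1.1

   a GRE packet from 10.1.1.1 to 10.2.2.2 carrying key 7 matches t1 in
   tunnels[3] (remote,local); the same packet without a key matches t2
   in tunnels[2] (remote,*); a keyless packet from any other peer falls
   through to the fallback tunnel gre0.
 */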
154 
155 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
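/* HASH() folds the low 8 bits of the raw value into one of the
   HASH_SIZE (16) buckets. */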
156 
157 #define tunnels_r_l	tunnels[3]
158 #define tunnels_r	tunnels[2]
159 #define tunnels_l	tunnels[1]
160 #define tunnels_wc	tunnels[0]
161 /*
162  * Locking : hash tables are protected by RCU and RTNL
163  */
164 
165 #define for_each_ip_tunnel_rcu(start) \
166 	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167 
168 /* often modified stats are per cpu, others are shared (netdev->stats) */
169 struct pcpu_tstats {
170 	unsigned long	rx_packets;
171 	unsigned long	rx_bytes;
172 	unsigned long	tx_packets;
173 	unsigned long	tx_bytes;
174 };
175 
176 static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177 {
178 	struct pcpu_tstats sum = { 0 };
179 	int i;
180 
181 	for_each_possible_cpu(i) {
182 		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183 
184 		sum.rx_packets += tstats->rx_packets;
185 		sum.rx_bytes   += tstats->rx_bytes;
186 		sum.tx_packets += tstats->tx_packets;
187 		sum.tx_bytes   += tstats->tx_bytes;
188 	}
189 	dev->stats.rx_packets = sum.rx_packets;
190 	dev->stats.rx_bytes   = sum.rx_bytes;
191 	dev->stats.tx_packets = sum.tx_packets;
192 	dev->stats.tx_bytes   = sum.tx_bytes;
193 	return &dev->stats;
194 }
195 
196 /* Given src, dst and key, find the appropriate tunnel for input. */
197 
198 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
199 					      __be32 remote, __be32 local,
200 					      __be32 key, __be16 gre_proto)
201 {
202 	struct net *net = dev_net(dev);
203 	int link = dev->ifindex;
204 	unsigned int h0 = HASH(remote);
205 	unsigned int h1 = HASH(key);
206 	struct ip_tunnel *t, *cand = NULL;
207 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
208 	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
209 		       ARPHRD_ETHER : ARPHRD_IPGRE;
210 	int score, cand_score = 4;
211 
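	/* An exact match on both link and device type returns at once
	 * (score 0); otherwise remember the closest candidate: a link
	 * mismatch costs 1, a type mismatch costs 2, and cand_score
	 * starts at 4 so any partial match beats no match at all. */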
212 	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
213 		if (local != t->parms.iph.saddr ||
214 		    remote != t->parms.iph.daddr ||
215 		    key != t->parms.i_key ||
216 		    !(t->dev->flags & IFF_UP))
217 			continue;
218 
219 		if (t->dev->type != ARPHRD_IPGRE &&
220 		    t->dev->type != dev_type)
221 			continue;
222 
223 		score = 0;
224 		if (t->parms.link != link)
225 			score |= 1;
226 		if (t->dev->type != dev_type)
227 			score |= 2;
228 		if (score == 0)
229 			return t;
230 
231 		if (score < cand_score) {
232 			cand = t;
233 			cand_score = score;
234 		}
235 	}
236 
237 	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
238 		if (remote != t->parms.iph.daddr ||
239 		    key != t->parms.i_key ||
240 		    !(t->dev->flags & IFF_UP))
241 			continue;
242 
243 		if (t->dev->type != ARPHRD_IPGRE &&
244 		    t->dev->type != dev_type)
245 			continue;
246 
247 		score = 0;
248 		if (t->parms.link != link)
249 			score |= 1;
250 		if (t->dev->type != dev_type)
251 			score |= 2;
252 		if (score == 0)
253 			return t;
254 
255 		if (score < cand_score) {
256 			cand = t;
257 			cand_score = score;
258 		}
259 	}
260 
261 	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
262 		if ((local != t->parms.iph.saddr &&
263 		     (local != t->parms.iph.daddr ||
264 		      !ipv4_is_multicast(local))) ||
265 		    key != t->parms.i_key ||
266 		    !(t->dev->flags & IFF_UP))
267 			continue;
268 
269 		if (t->dev->type != ARPHRD_IPGRE &&
270 		    t->dev->type != dev_type)
271 			continue;
272 
273 		score = 0;
274 		if (t->parms.link != link)
275 			score |= 1;
276 		if (t->dev->type != dev_type)
277 			score |= 2;
278 		if (score == 0)
279 			return t;
280 
281 		if (score < cand_score) {
282 			cand = t;
283 			cand_score = score;
284 		}
285 	}
286 
287 	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
288 		if (t->parms.i_key != key ||
289 		    !(t->dev->flags & IFF_UP))
290 			continue;
291 
292 		if (t->dev->type != ARPHRD_IPGRE &&
293 		    t->dev->type != dev_type)
294 			continue;
295 
296 		score = 0;
297 		if (t->parms.link != link)
298 			score |= 1;
299 		if (t->dev->type != dev_type)
300 			score |= 2;
301 		if (score == 0)
302 			return t;
303 
304 		if (score < cand_score) {
305 			cand = t;
306 			cand_score = score;
307 		}
308 	}
309 
310 	if (cand != NULL)
311 		return cand;
312 
313 	dev = ign->fb_tunnel_dev;
314 	if (dev->flags & IFF_UP)
315 		return netdev_priv(dev);
316 
317 	return NULL;
318 }
319 
320 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
321 		struct ip_tunnel_parm *parms)
322 {
323 	__be32 remote = parms->iph.daddr;
324 	__be32 local = parms->iph.saddr;
325 	__be32 key = parms->i_key;
326 	unsigned int h = HASH(key);
327 	int prio = 0;
328 
329 	if (local)
330 		prio |= 1;
331 	if (remote && !ipv4_is_multicast(remote)) {
332 		prio |= 2;
333 		h ^= HASH(remote);
334 	}
335 
336 	return &ign->tunnels[prio][h];
337 }
338 
339 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
340 		struct ip_tunnel *t)
341 {
342 	return __ipgre_bucket(ign, &t->parms);
343 }
344 
345 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
346 {
347 	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
348 
349 	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
350 	rcu_assign_pointer(*tp, t);
351 }
352 
353 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
354 {
355 	struct ip_tunnel __rcu **tp;
356 	struct ip_tunnel *iter;
357 
358 	for (tp = ipgre_bucket(ign, t);
359 	     (iter = rtnl_dereference(*tp)) != NULL;
360 	     tp = &iter->next) {
361 		if (t == iter) {
362 			rcu_assign_pointer(*tp, t->next);
363 			break;
364 		}
365 	}
366 }
367 
368 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
369 					   struct ip_tunnel_parm *parms,
370 					   int type)
371 {
372 	__be32 remote = parms->iph.daddr;
373 	__be32 local = parms->iph.saddr;
374 	__be32 key = parms->i_key;
375 	int link = parms->link;
376 	struct ip_tunnel *t;
377 	struct ip_tunnel __rcu **tp;
378 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
379 
380 	for (tp = __ipgre_bucket(ign, parms);
381 	     (t = rtnl_dereference(*tp)) != NULL;
382 	     tp = &t->next)
383 		if (local == t->parms.iph.saddr &&
384 		    remote == t->parms.iph.daddr &&
385 		    key == t->parms.i_key &&
386 		    link == t->parms.link &&
387 		    type == t->dev->type)
388 			break;
389 
390 	return t;
391 }
392 
393 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
394 		struct ip_tunnel_parm *parms, int create)
395 {
396 	struct ip_tunnel *t, *nt;
397 	struct net_device *dev;
398 	char name[IFNAMSIZ];
399 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
400 
401 	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
402 	if (t || !create)
403 		return t;
404 
405 	if (parms->name[0])
406 		strlcpy(name, parms->name, IFNAMSIZ);
407 	else
408 		sprintf(name, "gre%%d");
409 
410 	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
411 	if (!dev)
412 	  return NULL;
413 		return NULL;
414 	dev_net_set(dev, net);
415 
416 	if (strchr(name, '%')) {
417 		if (dev_alloc_name(dev, name) < 0)
418 			goto failed_free;
419 	}
420 
421 	nt = netdev_priv(dev);
422 	nt->parms = *parms;
423 	dev->rtnl_link_ops = &ipgre_link_ops;
424 
425 	dev->mtu = ipgre_tunnel_bind_dev(dev);
426 
427 	if (register_netdevice(dev) < 0)
428 		goto failed_free;
429 
430 	dev_hold(dev);
431 	ipgre_tunnel_link(ign, nt);
432 	return nt;
433 
434 failed_free:
435 	free_netdev(dev);
436 	return NULL;
437 }
438 
439 static void ipgre_tunnel_uninit(struct net_device *dev)
440 {
441 	struct net *net = dev_net(dev);
442 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
443 
444 	ipgre_tunnel_unlink(ign, netdev_priv(dev));
445 	dev_put(dev);
446 }
447 
448 
449 static void ipgre_err(struct sk_buff *skb, u32 info)
450 {
451 
452 /* All the routers (except for Linux) return only
453    8 bytes of packet payload. It means that precise relaying of
454    ICMP in the real Internet is absolutely infeasible.
455 
456    Moreover, Cisco "wise men" put the GRE key in the third word
457    of the GRE header. That makes it impossible to maintain even soft
458    state for keyed GRE tunnels with checksums enabled. Tell them "thank you".
459 
460    Well, I wonder: rfc1812 was written by a Cisco employee,
461    so what the hell makes these idiots break standards established
462    by themselves???
463  */
464 
465 	struct iphdr *iph = (struct iphdr *)skb->data;
466 	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
467 	int grehlen = (iph->ihl<<2) + 4;
468 	const int type = icmp_hdr(skb)->type;
469 	const int code = icmp_hdr(skb)->code;
470 	struct ip_tunnel *t;
471 	__be16 flags;
472 
473 	flags = p[0];
474 	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
475 		if (flags&(GRE_VERSION|GRE_ROUTING))
476 			return;
477 		if (flags&GRE_KEY) {
478 			grehlen += 4;
479 			if (flags&GRE_CSUM)
480 				grehlen += 4;
481 		}
482 	}
483 
484 	/* If only 8 bytes were returned, a keyed message will be dropped here */
485 	if (skb_headlen(skb) < grehlen)
486 		return;
487 
488 	switch (type) {
489 	default:
490 	case ICMP_PARAMETERPROB:
491 		return;
492 
493 	case ICMP_DEST_UNREACH:
494 		switch (code) {
495 		case ICMP_SR_FAILED:
496 		case ICMP_PORT_UNREACH:
497 			/* Impossible event. */
498 			return;
499 		case ICMP_FRAG_NEEDED:
500 			/* Soft state for pmtu is maintained by IP core. */
501 			return;
502 		default:
503 			/* All others are translated to HOST_UNREACH.
504 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
505 			   I believe they are just ether pollution. --ANK
506 			 */
507 			break;
508 		}
509 		break;
510 	case ICMP_TIME_EXCEEDED:
511 		if (code != ICMP_EXC_TTL)
512 			return;
513 		break;
514 	}
515 
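	/* Credit the error to the right device: look the tunnel up by the
	 * addresses of the offending packet, its key (when the GRE header
	 * carried one) and its protocol. */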
516 	rcu_read_lock();
517 	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
518 				flags & GRE_KEY ?
519 				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
520 				p[1]);
521 	if (t == NULL || t->parms.iph.daddr == 0 ||
522 	    ipv4_is_multicast(t->parms.iph.daddr))
523 		goto out;
524 
525 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
526 		goto out;
527 
528 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
529 		t->err_count++;
530 	else
531 		t->err_count = 1;
532 	t->err_time = jiffies;
533 out:
534 	rcu_read_unlock();
535 }
536 
537 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
538 {
539 	if (INET_ECN_is_ce(iph->tos)) {
540 		if (skb->protocol == htons(ETH_P_IP)) {
541 			IP_ECN_set_ce(ip_hdr(skb));
542 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
543 			IP6_ECN_set_ce(ipv6_hdr(skb));
544 		}
545 	}
546 }
547 
548 static inline u8
549 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
550 {
551 	u8 inner = 0;
552 	if (skb->protocol == htons(ETH_P_IP))
553 		inner = old_iph->tos;
554 	else if (skb->protocol == htons(ETH_P_IPV6))
555 		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
556 	return INET_ECN_encapsulate(tos, inner);
557 }
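/* ECN handling: on decapsulation a CE mark on the outer header is
   propagated to the inner packet; on encapsulation the inner ECN field
   is folded into the outer TOS via INET_ECN_encapsulate(). */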
558 
559 static int ipgre_rcv(struct sk_buff *skb)
560 {
561 	struct iphdr *iph;
562 	u8     *h;
563 	__be16    flags;
564 	__sum16   csum = 0;
565 	__be32 key = 0;
566 	u32    seqno = 0;
567 	struct ip_tunnel *tunnel;
568 	int    offset = 4;
569 	__be16 gre_proto;
570 
571 	if (!pskb_may_pull(skb, 16))
572 		goto drop_nolock;
573 
574 	iph = ip_hdr(skb);
575 	h = skb->data;
576 	flags = *(__be16*)h;
577 
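	/* Base GRE header: 16 bits of flags and a 16-bit protocol, then
	 * the optional 32-bit checksum(+offset), key and sequence words,
	 * in that order (rfc2784/rfc2890) - hence offset starts at 4 and
	 * grows by 4 per present option below. */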
578 	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
579 		/* - Version must be 0.
580 		   - We do not support routing headers.
581 		 */
582 		if (flags&(GRE_VERSION|GRE_ROUTING))
583 			goto drop_nolock;
584 
585 		if (flags&GRE_CSUM) {
586 			switch (skb->ip_summed) {
587 			case CHECKSUM_COMPLETE:
588 				csum = csum_fold(skb->csum);
589 				if (!csum)
590 					break;
591 				/* fall through */
592 			case CHECKSUM_NONE:
593 				skb->csum = 0;
594 				csum = __skb_checksum_complete(skb);
595 				skb->ip_summed = CHECKSUM_COMPLETE;
596 			}
597 			offset += 4;
598 		}
599 		if (flags&GRE_KEY) {
600 			key = *(__be32*)(h + offset);
601 			offset += 4;
602 		}
603 		if (flags&GRE_SEQ) {
604 			seqno = ntohl(*(__be32*)(h + offset));
605 			offset += 4;
606 		}
607 	}
608 
609 	gre_proto = *(__be16 *)(h + 2);
610 
611 	rcu_read_lock();
612 	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
613 					  iph->saddr, iph->daddr, key,
614 					  gre_proto))) {
615 		struct pcpu_tstats *tstats;
616 
617 		secpath_reset(skb);
618 
619 		skb->protocol = gre_proto;
620 		/* WCCP version 1 and 2 protocol decoding.
621 		 * - Change protocol to IP
622 		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
623 		 */
624 		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
625 			skb->protocol = htons(ETH_P_IP);
626 			if ((*(h + offset) & 0xF0) != 0x40)
627 				offset += 4;
628 		}
629 
630 		skb->mac_header = skb->network_header;
631 		__pskb_pull(skb, offset);
632 		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
633 		skb->pkt_type = PACKET_HOST;
634 #ifdef CONFIG_NET_IPGRE_BROADCAST
635 		if (ipv4_is_multicast(iph->daddr)) {
636 			/* Looped back packet, drop it! */
637 			if (skb_rtable(skb)->fl.iif == 0)
638 				goto drop;
639 			tunnel->dev->stats.multicast++;
640 			skb->pkt_type = PACKET_BROADCAST;
641 		}
642 #endif
643 
644 		if (((flags&GRE_CSUM) && csum) ||
645 		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
646 			tunnel->dev->stats.rx_crc_errors++;
647 			tunnel->dev->stats.rx_errors++;
648 			goto drop;
649 		}
650 		if (tunnel->parms.i_flags&GRE_SEQ) {
651 			if (!(flags&GRE_SEQ) ||
652 			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
653 				tunnel->dev->stats.rx_fifo_errors++;
654 				tunnel->dev->stats.rx_errors++;
655 				goto drop;
656 			}
657 			tunnel->i_seqno = seqno + 1;
658 		}
659 
660 		/* Warning: All skb pointers will be invalidated! */
661 		if (tunnel->dev->type == ARPHRD_ETHER) {
662 			if (!pskb_may_pull(skb, ETH_HLEN)) {
663 				tunnel->dev->stats.rx_length_errors++;
664 				tunnel->dev->stats.rx_errors++;
665 				goto drop;
666 			}
667 
668 			iph = ip_hdr(skb);
669 			skb->protocol = eth_type_trans(skb, tunnel->dev);
670 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
671 		}
672 
673 		tstats = this_cpu_ptr(tunnel->dev->tstats);
674 		tstats->rx_packets++;
675 		tstats->rx_bytes += skb->len;
676 
677 		__skb_tunnel_rx(skb, tunnel->dev);
678 
679 		skb_reset_network_header(skb);
680 		ipgre_ecn_decapsulate(iph, skb);
681 
682 		netif_rx(skb);
683 
684 		rcu_read_unlock();
685 		return 0;
686 	}
687 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
688 
689 drop:
690 	rcu_read_unlock();
691 drop_nolock:
692 	kfree_skb(skb);
693 	return 0;
694 }
695 
696 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
697 {
698 	struct ip_tunnel *tunnel = netdev_priv(dev);
699 	struct pcpu_tstats *tstats;
700 	struct iphdr  *old_iph = ip_hdr(skb);
701 	struct iphdr  *tiph;
702 	u8     tos;
703 	__be16 df;
704 	struct rtable *rt;     			/* Route to the other host */
705 	struct net_device *tdev;		/* Device to other host */
706 	struct iphdr  *iph;			/* Our new IP header */
707 	unsigned int max_headroom;		/* The extra header space needed */
708 	int    gre_hlen;
709 	__be32 dst;
710 	int    mtu;
711 
712 	if (dev->type == ARPHRD_ETHER)
713 		IPCB(skb)->flags = 0;
714 
715 	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
716 		gre_hlen = 0;
717 		tiph = (struct iphdr *)skb->data;
718 	} else {
719 		gre_hlen = tunnel->hlen;
720 		tiph = &tunnel->parms.iph;
721 	}
722 
723 	if ((dst = tiph->daddr) == 0) {
724 		/* NBMA tunnel */
725 
726 		if (skb_dst(skb) == NULL) {
727 			dev->stats.tx_fifo_errors++;
728 			goto tx_error;
729 		}
730 
731 		if (skb->protocol == htons(ETH_P_IP)) {
732 			rt = skb_rtable(skb);
733 			if ((dst = rt->rt_gateway) == 0)
734 				goto tx_error_icmp;
735 		}
736 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
737 		else if (skb->protocol == htons(ETH_P_IPV6)) {
738 			struct in6_addr *addr6;
739 			int addr_type;
740 			struct neighbour *neigh = skb_dst(skb)->neighbour;
741 
742 			if (neigh == NULL)
743 				goto tx_error;
744 
745 			addr6 = (struct in6_addr *)&neigh->primary_key;
746 			addr_type = ipv6_addr_type(addr6);
747 
748 			if (addr_type == IPV6_ADDR_ANY) {
749 				addr6 = &ipv6_hdr(skb)->daddr;
750 				addr_type = ipv6_addr_type(addr6);
751 			}
752 
753 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
754 				goto tx_error_icmp;
755 
756 			dst = addr6->s6_addr32[3];
757 		}
758 #endif
759 		else
760 			goto tx_error;
761 	}
762 
763 	tos = tiph->tos;
764 	if (tos == 1) {
765 		tos = 0;
766 		if (skb->protocol == htons(ETH_P_IP))
767 			tos = old_iph->tos;
768 		else if (skb->protocol == htons(ETH_P_IPV6))
769 			tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
770 	}
771 
772 	{
773 		struct flowi fl = {
774 			.oif = tunnel->parms.link,
775 			.nl_u = {
776 				.ip4_u = {
777 					.daddr = dst,
778 					.saddr = tiph->saddr,
779 					.tos = RT_TOS(tos)
780 				}
781 			},
782 			.proto = IPPROTO_GRE
783 		};
784 
785 		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
786 			dev->stats.tx_carrier_errors++;
787 			goto tx_error;
788 		}
789 	}
790 	tdev = rt->dst.dev;
791 
792 	if (tdev == dev) {
793 		ip_rt_put(rt);
794 		dev->stats.collisions++;
795 		goto tx_error;
796 	}
797 
798 	df = tiph->frag_off;
799 	if (df)
800 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
801 	else
802 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
803 
804 	if (skb_dst(skb))
805 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
806 
807 	if (skb->protocol == htons(ETH_P_IP)) {
808 		df |= (old_iph->frag_off&htons(IP_DF));
809 
810 		if ((old_iph->frag_off&htons(IP_DF)) &&
811 		    mtu < ntohs(old_iph->tot_len)) {
812 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
813 			ip_rt_put(rt);
814 			goto tx_error;
815 		}
816 	}
817 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
818 	else if (skb->protocol == htons(ETH_P_IPV6)) {
819 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
820 
821 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
822 			if ((tunnel->parms.iph.daddr &&
823 			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
824 			    rt6->rt6i_dst.plen == 128) {
825 				rt6->rt6i_flags |= RTF_MODIFIED;
826 				skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
827 			}
828 		}
829 
830 		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
831 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
832 			ip_rt_put(rt);
833 			goto tx_error;
834 		}
835 	}
836 #endif
837 
838 	if (tunnel->err_count > 0) {
839 		if (time_before(jiffies,
840 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
841 			tunnel->err_count--;
842 
843 			dst_link_failure(skb);
844 		} else
845 			tunnel->err_count = 0;
846 	}
847 
848 	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
849 
850 	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
851 	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
852 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
853 		if (max_headroom > dev->needed_headroom)
854 			dev->needed_headroom = max_headroom;
855 		if (!new_skb) {
856 			ip_rt_put(rt);
857 			dev->stats.tx_dropped++;
858 			dev_kfree_skb(skb);
859 			return NETDEV_TX_OK;
860 		}
861 		if (skb->sk)
862 			skb_set_owner_w(new_skb, skb->sk);
863 		dev_kfree_skb(skb);
864 		skb = new_skb;
865 		old_iph = ip_hdr(skb);
866 	}
867 
868 	skb_reset_transport_header(skb);
869 	skb_push(skb, gre_hlen);
870 	skb_reset_network_header(skb);
871 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
872 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
873 			      IPSKB_REROUTED);
874 	skb_dst_drop(skb);
875 	skb_dst_set(skb, &rt->dst);
876 
877 	/*
878 	 *	Push down and install the outer IP header.
879 	 */
880 
881 	iph 			=	ip_hdr(skb);
882 	iph->version		=	4;
883 	iph->ihl		=	sizeof(struct iphdr) >> 2;
884 	iph->frag_off		=	df;
885 	iph->protocol		=	IPPROTO_GRE;
886 	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
887 	iph->daddr		=	rt->rt_dst;
888 	iph->saddr		=	rt->rt_src;
889 
890 	if ((iph->ttl = tiph->ttl) == 0) {
891 		if (skb->protocol == htons(ETH_P_IP))
892 			iph->ttl = old_iph->ttl;
893 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
894 		else if (skb->protocol == htons(ETH_P_IPV6))
895 			iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
896 #endif
897 		else
898 			iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
899 	}
900 
901 	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
902 	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
903 				   htons(ETH_P_TEB) : skb->protocol;
904 
905 	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
906 		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
907 
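		/* On the wire the optional words follow in CSUM, KEY, SEQ
		 * order; ptr starts at the last 32-bit word of the header
		 * and the fields are filled back to front. */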
908 		if (tunnel->parms.o_flags&GRE_SEQ) {
909 			++tunnel->o_seqno;
910 			*ptr = htonl(tunnel->o_seqno);
911 			ptr--;
912 		}
913 		if (tunnel->parms.o_flags&GRE_KEY) {
914 			*ptr = tunnel->parms.o_key;
915 			ptr--;
916 		}
917 		if (tunnel->parms.o_flags&GRE_CSUM) {
918 			*ptr = 0;
919 			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
920 		}
921 	}
922 
923 	nf_reset(skb);
924 	tstats = this_cpu_ptr(dev->tstats);
925 	__IPTUNNEL_XMIT(tstats, &dev->stats);
926 	return NETDEV_TX_OK;
927 
928 tx_error_icmp:
929 	dst_link_failure(skb);
930 
931 tx_error:
932 	dev->stats.tx_errors++;
933 	dev_kfree_skb(skb);
934 	return NETDEV_TX_OK;
935 }
936 
937 static int ipgre_tunnel_bind_dev(struct net_device *dev)
938 {
939 	struct net_device *tdev = NULL;
940 	struct ip_tunnel *tunnel;
941 	struct iphdr *iph;
942 	int hlen = LL_MAX_HEADER;
943 	int mtu = ETH_DATA_LEN;
944 	int addend = sizeof(struct iphdr) + 4;
945 
946 	tunnel = netdev_priv(dev);
947 	iph = &tunnel->parms.iph;
948 
949 	/* Guess output device to choose reasonable mtu and needed_headroom */
950 
951 	if (iph->daddr) {
952 		struct flowi fl = {
953 			.oif = tunnel->parms.link,
954 			.nl_u = {
955 				.ip4_u = {
956 					.daddr = iph->daddr,
957 					.saddr = iph->saddr,
958 					.tos = RT_TOS(iph->tos)
959 				}
960 			},
961 			.proto = IPPROTO_GRE
962 		};
963 		struct rtable *rt;
964 
965 		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
966 			tdev = rt->dst.dev;
967 			ip_rt_put(rt);
968 		}
969 
970 		if (dev->type != ARPHRD_ETHER)
971 			dev->flags |= IFF_POINTOPOINT;
972 	}
973 
974 	if (!tdev && tunnel->parms.link)
975 		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
976 
977 	if (tdev) {
978 		hlen = tdev->hard_header_len + tdev->needed_headroom;
979 		mtu = tdev->mtu;
980 	}
981 	dev->iflink = tunnel->parms.link;
982 
983 	/* Precalculate GRE options length */
984 	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
985 		if (tunnel->parms.o_flags&GRE_CSUM)
986 			addend += 4;
987 		if (tunnel->parms.o_flags&GRE_KEY)
988 			addend += 4;
989 		if (tunnel->parms.o_flags&GRE_SEQ)
990 			addend += 4;
991 	}
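	/* e.g. with CSUM|KEY|SEQ all set: addend = 20 + 4 + 3*4 = 36 bytes
	 * of GRE encapsulation overhead per packet. */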
992 	dev->needed_headroom = addend + hlen;
993 	mtu -= dev->hard_header_len + addend;
994 
995 	if (mtu < 68)
996 		mtu = 68;
997 
998 	tunnel->hlen = addend;
999 
1000 	return mtu;
1001 }
1002 
1003 static int
1004 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1005 {
1006 	int err = 0;
1007 	struct ip_tunnel_parm p;
1008 	struct ip_tunnel *t;
1009 	struct net *net = dev_net(dev);
1010 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1011 
1012 	switch (cmd) {
1013 	case SIOCGETTUNNEL:
1014 		t = NULL;
1015 		if (dev == ign->fb_tunnel_dev) {
1016 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1017 				err = -EFAULT;
1018 				break;
1019 			}
1020 			t = ipgre_tunnel_locate(net, &p, 0);
1021 		}
1022 		if (t == NULL)
1023 			t = netdev_priv(dev);
1024 		memcpy(&p, &t->parms, sizeof(p));
1025 		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1026 			err = -EFAULT;
1027 		break;
1028 
1029 	case SIOCADDTUNNEL:
1030 	case SIOCCHGTUNNEL:
1031 		err = -EPERM;
1032 		if (!capable(CAP_NET_ADMIN))
1033 			goto done;
1034 
1035 		err = -EFAULT;
1036 		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1037 			goto done;
1038 
1039 		err = -EINVAL;
1040 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1041 		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1042 		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1043 			goto done;
1044 		if (p.iph.ttl)
1045 			p.iph.frag_off |= htons(IP_DF);
1046 
1047 		if (!(p.i_flags&GRE_KEY))
1048 			p.i_key = 0;
1049 		if (!(p.o_flags&GRE_KEY))
1050 			p.o_key = 0;
1051 
1052 		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1053 
1054 		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1055 			if (t != NULL) {
1056 				if (t->dev != dev) {
1057 					err = -EEXIST;
1058 					break;
1059 				}
1060 			} else {
1061 				unsigned int nflags = 0;
1062 
1063 				t = netdev_priv(dev);
1064 
1065 				if (ipv4_is_multicast(p.iph.daddr))
1066 					nflags = IFF_BROADCAST;
1067 				else if (p.iph.daddr)
1068 					nflags = IFF_POINTOPOINT;
1069 
1070 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1071 					err = -EINVAL;
1072 					break;
1073 				}
1074 				ipgre_tunnel_unlink(ign, t);
1075 				synchronize_net();
1076 				t->parms.iph.saddr = p.iph.saddr;
1077 				t->parms.iph.daddr = p.iph.daddr;
1078 				t->parms.i_key = p.i_key;
1079 				t->parms.o_key = p.o_key;
1080 				memcpy(dev->dev_addr, &p.iph.saddr, 4);
1081 				memcpy(dev->broadcast, &p.iph.daddr, 4);
1082 				ipgre_tunnel_link(ign, t);
1083 				netdev_state_change(dev);
1084 			}
1085 		}
1086 
1087 		if (t) {
1088 			err = 0;
1089 			if (cmd == SIOCCHGTUNNEL) {
1090 				t->parms.iph.ttl = p.iph.ttl;
1091 				t->parms.iph.tos = p.iph.tos;
1092 				t->parms.iph.frag_off = p.iph.frag_off;
1093 				if (t->parms.link != p.link) {
1094 					t->parms.link = p.link;
1095 					dev->mtu = ipgre_tunnel_bind_dev(dev);
1096 					netdev_state_change(dev);
1097 				}
1098 			}
1099 			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1100 				err = -EFAULT;
1101 		} else
1102 			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1103 		break;
1104 
1105 	case SIOCDELTUNNEL:
1106 		err = -EPERM;
1107 		if (!capable(CAP_NET_ADMIN))
1108 			goto done;
1109 
1110 		if (dev == ign->fb_tunnel_dev) {
1111 			err = -EFAULT;
1112 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1113 				goto done;
1114 			err = -ENOENT;
1115 			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1116 				goto done;
1117 			err = -EPERM;
1118 			if (t == netdev_priv(ign->fb_tunnel_dev))
1119 				goto done;
1120 			dev = t->dev;
1121 		}
1122 		unregister_netdevice(dev);
1123 		err = 0;
1124 		break;
1125 
1126 	default:
1127 		err = -EINVAL;
1128 	}
1129 
1130 done:
1131 	return err;
1132 }
1133 
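/* 68 is the minimum IPv4 MTU (rfc791); 0xFFF8 is the largest 8-byte
 * aligned size below the 64K IP total-length limit, less the room the
 * link-layer and GRE headers need. */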
1134 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1135 {
1136 	struct ip_tunnel *tunnel = netdev_priv(dev);
1137 	if (new_mtu < 68 ||
1138 	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1139 		return -EINVAL;
1140 	dev->mtu = new_mtu;
1141 	return 0;
1142 }
1143 
1144 /* Nice toy. Unfortunately, useless in real life :-)
1145    It allows one to construct a virtual multiprotocol broadcast "LAN"
1146    over the Internet, provided multicast routing is tuned.
1147 
1148 
1149    I have no idea whether this bicycle was invented before me,
1150    so I had to set ARPHRD_IPGRE to a random value.
1151    I have an impression that Cisco could make something similar,
1152    but this feature is apparently missing in IOS<=11.2(8).
1153 
1154    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1155    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1156 
1157    ping -t 255 224.66.66.66
1158 
1159    If nobody answers, mbone does not work.
1160 
1161    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1162    ip addr add 10.66.66.<somewhat>/24 dev Universe
1163    ifconfig Universe up
1164    ifconfig Universe add fe80::<Your_real_addr>/10
1165    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1166    ftp 10.66.66.66
1167    ...
1168    ftp fec0:6666:6666::193.233.7.65
1169    ...
1170 
1171  */
1172 
1173 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1174 			unsigned short type,
1175 			const void *daddr, const void *saddr, unsigned int len)
1176 {
1177 	struct ip_tunnel *t = netdev_priv(dev);
1178 	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1179 	__be16 *p = (__be16*)(iph+1);
1180 
1181 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1182 	p[0]		= t->parms.o_flags;
1183 	p[1]		= htons(type);
1184 
1185 	/*
1186 	 *	Set the source hardware address.
1187 	 */
1188 
1189 	if (saddr)
1190 		memcpy(&iph->saddr, saddr, 4);
1191 	if (daddr)
1192 		memcpy(&iph->daddr, daddr, 4);
1193 	if (iph->daddr)
1194 		return t->hlen;
1195 
1196 	return -t->hlen;
1197 }
1198 
1199 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1200 {
1201 	struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1202 	memcpy(haddr, &iph->saddr, 4);
1203 	return 4;
1204 }
1205 
1206 static const struct header_ops ipgre_header_ops = {
1207 	.create	= ipgre_header,
1208 	.parse	= ipgre_header_parse,
1209 };
1210 
1211 #ifdef CONFIG_NET_IPGRE_BROADCAST
1212 static int ipgre_open(struct net_device *dev)
1213 {
1214 	struct ip_tunnel *t = netdev_priv(dev);
1215 
1216 	if (ipv4_is_multicast(t->parms.iph.daddr)) {
1217 		struct flowi fl = {
1218 			.oif = t->parms.link,
1219 			.nl_u = {
1220 				.ip4_u = {
1221 					.daddr = t->parms.iph.daddr,
1222 					.saddr = t->parms.iph.saddr,
1223 					.tos = RT_TOS(t->parms.iph.tos)
1224 				}
1225 			},
1226 			.proto = IPPROTO_GRE
1227 		};
1228 		struct rtable *rt;
1229 
1230 		if (ip_route_output_key(dev_net(dev), &rt, &fl))
1231 			return -EADDRNOTAVAIL;
1232 		dev = rt->dst.dev;
1233 		ip_rt_put(rt);
1234 		if (__in_dev_get_rtnl(dev) == NULL)
1235 			return -EADDRNOTAVAIL;
1236 		t->mlink = dev->ifindex;
1237 		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1238 	}
1239 	return 0;
1240 }
1241 
1242 static int ipgre_close(struct net_device *dev)
1243 {
1244 	struct ip_tunnel *t = netdev_priv(dev);
1245 
1246 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1247 		struct in_device *in_dev;
1248 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1249 		if (in_dev)
1250 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1251 	}
1252 	return 0;
1253 }
1254 
1255 #endif
1256 
1257 static const struct net_device_ops ipgre_netdev_ops = {
1258 	.ndo_init		= ipgre_tunnel_init,
1259 	.ndo_uninit		= ipgre_tunnel_uninit,
1260 #ifdef CONFIG_NET_IPGRE_BROADCAST
1261 	.ndo_open		= ipgre_open,
1262 	.ndo_stop		= ipgre_close,
1263 #endif
1264 	.ndo_start_xmit		= ipgre_tunnel_xmit,
1265 	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
1266 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
1267 	.ndo_get_stats		= ipgre_get_stats,
1268 };
1269 
1270 static void ipgre_dev_free(struct net_device *dev)
1271 {
1272 	free_percpu(dev->tstats);
1273 	free_netdev(dev);
1274 }
1275 
1276 static void ipgre_tunnel_setup(struct net_device *dev)
1277 {
1278 	dev->netdev_ops		= &ipgre_netdev_ops;
1279 	dev->destructor 	= ipgre_dev_free;
1280 
1281 	dev->type		= ARPHRD_IPGRE;
1282 	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1283 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1284 	dev->flags		= IFF_NOARP;
1285 	dev->iflink		= 0;
1286 	dev->addr_len		= 4;
1287 	dev->features		|= NETIF_F_NETNS_LOCAL;
1288 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
1289 }
1290 
1291 static int ipgre_tunnel_init(struct net_device *dev)
1292 {
1293 	struct ip_tunnel *tunnel;
1294 	struct iphdr *iph;
1295 
1296 	tunnel = netdev_priv(dev);
1297 	iph = &tunnel->parms.iph;
1298 
1299 	tunnel->dev = dev;
1300 	strcpy(tunnel->parms.name, dev->name);
1301 
1302 	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1303 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1304 
1305 	if (iph->daddr) {
1306 #ifdef CONFIG_NET_IPGRE_BROADCAST
1307 		if (ipv4_is_multicast(iph->daddr)) {
1308 			if (!iph->saddr)
1309 				return -EINVAL;
1310 			dev->flags = IFF_BROADCAST;
1311 			dev->header_ops = &ipgre_header_ops;
1312 		}
1313 #endif
1314 	} else
1315 		dev->header_ops = &ipgre_header_ops;
1316 
1317 	dev->tstats = alloc_percpu(struct pcpu_tstats);
1318 	if (!dev->tstats)
1319 		return -ENOMEM;
1320 
1321 	return 0;
1322 }
1323 
1324 static void ipgre_fb_tunnel_init(struct net_device *dev)
1325 {
1326 	struct ip_tunnel *tunnel = netdev_priv(dev);
1327 	struct iphdr *iph = &tunnel->parms.iph;
1328 
1329 	tunnel->dev = dev;
1330 	strcpy(tunnel->parms.name, dev->name);
1331 
1332 	iph->version		= 4;
1333 	iph->protocol		= IPPROTO_GRE;
1334 	iph->ihl		= 5;
1335 	tunnel->hlen		= sizeof(struct iphdr) + 4;
1336 
1337 	dev_hold(dev);
1338 }
1339 
1340 
1341 static const struct gre_protocol ipgre_protocol = {
1342 	.handler     = ipgre_rcv,
1343 	.err_handler = ipgre_err,
1344 };
1345 
1346 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1347 {
1348 	int prio;
1349 
1350 	for (prio = 0; prio < 4; prio++) {
1351 		int h;
1352 		for (h = 0; h < HASH_SIZE; h++) {
1353 			struct ip_tunnel *t;
1354 
1355 			t = rtnl_dereference(ign->tunnels[prio][h]);
1356 
1357 			while (t != NULL) {
1358 				unregister_netdevice_queue(t->dev, head);
1359 				t = rtnl_dereference(t->next);
1360 			}
1361 		}
1362 	}
1363 }
1364 
1365 static int __net_init ipgre_init_net(struct net *net)
1366 {
1367 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1368 	int err;
1369 
1370 	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1371 					   ipgre_tunnel_setup);
1372 	if (!ign->fb_tunnel_dev) {
1373 		err = -ENOMEM;
1374 		goto err_alloc_dev;
1375 	}
1376 	dev_net_set(ign->fb_tunnel_dev, net);
1377 
1378 	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1379 	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1380 
1381 	if ((err = register_netdev(ign->fb_tunnel_dev)))
1382 		goto err_reg_dev;
1383 
1384 	rcu_assign_pointer(ign->tunnels_wc[0],
1385 			   netdev_priv(ign->fb_tunnel_dev));
1386 	return 0;
1387 
1388 err_reg_dev:
1389 	ipgre_dev_free(ign->fb_tunnel_dev);
1390 err_alloc_dev:
1391 	return err;
1392 }
1393 
1394 static void __net_exit ipgre_exit_net(struct net *net)
1395 {
1396 	struct ipgre_net *ign;
1397 	LIST_HEAD(list);
1398 
1399 	ign = net_generic(net, ipgre_net_id);
1400 	rtnl_lock();
1401 	ipgre_destroy_tunnels(ign, &list);
1402 	unregister_netdevice_many(&list);
1403 	rtnl_unlock();
1404 }
1405 
1406 static struct pernet_operations ipgre_net_ops = {
1407 	.init = ipgre_init_net,
1408 	.exit = ipgre_exit_net,
1409 	.id   = &ipgre_net_id,
1410 	.size = sizeof(struct ipgre_net),
1411 };
1412 
1413 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1414 {
1415 	__be16 flags;
1416 
1417 	if (!data)
1418 		return 0;
1419 
1420 	flags = 0;
1421 	if (data[IFLA_GRE_IFLAGS])
1422 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1423 	if (data[IFLA_GRE_OFLAGS])
1424 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1425 	if (flags & (GRE_VERSION|GRE_ROUTING))
1426 		return -EINVAL;
1427 
1428 	return 0;
1429 }
1430 
1431 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1432 {
1433 	__be32 daddr;
1434 
1435 	if (tb[IFLA_ADDRESS]) {
1436 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1437 			return -EINVAL;
1438 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1439 			return -EADDRNOTAVAIL;
1440 	}
1441 
1442 	if (!data)
1443 		goto out;
1444 
1445 	if (data[IFLA_GRE_REMOTE]) {
1446 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1447 		if (!daddr)
1448 			return -EINVAL;
1449 	}
1450 
1451 out:
1452 	return ipgre_tunnel_validate(tb, data);
1453 }
1454 
1455 static void ipgre_netlink_parms(struct nlattr *data[],
1456 				struct ip_tunnel_parm *parms)
1457 {
1458 	memset(parms, 0, sizeof(*parms));
1459 
1460 	parms->iph.protocol = IPPROTO_GRE;
1461 
1462 	if (!data)
1463 		return;
1464 
1465 	if (data[IFLA_GRE_LINK])
1466 		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1467 
1468 	if (data[IFLA_GRE_IFLAGS])
1469 		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1470 
1471 	if (data[IFLA_GRE_OFLAGS])
1472 		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1473 
1474 	if (data[IFLA_GRE_IKEY])
1475 		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1476 
1477 	if (data[IFLA_GRE_OKEY])
1478 		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1479 
1480 	if (data[IFLA_GRE_LOCAL])
1481 		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1482 
1483 	if (data[IFLA_GRE_REMOTE])
1484 		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1485 
1486 	if (data[IFLA_GRE_TTL])
1487 		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1488 
1489 	if (data[IFLA_GRE_TOS])
1490 		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1491 
1492 	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1493 		parms->iph.frag_off = htons(IP_DF);
1494 }
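/* For example (assuming an iproute2 build with GRE netlink support),

	ip link add gre1 type gre remote 203.0.113.1 local 198.51.100.2 \
		ttl 64 key 1234

   arrives here as IFLA_GRE_REMOTE, IFLA_GRE_LOCAL, IFLA_GRE_TTL and a
   matching IFLA_GRE_IKEY/IFLA_GRE_OKEY pair, with GRE_KEY set in both
   flag attributes. */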
1495 
1496 static int ipgre_tap_init(struct net_device *dev)
1497 {
1498 	struct ip_tunnel *tunnel;
1499 
1500 	tunnel = netdev_priv(dev);
1501 
1502 	tunnel->dev = dev;
1503 	strcpy(tunnel->parms.name, dev->name);
1504 
1505 	ipgre_tunnel_bind_dev(dev);
1506 
1507 	dev->tstats = alloc_percpu(struct pcpu_tstats);
1508 	if (!dev->tstats)
1509 		return -ENOMEM;
1510 
1511 	return 0;
1512 }
1513 
1514 static const struct net_device_ops ipgre_tap_netdev_ops = {
1515 	.ndo_init		= ipgre_tap_init,
1516 	.ndo_uninit		= ipgre_tunnel_uninit,
1517 	.ndo_start_xmit		= ipgre_tunnel_xmit,
1518 	.ndo_set_mac_address 	= eth_mac_addr,
1519 	.ndo_validate_addr	= eth_validate_addr,
1520 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
1521 	.ndo_get_stats		= ipgre_get_stats,
1522 };
1523 
1524 static void ipgre_tap_setup(struct net_device *dev)
1525 {
1526 
1527 	ether_setup(dev);
1528 
1529 	dev->netdev_ops		= &ipgre_tap_netdev_ops;
1530 	dev->destructor 	= ipgre_dev_free;
1531 
1532 	dev->iflink		= 0;
1533 	dev->features		|= NETIF_F_NETNS_LOCAL;
1534 }
1535 
1536 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1537 			 struct nlattr *data[])
1538 {
1539 	struct ip_tunnel *nt;
1540 	struct net *net = dev_net(dev);
1541 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1542 	int mtu;
1543 	int err;
1544 
1545 	nt = netdev_priv(dev);
1546 	ipgre_netlink_parms(data, &nt->parms);
1547 
1548 	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1549 		return -EEXIST;
1550 
1551 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1552 		random_ether_addr(dev->dev_addr);
1553 
1554 	mtu = ipgre_tunnel_bind_dev(dev);
1555 	if (!tb[IFLA_MTU])
1556 		dev->mtu = mtu;
1557 
1558 	/* Can use a lockless transmit, unless we generate output sequences */
1559 	if (!(nt->parms.o_flags & GRE_SEQ))
1560 		dev->features |= NETIF_F_LLTX;
1561 
1562 	err = register_netdevice(dev);
1563 	if (err)
1564 		goto out;
1565 
1566 	dev_hold(dev);
1567 	ipgre_tunnel_link(ign, nt);
1568 
1569 out:
1570 	return err;
1571 }
1572 
1573 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1574 			    struct nlattr *data[])
1575 {
1576 	struct ip_tunnel *t, *nt;
1577 	struct net *net = dev_net(dev);
1578 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1579 	struct ip_tunnel_parm p;
1580 	int mtu;
1581 
1582 	if (dev == ign->fb_tunnel_dev)
1583 		return -EINVAL;
1584 
1585 	nt = netdev_priv(dev);
1586 	ipgre_netlink_parms(data, &p);
1587 
1588 	t = ipgre_tunnel_locate(net, &p, 0);
1589 
1590 	if (t) {
1591 		if (t->dev != dev)
1592 			return -EEXIST;
1593 	} else {
1594 		t = nt;
1595 
1596 		if (dev->type != ARPHRD_ETHER) {
1597 			unsigned int nflags = 0;
1598 
1599 			if (ipv4_is_multicast(p.iph.daddr))
1600 				nflags = IFF_BROADCAST;
1601 			else if (p.iph.daddr)
1602 				nflags = IFF_POINTOPOINT;
1603 
1604 			if ((dev->flags ^ nflags) &
1605 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1606 				return -EINVAL;
1607 		}
1608 
1609 		ipgre_tunnel_unlink(ign, t);
1610 		t->parms.iph.saddr = p.iph.saddr;
1611 		t->parms.iph.daddr = p.iph.daddr;
1612 		t->parms.i_key = p.i_key;
1613 		if (dev->type != ARPHRD_ETHER) {
1614 			memcpy(dev->dev_addr, &p.iph.saddr, 4);
1615 			memcpy(dev->broadcast, &p.iph.daddr, 4);
1616 		}
1617 		ipgre_tunnel_link(ign, t);
1618 		netdev_state_change(dev);
1619 	}
1620 
1621 	t->parms.o_key = p.o_key;
1622 	t->parms.iph.ttl = p.iph.ttl;
1623 	t->parms.iph.tos = p.iph.tos;
1624 	t->parms.iph.frag_off = p.iph.frag_off;
1625 
1626 	if (t->parms.link != p.link) {
1627 		t->parms.link = p.link;
1628 		mtu = ipgre_tunnel_bind_dev(dev);
1629 		if (!tb[IFLA_MTU])
1630 			dev->mtu = mtu;
1631 		netdev_state_change(dev);
1632 	}
1633 
1634 	return 0;
1635 }
1636 
1637 static size_t ipgre_get_size(const struct net_device *dev)
1638 {
1639 	return
1640 		/* IFLA_GRE_LINK */
1641 		nla_total_size(4) +
1642 		/* IFLA_GRE_IFLAGS */
1643 		nla_total_size(2) +
1644 		/* IFLA_GRE_OFLAGS */
1645 		nla_total_size(2) +
1646 		/* IFLA_GRE_IKEY */
1647 		nla_total_size(4) +
1648 		/* IFLA_GRE_OKEY */
1649 		nla_total_size(4) +
1650 		/* IFLA_GRE_LOCAL */
1651 		nla_total_size(4) +
1652 		/* IFLA_GRE_REMOTE */
1653 		nla_total_size(4) +
1654 		/* IFLA_GRE_TTL */
1655 		nla_total_size(1) +
1656 		/* IFLA_GRE_TOS */
1657 		nla_total_size(1) +
1658 		/* IFLA_GRE_PMTUDISC */
1659 		nla_total_size(1) +
1660 		0;
1661 }
1662 
1663 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1664 {
1665 	struct ip_tunnel *t = netdev_priv(dev);
1666 	struct ip_tunnel_parm *p = &t->parms;
1667 
1668 	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1669 	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1670 	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1671 	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1672 	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1673 	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1674 	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1675 	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1676 	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1677 	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1678 
1679 	return 0;
1680 
1681 nla_put_failure:
1682 	return -EMSGSIZE;
1683 }
1684 
1685 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1686 	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
1687 	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
1688 	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
1689 	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
1690 	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
1691 	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1692 	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1693 	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
1694 	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
1695 	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
1696 };
1697 
1698 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1699 	.kind		= "gre",
1700 	.maxtype	= IFLA_GRE_MAX,
1701 	.policy		= ipgre_policy,
1702 	.priv_size	= sizeof(struct ip_tunnel),
1703 	.setup		= ipgre_tunnel_setup,
1704 	.validate	= ipgre_tunnel_validate,
1705 	.newlink	= ipgre_newlink,
1706 	.changelink	= ipgre_changelink,
1707 	.get_size	= ipgre_get_size,
1708 	.fill_info	= ipgre_fill_info,
1709 };
1710 
1711 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1712 	.kind		= "gretap",
1713 	.maxtype	= IFLA_GRE_MAX,
1714 	.policy		= ipgre_policy,
1715 	.priv_size	= sizeof(struct ip_tunnel),
1716 	.setup		= ipgre_tap_setup,
1717 	.validate	= ipgre_tap_validate,
1718 	.newlink	= ipgre_newlink,
1719 	.changelink	= ipgre_changelink,
1720 	.get_size	= ipgre_get_size,
1721 	.fill_info	= ipgre_fill_info,
1722 };
1723 
1724 /*
1725  *	And now the module code and kernel interface.
1726  */
1727 
1728 static int __init ipgre_init(void)
1729 {
1730 	int err;
1731 
1732 	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1733 
1734 	err = register_pernet_device(&ipgre_net_ops);
1735 	if (err < 0)
1736 		return err;
1737 
1738 	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1739 	if (err < 0) {
1740 		printk(KERN_INFO "ipgre init: can't add protocol\n");
1741 		goto add_proto_failed;
1742 	}
1743 
1744 	err = rtnl_link_register(&ipgre_link_ops);
1745 	if (err < 0)
1746 		goto rtnl_link_failed;
1747 
1748 	err = rtnl_link_register(&ipgre_tap_ops);
1749 	if (err < 0)
1750 		goto tap_ops_failed;
1751 
1752 out:
1753 	return err;
1754 
1755 tap_ops_failed:
1756 	rtnl_link_unregister(&ipgre_link_ops);
1757 rtnl_link_failed:
1758 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1759 add_proto_failed:
1760 	unregister_pernet_device(&ipgre_net_ops);
1761 	goto out;
1762 }
1763 
1764 static void __exit ipgre_fini(void)
1765 {
1766 	rtnl_link_unregister(&ipgre_tap_ops);
1767 	rtnl_link_unregister(&ipgre_link_ops);
1768 	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1769 		printk(KERN_INFO "ipgre close: can't remove protocol\n");
1770 	unregister_pernet_device(&ipgre_net_ops);
1771 }
1772 
1773 module_init(ipgre_init);
1774 module_exit(ipgre_fini);
1775 MODULE_LICENSE("GPL");
1776 MODULE_ALIAS_RTNL_LINK("gre");
1777 MODULE_ALIAS_RTNL_LINK("gretap");
1778