/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>

#ifdef CONFIG_IPV6
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is the best
   solution, but it supposes maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: the HARD_TX_LOCK lock breaks dead loops.



   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to
   the upper header. It is a very good solution, but it introduces
   two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least, in my neighbourhood)
     return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work, or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect inner
   encapsulation made by our node. It is difficult or even impossible,
   especially taking fragmentation into account. To be short, it is
   not a solution at all.

   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and the tunnel mtu
   quickly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when encapsulated packets have DF set.
   But it is not our problem! Nobody could accuse us, we made
   all that we could make. Even if it is your gated who injected
   the fatal route to the network, even if it were you who configured
   the fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not very evident how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctl etc.)
   to a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
 */

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
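
/*
 * Illustrative note (added, not in the original source): HASH() xor-folds
 * the low-order byte of the 32-bit value into a 4-bit index, selecting one
 * of the HASH_SIZE (16) buckets.  HASH(0) is 0, so e.g. keyless wildcard
 * tunnels always land in bucket 0 of tunnels_wc, while the (remote,local)
 * table is indexed by HASH(remote) ^ HASH(key).
 */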

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
/*
 * Locking : hash tables are protected by RCU and a spinlock
 */
static DEFINE_SPINLOCK(ipgre_lock);

#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))

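/*
 * A minimal usage sketch (added for illustration): readers traverse a
 * bucket under rcu_read_lock(), while writers serialize on ipgre_lock,
 * as ipgre_tunnel_link()/ipgre_tunnel_unlink() do below.  Note the macro
 * binds a variable named t from the enclosing scope:
 *
 *	struct ip_tunnel *t;
 *
 *	rcu_read_lock();
 *	for_each_ip_tunnel_rcu(ign->tunnels_wc[h]) {
 *		if (t->parms.i_key == key)
 *			break;
 *	}
 *	rcu_read_unlock();
 */
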
/* Given src, dst and key, find the appropriate tunnel for an incoming packet. */

static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
					      __be32 remote, __be32 local,
					      __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;
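	/* Descriptive note (added): a tunnel scores 0 when both its bound
	 * link and its device type match the packet exactly; bit 0 marks a
	 * link mismatch, bit 1 a device-type mismatch.  An exact match
	 * returns immediately; otherwise the lowest-scoring candidate is
	 * remembered in cand as a fallback.
	 */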

	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}

static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned h = HASH(key);
	int prio = 0;
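	/* Descriptive note (added): prio picks one of the four tables from
	 * the comment above: bit 0 is set for a bound local address, bit 1
	 * for a unicast remote, giving 3=(r,l), 2=(r,*), 1=(*,l), 0=(*,*).
	 */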

	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}

static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(ign, t);

	spin_lock_bh(&ipgre_lock);
	t->next = *tp;
	rcu_assign_pointer(*tp, t);
	spin_unlock_bh(&ipgre_lock);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
		if (t == *tp) {
			spin_lock_bh(&ipgre_lock);
			*tp = t->next;
			spin_unlock_bh(&ipgre_lock);
			break;
		}
	}
}

static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t, **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}

static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		sprintf(name, "gre%%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	if (strchr(name, '%')) {
		if (dev_alloc_name(dev, name) < 0)
			goto failed_free;
	}

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}


static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key in the third word
   of the GRE header. It makes it impossible to maintain even soft
   state for keyed GRE tunnels with enabled checksum. Tell them
   "thank you".

   Well, I wonder: rfc1812 was written by a Cisco employee,
   so why the hell do these idiots break the standards established
   by themselves???
 */

	struct iphdr *iph = (struct iphdr *)skb->data;
	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	rcu_read_lock();
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}

static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));
		}
	}
}

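/* Note (added): the helper below derives the outer TOS byte, keeping the
 * configured/derived DSCP while folding in the inner packet's ECN
 * codepoint; the exact combination rules live in INET_ECN_encapsulate().
 */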
static inline u8
ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}

static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;
	__be16 gre_proto;

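	/* For reference (added), the GRE header layout assumed below
	 * (RFC 2784/2890):
	 *
	 *	bytes 0-1: flag bits (C, K, S, routing, version)
	 *	bytes 2-3: protocol type (e.g. ETH_P_IP, ETH_P_TEB)
	 *	then, if present, 4 bytes each in this order:
	 *	checksum+reserved, key, sequence number
	 *
	 * which is why offset starts at 4 and grows by 4 per flag.
	 */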
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct net_device_stats *stats = &tunnel->dev->stats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in
		 *   the GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (skb_rtable(skb)->fl.iif == 0)
				goto drop;
			stats->multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			stats->rx_crc_errors++;
			stats->rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				stats->rx_fifo_errors++;
				stats->rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				stats->rx_length_errors++;
				stats->rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);
		rcu_read_unlock();
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}

static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &dev->stats;
	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
	struct iphdr  *old_iph = ip_hdr(skb);
	struct iphdr  *tiph;
	u8     tos;
	__be16 df;
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			stats->tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb_dst(skb)->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
	}

	{
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = dst,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
			stats->tx_carrier_errors++;
			goto tx_error;
		}
	}
	tdev = rt->dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		stats->collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			txq->tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the IPIP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	rt->rt_dst;
	iph->saddr		=	rt->rt_src;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
	}

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
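		/* Descriptive note (added): ptr starts at the last 32-bit
		 * word of the GRE header and walks backwards; the optional
		 * fields are laid out csum, key, seq, so the sequence number
		 * is written first (at the end), then the key, then the
		 * checksum.
		 */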

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	return NETDEV_TX_OK;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}

static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev) {
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
			in_dev_put(in_dev);
		}
	}
	return 0;
}

#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= free_netdev;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	return 0;
}

static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
	ign->tunnels_wc[0]	= tunnel;
}


static const struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,
	.err_handler	=	ipgre_err,
	.netns_ok	=	1,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t = ign->tunnels[prio][h];

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = t->next;
			}
		}
	}
}

static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					  ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	return 0;

err_reg_dev:
	free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;
	LIST_HEAD(list);

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}

static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}

static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	return 0;
}

static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};

static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= free_netdev;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
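
/* Example usage from userspace (added; assumes iproute2 is available):
 *
 *	ip link add gre1 type gre remote 10.0.0.2 local 10.0.0.1 ttl 64
 *	ip link add tap1 type gretap remote 10.0.0.2 local 10.0.0.1
 *
 * These resolve to the "gre" and "gretap" rtnl_link_ops above via the
 * MODULE_ALIAS_RTNL_LINK() entries at the bottom of this file.
 */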

/*
 *	And now the module code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
	if (err < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
	unregister_pernet_device(&ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");