xref: /linux/net/ipv4/ip_gre.c (revision b233b28eac0cc37d07c2d007ea08c86c778c5af4)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
46 
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52 
53 /*
54    Problems & solutions
55    --------------------
56 
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61 
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
66    solution, but it supposes maintaining a new variable in ALL
67    skb, even if no tunneling is used.
68 
69    Current solution: t->recursion lock breaks dead loops. It looks
70    like dev->tbusy flag, but I preferred new variable, because
71    the semantics is different. One day, when hard_start_xmit
72    will be multithreaded we will have to use skb->encapsulation.
73 
74 
75 
76    2. Networking dead loops would not kill routers, but would really
77    kill network. IP hop limit plays role of "t->recursion" in this case,
78    if we copy it from packet being encapsulated to upper header.
79    It is very good solution, but it introduces two problems:
80 
81    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82      do not work over tunnels.
83    - traceroute does not work. I planned to relay ICMP from tunnel,
84      so that this problem would be solved and traceroute output
85      would even more informative. This idea appeared to be wrong:
86      only Linux complies to rfc1812 now (yes, guys, Linux is the only
87      true router now :-)), all routers (at least, in neighbourhood of mine)
88      return only 8 bytes of payload. It is the end.
89 
90    Hence, if we want that OSPF worked or traceroute said something reasonable,
91    we should search for another solution.
92 
93    One of them is to parse packet trying to detect inner encapsulation
94    made by our node. It is difficult or even impossible, especially,
95    taking into account fragmentation. To be short, it is not a solution at all.
96 
97    Current solution: The solution was UNEXPECTEDLY SIMPLE.
98    We force DF flag on tunnels with preconfigured hop limit,
99    that is ALL. :-) Well, it does not remove the problem completely,
100    but exponential growth of network traffic is changed to linear
101    (branches, that exceed pmtu are pruned) and tunnel mtu
102    fastly degrades to value <68, where looping stops.
103    Yes, it is not good if there exists a router in the loop,
104    which does not force DF, even when encapsulating packets have DF set.
105    But it is not our problem! Nobody could accuse us, we made
106    all that we could make. Even if it is your gated who injected
107    fatal route to network, even if it were you who configured
108    fatal static route: you are innocent. :-)
109 
110 
111 
112    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113    practically identical code. It would be good to glue them
114    together, but it is not very evident, how to make them modular.
115    sit is integral part of IPv6, ipip and gre are naturally modular.
116    We could extract common parts (hash table, ioctl etc)
117    to a separate module (ip_tunnel.c).
118 
119    Alexey Kuznetsov.
120  */
121 
122 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
123 static int ipgre_tunnel_init(struct net_device *dev);
124 static void ipgre_tunnel_setup(struct net_device *dev);
125 static int ipgre_tunnel_bind_dev(struct net_device *dev);
126 
127 /* Fallback tunnel: no source, no destination, no key, no options */
128 
129 #define HASH_SIZE  16
130 
131 static int ipgre_net_id;
132 struct ipgre_net {
133 	struct ip_tunnel *tunnels[4][HASH_SIZE];
134 
135 	struct net_device *fb_tunnel_dev;
136 };
137 
138 /* Tunnel hash table */
139 
140 /*
141    4 hash tables:
142 
143    3: (remote,local)
144    2: (remote,*)
145    1: (*,local)
146    0: (*,*)
147 
148    We require exact key match i.e. if a key is present in packet
149    it will match only tunnel with the same key; if it is not present,
150    it will match only keyless tunnel.
151 
152    All keyless packets, if not matched to configured keyless tunnels,
153    will match the fallback tunnel.
154  */
155 
156 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
157 
158 #define tunnels_r_l	tunnels[3]
159 #define tunnels_r	tunnels[2]
160 #define tunnels_l	tunnels[1]
161 #define tunnels_wc	tunnels[0]
162 
163 static DEFINE_RWLOCK(ipgre_lock);
164 
165 /* Given src, dst and key, find appropriate for input tunnel. */
166 
167 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
168 					      __be32 remote, __be32 local,
169 					      __be32 key, __be16 gre_proto)
170 {
171 	unsigned h0 = HASH(remote);
172 	unsigned h1 = HASH(key);
173 	struct ip_tunnel *t;
174 	struct ip_tunnel *t2 = NULL;
175 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
176 	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
177 		       ARPHRD_ETHER : ARPHRD_IPGRE;
178 
179 	for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
180 		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
181 			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
182 				if (t->dev->type == dev_type)
183 					return t;
184 				if (t->dev->type == ARPHRD_IPGRE && !t2)
185 					t2 = t;
186 			}
187 		}
188 	}
189 
190 	for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
191 		if (remote == t->parms.iph.daddr) {
192 			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
193 				if (t->dev->type == dev_type)
194 					return t;
195 				if (t->dev->type == ARPHRD_IPGRE && !t2)
196 					t2 = t;
197 			}
198 		}
199 	}
200 
201 	for (t = ign->tunnels_l[h1]; t; t = t->next) {
202 		if (local == t->parms.iph.saddr ||
203 		     (local == t->parms.iph.daddr &&
204 		      ipv4_is_multicast(local))) {
205 			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
206 				if (t->dev->type == dev_type)
207 					return t;
208 				if (t->dev->type == ARPHRD_IPGRE && !t2)
209 					t2 = t;
210 			}
211 		}
212 	}
213 
214 	for (t = ign->tunnels_wc[h1]; t; t = t->next) {
215 		if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
216 			if (t->dev->type == dev_type)
217 				return t;
218 			if (t->dev->type == ARPHRD_IPGRE && !t2)
219 				t2 = t;
220 		}
221 	}
222 
223 	if (t2)
224 		return t2;
225 
226 	if (ign->fb_tunnel_dev->flags&IFF_UP)
227 		return netdev_priv(ign->fb_tunnel_dev);
228 	return NULL;
229 }
230 
231 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
232 		struct ip_tunnel_parm *parms)
233 {
234 	__be32 remote = parms->iph.daddr;
235 	__be32 local = parms->iph.saddr;
236 	__be32 key = parms->i_key;
237 	unsigned h = HASH(key);
238 	int prio = 0;
239 
240 	if (local)
241 		prio |= 1;
242 	if (remote && !ipv4_is_multicast(remote)) {
243 		prio |= 2;
244 		h ^= HASH(remote);
245 	}
246 
247 	return &ign->tunnels[prio][h];
248 }
249 
250 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
251 		struct ip_tunnel *t)
252 {
253 	return __ipgre_bucket(ign, &t->parms);
254 }
255 
256 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
257 {
258 	struct ip_tunnel **tp = ipgre_bucket(ign, t);
259 
260 	t->next = *tp;
261 	write_lock_bh(&ipgre_lock);
262 	*tp = t;
263 	write_unlock_bh(&ipgre_lock);
264 }
265 
266 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
267 {
268 	struct ip_tunnel **tp;
269 
270 	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
271 		if (t == *tp) {
272 			write_lock_bh(&ipgre_lock);
273 			*tp = t->next;
274 			write_unlock_bh(&ipgre_lock);
275 			break;
276 		}
277 	}
278 }
279 
280 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
281 					   struct ip_tunnel_parm *parms,
282 					   int type)
283 {
284 	__be32 remote = parms->iph.daddr;
285 	__be32 local = parms->iph.saddr;
286 	__be32 key = parms->i_key;
287 	struct ip_tunnel *t, **tp;
288 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
289 
290 	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
291 		if (local == t->parms.iph.saddr &&
292 		    remote == t->parms.iph.daddr &&
293 		    key == t->parms.i_key &&
294 		    type == t->dev->type)
295 			break;
296 
297 	return t;
298 }
299 
300 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
301 		struct ip_tunnel_parm *parms, int create)
302 {
303 	struct ip_tunnel *t, *nt;
304 	struct net_device *dev;
305 	char name[IFNAMSIZ];
306 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
307 
308 	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
309 	if (t || !create)
310 		return t;
311 
312 	if (parms->name[0])
313 		strlcpy(name, parms->name, IFNAMSIZ);
314 	else
315 		sprintf(name, "gre%%d");
316 
317 	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
318 	if (!dev)
319 	  return NULL;
320 
321 	dev_net_set(dev, net);
322 
323 	if (strchr(name, '%')) {
324 		if (dev_alloc_name(dev, name) < 0)
325 			goto failed_free;
326 	}
327 
328 	nt = netdev_priv(dev);
329 	nt->parms = *parms;
330 	dev->rtnl_link_ops = &ipgre_link_ops;
331 
332 	dev->mtu = ipgre_tunnel_bind_dev(dev);
333 
334 	if (register_netdevice(dev) < 0)
335 		goto failed_free;
336 
337 	dev_hold(dev);
338 	ipgre_tunnel_link(ign, nt);
339 	return nt;
340 
341 failed_free:
342 	free_netdev(dev);
343 	return NULL;
344 }
345 
346 static void ipgre_tunnel_uninit(struct net_device *dev)
347 {
348 	struct net *net = dev_net(dev);
349 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
350 
351 	ipgre_tunnel_unlink(ign, netdev_priv(dev));
352 	dev_put(dev);
353 }
354 
355 
356 static void ipgre_err(struct sk_buff *skb, u32 info)
357 {
358 
359 /* All the routers (except for Linux) return only
360    8 bytes of packet payload. It means, that precise relaying of
361    ICMP in the real Internet is absolutely infeasible.
362 
363    Moreover, Cisco "wise men" put GRE key to the third word
364    in GRE header. It makes impossible maintaining even soft state for keyed
365    GRE tunnels with enabled checksum. Tell them "thank you".
366 
367    Well, I wonder, rfc1812 was written by Cisco employee,
368    what the hell these idiots break standrads established
369    by themself???
370  */
371 
372 	struct iphdr *iph = (struct iphdr *)skb->data;
373 	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
374 	int grehlen = (iph->ihl<<2) + 4;
375 	const int type = icmp_hdr(skb)->type;
376 	const int code = icmp_hdr(skb)->code;
377 	struct ip_tunnel *t;
378 	__be16 flags;
379 
380 	flags = p[0];
381 	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
382 		if (flags&(GRE_VERSION|GRE_ROUTING))
383 			return;
384 		if (flags&GRE_KEY) {
385 			grehlen += 4;
386 			if (flags&GRE_CSUM)
387 				grehlen += 4;
388 		}
389 	}
390 
391 	/* If only 8 bytes returned, keyed message will be dropped here */
392 	if (skb_headlen(skb) < grehlen)
393 		return;
394 
395 	switch (type) {
396 	default:
397 	case ICMP_PARAMETERPROB:
398 		return;
399 
400 	case ICMP_DEST_UNREACH:
401 		switch (code) {
402 		case ICMP_SR_FAILED:
403 		case ICMP_PORT_UNREACH:
404 			/* Impossible event. */
405 			return;
406 		case ICMP_FRAG_NEEDED:
407 			/* Soft state for pmtu is maintained by IP core. */
408 			return;
409 		default:
410 			/* All others are translated to HOST_UNREACH.
411 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
412 			   I believe they are just ether pollution. --ANK
413 			 */
414 			break;
415 		}
416 		break;
417 	case ICMP_TIME_EXCEEDED:
418 		if (code != ICMP_EXC_TTL)
419 			return;
420 		break;
421 	}
422 
423 	read_lock(&ipgre_lock);
424 	t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
425 				flags & GRE_KEY ?
426 				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
427 				p[1]);
428 	if (t == NULL || t->parms.iph.daddr == 0 ||
429 	    ipv4_is_multicast(t->parms.iph.daddr))
430 		goto out;
431 
432 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
433 		goto out;
434 
435 	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
436 		t->err_count++;
437 	else
438 		t->err_count = 1;
439 	t->err_time = jiffies;
440 out:
441 	read_unlock(&ipgre_lock);
442 	return;
443 }
444 
445 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
446 {
447 	if (INET_ECN_is_ce(iph->tos)) {
448 		if (skb->protocol == htons(ETH_P_IP)) {
449 			IP_ECN_set_ce(ip_hdr(skb));
450 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
451 			IP6_ECN_set_ce(ipv6_hdr(skb));
452 		}
453 	}
454 }
455 
456 static inline u8
457 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
458 {
459 	u8 inner = 0;
460 	if (skb->protocol == htons(ETH_P_IP))
461 		inner = old_iph->tos;
462 	else if (skb->protocol == htons(ETH_P_IPV6))
463 		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
464 	return INET_ECN_encapsulate(tos, inner);
465 }
466 
467 static int ipgre_rcv(struct sk_buff *skb)
468 {
469 	struct iphdr *iph;
470 	u8     *h;
471 	__be16    flags;
472 	__sum16   csum = 0;
473 	__be32 key = 0;
474 	u32    seqno = 0;
475 	struct ip_tunnel *tunnel;
476 	int    offset = 4;
477 	__be16 gre_proto;
478 	unsigned int len;
479 
480 	if (!pskb_may_pull(skb, 16))
481 		goto drop_nolock;
482 
483 	iph = ip_hdr(skb);
484 	h = skb->data;
485 	flags = *(__be16*)h;
486 
487 	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
488 		/* - Version must be 0.
489 		   - We do not support routing headers.
490 		 */
491 		if (flags&(GRE_VERSION|GRE_ROUTING))
492 			goto drop_nolock;
493 
494 		if (flags&GRE_CSUM) {
495 			switch (skb->ip_summed) {
496 			case CHECKSUM_COMPLETE:
497 				csum = csum_fold(skb->csum);
498 				if (!csum)
499 					break;
500 				/* fall through */
501 			case CHECKSUM_NONE:
502 				skb->csum = 0;
503 				csum = __skb_checksum_complete(skb);
504 				skb->ip_summed = CHECKSUM_COMPLETE;
505 			}
506 			offset += 4;
507 		}
508 		if (flags&GRE_KEY) {
509 			key = *(__be32*)(h + offset);
510 			offset += 4;
511 		}
512 		if (flags&GRE_SEQ) {
513 			seqno = ntohl(*(__be32*)(h + offset));
514 			offset += 4;
515 		}
516 	}
517 
518 	gre_proto = *(__be16 *)(h + 2);
519 
520 	read_lock(&ipgre_lock);
521 	if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
522 					  iph->saddr, iph->daddr, key,
523 					  gre_proto))) {
524 		struct net_device_stats *stats = &tunnel->dev->stats;
525 
526 		secpath_reset(skb);
527 
528 		skb->protocol = gre_proto;
529 		/* WCCP version 1 and 2 protocol decoding.
530 		 * - Change protocol to IP
531 		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
532 		 */
533 		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
534 			skb->protocol = htons(ETH_P_IP);
535 			if ((*(h + offset) & 0xF0) != 0x40)
536 				offset += 4;
537 		}
538 
539 		skb->mac_header = skb->network_header;
540 		__pskb_pull(skb, offset);
541 		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
542 		skb->pkt_type = PACKET_HOST;
543 #ifdef CONFIG_NET_IPGRE_BROADCAST
544 		if (ipv4_is_multicast(iph->daddr)) {
545 			/* Looped back packet, drop it! */
546 			if (skb->rtable->fl.iif == 0)
547 				goto drop;
548 			stats->multicast++;
549 			skb->pkt_type = PACKET_BROADCAST;
550 		}
551 #endif
552 
553 		if (((flags&GRE_CSUM) && csum) ||
554 		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
555 			stats->rx_crc_errors++;
556 			stats->rx_errors++;
557 			goto drop;
558 		}
559 		if (tunnel->parms.i_flags&GRE_SEQ) {
560 			if (!(flags&GRE_SEQ) ||
561 			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
562 				stats->rx_fifo_errors++;
563 				stats->rx_errors++;
564 				goto drop;
565 			}
566 			tunnel->i_seqno = seqno + 1;
567 		}
568 
569 		len = skb->len;
570 
571 		/* Warning: All skb pointers will be invalidated! */
572 		if (tunnel->dev->type == ARPHRD_ETHER) {
573 			if (!pskb_may_pull(skb, ETH_HLEN)) {
574 				stats->rx_length_errors++;
575 				stats->rx_errors++;
576 				goto drop;
577 			}
578 
579 			iph = ip_hdr(skb);
580 			skb->protocol = eth_type_trans(skb, tunnel->dev);
581 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
582 		}
583 
584 		stats->rx_packets++;
585 		stats->rx_bytes += len;
586 		skb->dev = tunnel->dev;
587 		dst_release(skb->dst);
588 		skb->dst = NULL;
589 		nf_reset(skb);
590 
591 		skb_reset_network_header(skb);
592 		ipgre_ecn_decapsulate(iph, skb);
593 
594 		netif_rx(skb);
595 		read_unlock(&ipgre_lock);
596 		return(0);
597 	}
598 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
599 
600 drop:
601 	read_unlock(&ipgre_lock);
602 drop_nolock:
603 	kfree_skb(skb);
604 	return(0);
605 }
606 
607 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
608 {
609 	struct ip_tunnel *tunnel = netdev_priv(dev);
610 	struct net_device_stats *stats = &tunnel->dev->stats;
611 	struct iphdr  *old_iph = ip_hdr(skb);
612 	struct iphdr  *tiph;
613 	u8     tos;
614 	__be16 df;
615 	struct rtable *rt;     			/* Route to the other host */
616 	struct net_device *tdev;			/* Device to other host */
617 	struct iphdr  *iph;			/* Our new IP header */
618 	unsigned int max_headroom;		/* The extra header space needed */
619 	int    gre_hlen;
620 	__be32 dst;
621 	int    mtu;
622 
623 	if (tunnel->recursion++) {
624 		stats->collisions++;
625 		goto tx_error;
626 	}
627 
628 	if (dev->type == ARPHRD_ETHER)
629 		IPCB(skb)->flags = 0;
630 
631 	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
632 		gre_hlen = 0;
633 		tiph = (struct iphdr *)skb->data;
634 	} else {
635 		gre_hlen = tunnel->hlen;
636 		tiph = &tunnel->parms.iph;
637 	}
638 
639 	if ((dst = tiph->daddr) == 0) {
640 		/* NBMA tunnel */
641 
642 		if (skb->dst == NULL) {
643 			stats->tx_fifo_errors++;
644 			goto tx_error;
645 		}
646 
647 		if (skb->protocol == htons(ETH_P_IP)) {
648 			rt = skb->rtable;
649 			if ((dst = rt->rt_gateway) == 0)
650 				goto tx_error_icmp;
651 		}
652 #ifdef CONFIG_IPV6
653 		else if (skb->protocol == htons(ETH_P_IPV6)) {
654 			struct in6_addr *addr6;
655 			int addr_type;
656 			struct neighbour *neigh = skb->dst->neighbour;
657 
658 			if (neigh == NULL)
659 				goto tx_error;
660 
661 			addr6 = (struct in6_addr *)&neigh->primary_key;
662 			addr_type = ipv6_addr_type(addr6);
663 
664 			if (addr_type == IPV6_ADDR_ANY) {
665 				addr6 = &ipv6_hdr(skb)->daddr;
666 				addr_type = ipv6_addr_type(addr6);
667 			}
668 
669 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
670 				goto tx_error_icmp;
671 
672 			dst = addr6->s6_addr32[3];
673 		}
674 #endif
675 		else
676 			goto tx_error;
677 	}
678 
679 	tos = tiph->tos;
680 	if (tos&1) {
681 		if (skb->protocol == htons(ETH_P_IP))
682 			tos = old_iph->tos;
683 		tos &= ~1;
684 	}
685 
686 	{
687 		struct flowi fl = { .oif = tunnel->parms.link,
688 				    .nl_u = { .ip4_u =
689 					      { .daddr = dst,
690 						.saddr = tiph->saddr,
691 						.tos = RT_TOS(tos) } },
692 				    .proto = IPPROTO_GRE };
693 		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
694 			stats->tx_carrier_errors++;
695 			goto tx_error;
696 		}
697 	}
698 	tdev = rt->u.dst.dev;
699 
700 	if (tdev == dev) {
701 		ip_rt_put(rt);
702 		stats->collisions++;
703 		goto tx_error;
704 	}
705 
706 	df = tiph->frag_off;
707 	if (df)
708 		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
709 	else
710 		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
711 
712 	if (skb->dst)
713 		skb->dst->ops->update_pmtu(skb->dst, mtu);
714 
715 	if (skb->protocol == htons(ETH_P_IP)) {
716 		df |= (old_iph->frag_off&htons(IP_DF));
717 
718 		if ((old_iph->frag_off&htons(IP_DF)) &&
719 		    mtu < ntohs(old_iph->tot_len)) {
720 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
721 			ip_rt_put(rt);
722 			goto tx_error;
723 		}
724 	}
725 #ifdef CONFIG_IPV6
726 	else if (skb->protocol == htons(ETH_P_IPV6)) {
727 		struct rt6_info *rt6 = (struct rt6_info *)skb->dst;
728 
729 		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
730 			if ((tunnel->parms.iph.daddr &&
731 			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
732 			    rt6->rt6i_dst.plen == 128) {
733 				rt6->rt6i_flags |= RTF_MODIFIED;
734 				skb->dst->metrics[RTAX_MTU-1] = mtu;
735 			}
736 		}
737 
738 		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
739 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
740 			ip_rt_put(rt);
741 			goto tx_error;
742 		}
743 	}
744 #endif
745 
746 	if (tunnel->err_count > 0) {
747 		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
748 			tunnel->err_count--;
749 
750 			dst_link_failure(skb);
751 		} else
752 			tunnel->err_count = 0;
753 	}
754 
755 	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
756 
757 	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
758 	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
759 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
760 		if (!new_skb) {
761 			ip_rt_put(rt);
762 			stats->tx_dropped++;
763 			dev_kfree_skb(skb);
764 			tunnel->recursion--;
765 			return 0;
766 		}
767 		if (skb->sk)
768 			skb_set_owner_w(new_skb, skb->sk);
769 		dev_kfree_skb(skb);
770 		skb = new_skb;
771 		old_iph = ip_hdr(skb);
772 	}
773 
774 	skb_reset_transport_header(skb);
775 	skb_push(skb, gre_hlen);
776 	skb_reset_network_header(skb);
777 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
778 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
779 			      IPSKB_REROUTED);
780 	dst_release(skb->dst);
781 	skb->dst = &rt->u.dst;
782 
783 	/*
784 	 *	Push down and install the IPIP header.
785 	 */
786 
787 	iph 			=	ip_hdr(skb);
788 	iph->version		=	4;
789 	iph->ihl		=	sizeof(struct iphdr) >> 2;
790 	iph->frag_off		=	df;
791 	iph->protocol		=	IPPROTO_GRE;
792 	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
793 	iph->daddr		=	rt->rt_dst;
794 	iph->saddr		=	rt->rt_src;
795 
796 	if ((iph->ttl = tiph->ttl) == 0) {
797 		if (skb->protocol == htons(ETH_P_IP))
798 			iph->ttl = old_iph->ttl;
799 #ifdef CONFIG_IPV6
800 		else if (skb->protocol == htons(ETH_P_IPV6))
801 			iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
802 #endif
803 		else
804 			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
805 	}
806 
807 	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
808 	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
809 				   htons(ETH_P_TEB) : skb->protocol;
810 
811 	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
812 		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
813 
814 		if (tunnel->parms.o_flags&GRE_SEQ) {
815 			++tunnel->o_seqno;
816 			*ptr = htonl(tunnel->o_seqno);
817 			ptr--;
818 		}
819 		if (tunnel->parms.o_flags&GRE_KEY) {
820 			*ptr = tunnel->parms.o_key;
821 			ptr--;
822 		}
823 		if (tunnel->parms.o_flags&GRE_CSUM) {
824 			*ptr = 0;
825 			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
826 		}
827 	}
828 
829 	nf_reset(skb);
830 
831 	IPTUNNEL_XMIT();
832 	tunnel->recursion--;
833 	return 0;
834 
835 tx_error_icmp:
836 	dst_link_failure(skb);
837 
838 tx_error:
839 	stats->tx_errors++;
840 	dev_kfree_skb(skb);
841 	tunnel->recursion--;
842 	return 0;
843 }
844 
845 static int ipgre_tunnel_bind_dev(struct net_device *dev)
846 {
847 	struct net_device *tdev = NULL;
848 	struct ip_tunnel *tunnel;
849 	struct iphdr *iph;
850 	int hlen = LL_MAX_HEADER;
851 	int mtu = ETH_DATA_LEN;
852 	int addend = sizeof(struct iphdr) + 4;
853 
854 	tunnel = netdev_priv(dev);
855 	iph = &tunnel->parms.iph;
856 
857 	/* Guess output device to choose reasonable mtu and needed_headroom */
858 
859 	if (iph->daddr) {
860 		struct flowi fl = { .oif = tunnel->parms.link,
861 				    .nl_u = { .ip4_u =
862 					      { .daddr = iph->daddr,
863 						.saddr = iph->saddr,
864 						.tos = RT_TOS(iph->tos) } },
865 				    .proto = IPPROTO_GRE };
866 		struct rtable *rt;
867 		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
868 			tdev = rt->u.dst.dev;
869 			ip_rt_put(rt);
870 		}
871 
872 		if (dev->type != ARPHRD_ETHER)
873 			dev->flags |= IFF_POINTOPOINT;
874 	}
875 
876 	if (!tdev && tunnel->parms.link)
877 		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
878 
879 	if (tdev) {
880 		hlen = tdev->hard_header_len + tdev->needed_headroom;
881 		mtu = tdev->mtu;
882 	}
883 	dev->iflink = tunnel->parms.link;
884 
885 	/* Precalculate GRE options length */
886 	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
887 		if (tunnel->parms.o_flags&GRE_CSUM)
888 			addend += 4;
889 		if (tunnel->parms.o_flags&GRE_KEY)
890 			addend += 4;
891 		if (tunnel->parms.o_flags&GRE_SEQ)
892 			addend += 4;
893 	}
894 	dev->needed_headroom = addend + hlen;
895 	mtu -= dev->hard_header_len - addend;
896 
897 	if (mtu < 68)
898 		mtu = 68;
899 
900 	tunnel->hlen = addend;
901 
902 	return mtu;
903 }
904 
905 static int
906 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
907 {
908 	int err = 0;
909 	struct ip_tunnel_parm p;
910 	struct ip_tunnel *t;
911 	struct net *net = dev_net(dev);
912 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
913 
914 	switch (cmd) {
915 	case SIOCGETTUNNEL:
916 		t = NULL;
917 		if (dev == ign->fb_tunnel_dev) {
918 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
919 				err = -EFAULT;
920 				break;
921 			}
922 			t = ipgre_tunnel_locate(net, &p, 0);
923 		}
924 		if (t == NULL)
925 			t = netdev_priv(dev);
926 		memcpy(&p, &t->parms, sizeof(p));
927 		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
928 			err = -EFAULT;
929 		break;
930 
931 	case SIOCADDTUNNEL:
932 	case SIOCCHGTUNNEL:
933 		err = -EPERM;
934 		if (!capable(CAP_NET_ADMIN))
935 			goto done;
936 
937 		err = -EFAULT;
938 		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
939 			goto done;
940 
941 		err = -EINVAL;
942 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
943 		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
944 		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
945 			goto done;
946 		if (p.iph.ttl)
947 			p.iph.frag_off |= htons(IP_DF);
948 
949 		if (!(p.i_flags&GRE_KEY))
950 			p.i_key = 0;
951 		if (!(p.o_flags&GRE_KEY))
952 			p.o_key = 0;
953 
954 		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
955 
956 		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
957 			if (t != NULL) {
958 				if (t->dev != dev) {
959 					err = -EEXIST;
960 					break;
961 				}
962 			} else {
963 				unsigned nflags = 0;
964 
965 				t = netdev_priv(dev);
966 
967 				if (ipv4_is_multicast(p.iph.daddr))
968 					nflags = IFF_BROADCAST;
969 				else if (p.iph.daddr)
970 					nflags = IFF_POINTOPOINT;
971 
972 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
973 					err = -EINVAL;
974 					break;
975 				}
976 				ipgre_tunnel_unlink(ign, t);
977 				t->parms.iph.saddr = p.iph.saddr;
978 				t->parms.iph.daddr = p.iph.daddr;
979 				t->parms.i_key = p.i_key;
980 				t->parms.o_key = p.o_key;
981 				memcpy(dev->dev_addr, &p.iph.saddr, 4);
982 				memcpy(dev->broadcast, &p.iph.daddr, 4);
983 				ipgre_tunnel_link(ign, t);
984 				netdev_state_change(dev);
985 			}
986 		}
987 
988 		if (t) {
989 			err = 0;
990 			if (cmd == SIOCCHGTUNNEL) {
991 				t->parms.iph.ttl = p.iph.ttl;
992 				t->parms.iph.tos = p.iph.tos;
993 				t->parms.iph.frag_off = p.iph.frag_off;
994 				if (t->parms.link != p.link) {
995 					t->parms.link = p.link;
996 					dev->mtu = ipgre_tunnel_bind_dev(dev);
997 					netdev_state_change(dev);
998 				}
999 			}
1000 			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1001 				err = -EFAULT;
1002 		} else
1003 			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1004 		break;
1005 
1006 	case SIOCDELTUNNEL:
1007 		err = -EPERM;
1008 		if (!capable(CAP_NET_ADMIN))
1009 			goto done;
1010 
1011 		if (dev == ign->fb_tunnel_dev) {
1012 			err = -EFAULT;
1013 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1014 				goto done;
1015 			err = -ENOENT;
1016 			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1017 				goto done;
1018 			err = -EPERM;
1019 			if (t == netdev_priv(ign->fb_tunnel_dev))
1020 				goto done;
1021 			dev = t->dev;
1022 		}
1023 		unregister_netdevice(dev);
1024 		err = 0;
1025 		break;
1026 
1027 	default:
1028 		err = -EINVAL;
1029 	}
1030 
1031 done:
1032 	return err;
1033 }
1034 
1035 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1036 {
1037 	struct ip_tunnel *tunnel = netdev_priv(dev);
1038 	if (new_mtu < 68 ||
1039 	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1040 		return -EINVAL;
1041 	dev->mtu = new_mtu;
1042 	return 0;
1043 }
1044 
1045 /* Nice toy. Unfortunately, useless in real life :-)
1046    It allows to construct virtual multiprotocol broadcast "LAN"
1047    over the Internet, provided multicast routing is tuned.
1048 
1049 
1050    I have no idea whether this bicycle was invented before me,
1051    so that I had to set ARPHRD_IPGRE to a random value.
1052    I have an impression, that Cisco could make something similar,
1053    but this feature is apparently missing in IOS<=11.2(8).
1054 
1055    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1056    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1057 
1058    ping -t 255 224.66.66.66
1059 
1060    If nobody answers, mbone does not work.
1061 
1062    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1063    ip addr add 10.66.66.<somewhat>/24 dev Universe
1064    ifconfig Universe up
1065    ifconfig Universe add fe80::<Your_real_addr>/10
1066    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1067    ftp 10.66.66.66
1068    ...
1069    ftp fec0:6666:6666::193.233.7.65
1070    ...
1071 
1072  */
1073 
/* header_ops->create: prepend the precomputed IP + GRE encapsulation.
 *
 * Returns the pushed header length when the header is complete, or the
 * negative header length when the destination is still unresolved
 * (multicast/NBMA tunnels), signalling the caller that the header must
 * be completed later.
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);

	/* Template IP header from parms, then GRE flags and protocol. */
	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	/* "Hardware" addresses are 4-byte IPv4 endpoints (addr_len == 4). */
	if (saddr)
		memcpy(&iph->saddr, saddr, 4);

	if (daddr) {
		memcpy(&iph->daddr, daddr, 4);
		return t->hlen;
	}
	/* Fixed unicast destination already present from the template. */
	if (iph->daddr && !ipv4_is_multicast(iph->daddr))
		return t->hlen;

	/* Destination unknown: header incomplete, report negative length. */
	return -t->hlen;
}
1102 
/* header_ops->parse: extract the sender's "hardware" address, i.e. the
 * 4-byte IPv4 source of the outer header.  Returns the address length.
 */
static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}
1109 
/* Installed by ipgre_tunnel_init() for NBMA (no fixed daddr) and
 * multicast tunnels, which need per-packet header construction. */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
1114 
1115 #ifdef CONFIG_NET_IPGRE_BROADCAST
/* ndo_open for broadcast-capable GRE tunnels.
 *
 * For a multicast destination, resolve the route to find the real
 * outgoing device and join the multicast group on it; the device's
 * ifindex is remembered in t->mlink so ipgre_close() can leave the
 * group later.  Runs under RTNL (hence __in_dev_get_rtnl).
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		/* No route to the multicast destination: refuse to open. */
		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->u.dst.dev;
		ip_rt_put(rt);
		/* Underlying device must have IPv4 configured. */
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1139 
/* ndo_stop: leave the multicast group joined in ipgre_open().
 *
 * The underlying device is looked up again by the saved ifindex; it
 * may have disappeared in the meantime, in which case there is
 * nothing left to undo.
 */
static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev) {
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
			in_dev_put(in_dev);	/* drop ref from inetdev_by_index */
		}
	}
	return 0;
}
1154 
1155 #endif
1156 
/* Device callbacks for plain "gre" (layer-3, ARPHRD_IPGRE) tunnels.
 * open/stop are only needed for the multicast-broadcast variant. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1168 
/* Basic setup for a layer-3 GRE tunnel netdevice.
 *
 * Headroom/MTU account for the outer IPv4 header plus the 4-byte base
 * GRE header (key/checksum options are added to hlen elsewhere).
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor 	= free_netdev;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	/* Tunnel endpoints are addresses; keep device in its namespace. */
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1182 
/* ndo_init for ordinary "gre" tunnel devices.
 *
 * Publishes the tunnel endpoints as the device's hardware/broadcast
 * addresses and selects header_ops: tunnels with no fixed destination
 * (NBMA) and multicast tunnels need ipgre_header() to build headers at
 * transmit time.  A multicast destination also requires a fixed local
 * address, otherwise the configuration is rejected.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* dev_addr/broadcast carry the 4-byte IPv4 endpoints. */
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	return 0;
}
1211 
/* Initialize the per-namespace fallback device "gre0".
 *
 * The fallback tunnel has no fixed endpoints and is entered into the
 * wildcard hash slot (tunnels_wc); dev_hold() pins it for the lifetime
 * of the namespace.
 */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	/* Base encapsulation: outer IPv4 header + 4-byte GRE header. */
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
	ign->tunnels_wc[0]	= tunnel;
}
1229 
1230 
/* Inet protocol hook for IPPROTO_GRE: receive and ICMP error paths.
 * Marked netns_ok so packets are handled in all network namespaces. */
static struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,
	.err_handler	=	ipgre_err,
	.netns_ok	=	1,
};
1236 
/* Unregister every tunnel device in the namespace.
 *
 * Walks all four hash tables (keyed/unkeyed x addressed/wildcard).
 * The inner while re-reads the chain head each iteration because
 * unregister_netdevice() triggers ndo_uninit (ipgre_tunnel_uninit),
 * which is expected to unlink the tunnel from its chain — so the
 * loop drains each bucket completely.
 */
static void ipgre_destroy_tunnels(struct ipgre_net *ign)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;
			while ((t = ign->tunnels[prio][h]) != NULL)
				unregister_netdevice(t->dev);
		}
	}
}
1250 
/* Per-namespace init: allocate ipgre_net state, register it in the
 * generic netns array, and create + register the fallback "gre0"
 * device.  Errors unwind in reverse order via the goto ladder.
 */
static int ipgre_init_net(struct net *net)
{
	int err;
	struct ipgre_net *ign;

	err = -ENOMEM;
	ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
	if (ign == NULL)
		goto err_alloc;

	err = net_assign_generic(net, ipgre_net_id, ign);
	if (err < 0)
		goto err_assign;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	/* Bind the fallback device to this namespace before init. */
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	return 0;

err_reg_dev:
	free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
	/* nothing */
err_assign:
	kfree(ign);
err_alloc:
	return err;
}
1290 
/* Per-namespace teardown: unregister every tunnel (including the
 * fallback device) under RTNL, then free the namespace state. */
static void ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign);
	rtnl_unlock();
	kfree(ign);
}
1301 
/* Per-network-namespace lifecycle hooks for the GRE driver. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
};
1306 
1307 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1308 {
1309 	__be16 flags;
1310 
1311 	if (!data)
1312 		return 0;
1313 
1314 	flags = 0;
1315 	if (data[IFLA_GRE_IFLAGS])
1316 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1317 	if (data[IFLA_GRE_OFLAGS])
1318 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1319 	if (flags & (GRE_VERSION|GRE_ROUTING))
1320 		return -EINVAL;
1321 
1322 	return 0;
1323 }
1324 
/* .validate for "gretap" links.
 *
 * On top of the generic GRE checks, a supplied link-layer address must
 * be a valid unicast Ethernet MAC and a supplied remote address must
 * be non-zero.
 */
static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	/* Shared flag validation with plain GRE. */
	return ipgre_tunnel_validate(tb, data);
}
1348 
/* Translate IFLA_GRE_* netlink attributes into tunnel parameters.
 *
 * Missing attributes leave the zeroed defaults in place, with one
 * exception: path-MTU discovery defaults to ON (DF bit set) unless
 * IFLA_GRE_PMTUDISC is present and zero.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* Default: pmtudisc enabled unless explicitly disabled. */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1389 
/* ndo_init for "gretap" (Ethernet-over-GRE) devices: record the device
 * name in the tunnel parameters and bind to the underlying route/MTU. */
static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	return 0;
}
1403 
/* Device callbacks for "gretap" devices: Ethernet-style address
 * handling, no tunnel ioctl.
 * NOTE(review): this table is not referenced anywhere in this file —
 * ipgre_tap_setup() installs ipgre_netdev_ops instead, which looks
 * like a bug worth confirming. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address 	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1412 
1413 static void ipgre_tap_setup(struct net_device *dev)
1414 {
1415 
1416 	ether_setup(dev);
1417 
1418 	dev->netdev_ops		= &ipgre_netdev_ops;
1419 	dev->destructor 	= free_netdev;
1420 
1421 	dev->iflink		= 0;
1422 	dev->features		|= NETIF_F_NETNS_LOCAL;
1423 }
1424 
/* .newlink: create a GRE/GRETAP device from netlink attributes.
 *
 * Fails with -EEXIST if a tunnel with identical parameters already
 * exists.  A gretap device without an explicit address gets a random
 * MAC.  On success the tunnel is linked into the hash with a device
 * reference held, mirroring ipgre_fb_tunnel_init().
 */
static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	/* Binding computes the usable MTU; honour an explicit IFLA_MTU. */
	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1457 
/* .changelink: reconfigure an existing GRE tunnel from netlink.
 *
 * The fallback device cannot be changed.  If the new parameters match
 * a different existing tunnel, fail with -EEXIST.  When the addresses
 * or input key change, the tunnel is unlinked and relinked so it lands
 * in the correct hash chain; the broadcast/point-to-point nature of
 * the device is not allowed to change.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	/* Lookup only (create == 0): find who currently owns these parms. */
	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		unsigned nflags = 0;

		t = nt;

		if (ipv4_is_multicast(p.iph.daddr))
			nflags = IFF_BROADCAST;
		else if (p.iph.daddr)
			nflags = IFF_POINTOPOINT;

		/* Refuse changes that would flip broadcast <-> p2p mode. */
		if ((dev->flags ^ nflags) &
		    (IFF_POINTOPOINT | IFF_BROADCAST))
			return -EINVAL;

		/* Rehash: addresses and i_key determine the hash chain. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		memcpy(dev->dev_addr, &p.iph.saddr, 4);
		memcpy(dev->broadcast, &p.iph.daddr, 4);
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	/* Non-hashed parameters can be updated in place. */
	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		/* New underlying link: recompute header length and MTU. */
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1517 
1518 static size_t ipgre_get_size(const struct net_device *dev)
1519 {
1520 	return
1521 		/* IFLA_GRE_LINK */
1522 		nla_total_size(4) +
1523 		/* IFLA_GRE_IFLAGS */
1524 		nla_total_size(2) +
1525 		/* IFLA_GRE_OFLAGS */
1526 		nla_total_size(2) +
1527 		/* IFLA_GRE_IKEY */
1528 		nla_total_size(4) +
1529 		/* IFLA_GRE_OKEY */
1530 		nla_total_size(4) +
1531 		/* IFLA_GRE_LOCAL */
1532 		nla_total_size(4) +
1533 		/* IFLA_GRE_REMOTE */
1534 		nla_total_size(4) +
1535 		/* IFLA_GRE_TTL */
1536 		nla_total_size(1) +
1537 		/* IFLA_GRE_TOS */
1538 		nla_total_size(1) +
1539 		/* IFLA_GRE_PMTUDISC */
1540 		nla_total_size(1) +
1541 		0;
1542 }
1543 
/* .fill_info: dump tunnel parameters as IFLA_GRE_* attributes.
 *
 * The NLA_PUT* macros jump to nla_put_failure when the skb runs out
 * of room; ipgre_get_size() sizes the message to avoid that.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* Report pmtudisc as a boolean derived from the DF bit. */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1565 
/* Netlink attribute policy shared by the "gre" and "gretap" link ops. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1578 
/* rtnetlink registration for layer-3 "gre" devices. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1591 
/* rtnetlink registration for Ethernet-over-GRE "gretap" devices. */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1604 
1605 /*
1606  *	And now the modules code and kernel interface.
1607  */
1608 
/* Module init: register the GRE protocol handler, per-namespace state
 * and both rtnetlink link kinds.  On failure, unwind in the exact
 * reverse order via the goto ladder.
 */
static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		return -EAGAIN;
	}

	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
	if (err < 0)
		goto gen_device_failed;

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
gen_device_failed:
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
	goto out;
}
1643 
/* Module exit: tear down in reverse order of ipgre_init(). */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
}
1652 
/* Module plumbing; the rtnl-link aliases let "ip link add type gre" /
 * "gretap" auto-load this module. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
1658