xref: /linux/net/ipv4/ip_gre.c (revision 367b8112fe2ea5c39a7bb4d263dcdd9b612fae18)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32 
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46 
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52 
53 /*
54    Problems & solutions
55    --------------------
56 
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61 
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
   solution, but it supposes maintaining a new variable in ALL
67    skb, even if no tunneling is used.
68 
69    Current solution: t->recursion lock breaks dead loops. It looks
70    like dev->tbusy flag, but I preferred new variable, because
71    the semantics is different. One day, when hard_start_xmit
72    will be multithreaded we will have to use skb->encapsulation.
73 
74 
75 
76    2. Networking dead loops would not kill routers, but would really
77    kill network. IP hop limit plays role of "t->recursion" in this case,
78    if we copy it from packet being encapsulated to upper header.
79    It is very good solution, but it introduces two problems:
80 
81    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82      do not work over tunnels.
83    - traceroute does not work. I planned to relay ICMP from tunnel,
84      so that this problem would be solved and traceroute output
85      would even more informative. This idea appeared to be wrong:
86      only Linux complies to rfc1812 now (yes, guys, Linux is the only
87      true router now :-)), all routers (at least, in neighbourhood of mine)
88      return only 8 bytes of payload. It is the end.
89 
90    Hence, if we want that OSPF worked or traceroute said something reasonable,
91    we should search for another solution.
92 
93    One of them is to parse packet trying to detect inner encapsulation
94    made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, it is not a solution at all.
96 
97    Current solution: The solution was UNEXPECTEDLY SIMPLE.
98    We force DF flag on tunnels with preconfigured hop limit,
99    that is ALL. :-) Well, it does not remove the problem completely,
100    but exponential growth of network traffic is changed to linear
101    (branches, that exceed pmtu are pruned) and tunnel mtu
102    fastly degrades to value <68, where looping stops.
103    Yes, it is not good if there exists a router in the loop,
104    which does not force DF, even when encapsulating packets have DF set.
105    But it is not our problem! Nobody could accuse us, we made
106    all that we could make. Even if it is your gated who injected
107    fatal route to network, even if it were you who configured
108    fatal static route: you are innocent. :-)
109 
110 
111 
112    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113    practically identical code. It would be good to glue them
114    together, but it is not very evident, how to make them modular.
115    sit is integral part of IPv6, ipip and gre are naturally modular.
116    We could extract common parts (hash table, ioctl etc)
117    to a separate module (ip_tunnel.c).
118 
119    Alexey Kuznetsov.
120  */
121 
/* rtnetlink ops for "gre" links; defined later in this file. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

static int ipgre_fb_tunnel_init(struct net_device *dev);

/* Number of buckets in each of the four hash tables below. */
#define HASH_SIZE  16

/* Key used with net_generic() to reach this module's per-namespace state. */
static int ipgre_net_id;
struct ipgre_net {
	/* Four tables indexed by address-wildcard priority; see the
	 * "4 hash tables" comment below. */
	struct ip_tunnel *tunnels[4][HASH_SIZE];

	/* Always-present fallback tunnel device for this namespace. */
	struct net_device *fb_tunnel_dev;
};
139 
/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require exact key match i.e. if a key is present in packet
   it will match only tunnel with the same key; if it is not present,
   it will match only keyless tunnel.

   All keyless packets, if not matching a configured keyless tunnel,
   will match the fallback tunnel.
 */

/* Fold an IPv4 address (or GRE key) down to a 4-bit bucket index. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Aliases for the four tables, from most to least specific. */
#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]

/* Protects the hash chains: readers on the packet/error paths,
 * writers when tunnels are linked or unlinked. */
static DEFINE_RWLOCK(ipgre_lock);
166 
/* Given src, dst and key, find appropriate for input tunnel.
 *
 * Chains are searched from most to least specific:
 * (remote,local) -> (remote,*) -> (*,local) -> (*,*).  Within each
 * chain an exact key match on an IFF_UP device is required.  A tunnel
 * whose device type matches the GRE protocol (ARPHRD_ETHER for
 * ETH_P_TEB, ARPHRD_IPGRE otherwise) wins immediately; the first
 * ARPHRD_IPGRE tunnel seen is remembered in t2 as a fallback.  If
 * nothing matches, the fallback device catches the packet when it is up.
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
					      __be32 remote, __be32 local,
					      __be32 key, __be16 gre_proto)
{
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);
	struct ip_tunnel *t;
	struct ip_tunnel *t2 = NULL;	/* best non-exact-type candidate */
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;

	/* (remote,local) chain: bucket folds remote and key hashes. */
	for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
				if (t->dev->type == dev_type)
					return t;
				if (t->dev->type == ARPHRD_IPGRE && !t2)
					t2 = t;
			}
		}
	}

	/* (remote,*) chain. */
	for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
		if (remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
				if (t->dev->type == dev_type)
					return t;
				if (t->dev->type == ARPHRD_IPGRE && !t2)
					t2 = t;
			}
		}
	}

	/* (*,local) chain: a multicast destination also counts as local. */
	for (t = ign->tunnels_l[h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr ||
		     (local == t->parms.iph.daddr &&
		      ipv4_is_multicast(local))) {
			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
				if (t->dev->type == dev_type)
					return t;
				if (t->dev->type == ARPHRD_IPGRE && !t2)
					t2 = t;
			}
		}
	}

	/* (*,*) wildcard chain. */
	for (t = ign->tunnels_wc[h1]; t; t = t->next) {
		if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
			if (t->dev->type == dev_type)
				return t;
			if (t->dev->type == ARPHRD_IPGRE && !t2)
				t2 = t;
		}
	}

	if (t2)
		return t2;

	if (ign->fb_tunnel_dev->flags&IFF_UP)
		return netdev_priv(ign->fb_tunnel_dev);
	return NULL;
}
232 
233 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
234 		struct ip_tunnel_parm *parms)
235 {
236 	__be32 remote = parms->iph.daddr;
237 	__be32 local = parms->iph.saddr;
238 	__be32 key = parms->i_key;
239 	unsigned h = HASH(key);
240 	int prio = 0;
241 
242 	if (local)
243 		prio |= 1;
244 	if (remote && !ipv4_is_multicast(remote)) {
245 		prio |= 2;
246 		h ^= HASH(remote);
247 	}
248 
249 	return &ign->tunnels[prio][h];
250 }
251 
/* Hash chain for an existing tunnel, derived from its own parameters. */
static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}
257 
/* Insert a tunnel at the head of its hash chain.  t->next is set
 * before *tp is published under ipgre_lock, so readers traversing the
 * chain never see a half-linked entry. */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(ign, t);

	t->next = *tp;
	write_lock_bh(&ipgre_lock);
	*tp = t;
	write_unlock_bh(&ipgre_lock);
}
267 
268 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
269 {
270 	struct ip_tunnel **tp;
271 
272 	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
273 		if (t == *tp) {
274 			write_lock_bh(&ipgre_lock);
275 			*tp = t->next;
276 			write_unlock_bh(&ipgre_lock);
277 			break;
278 		}
279 	}
280 }
281 
282 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
283 					   struct ip_tunnel_parm *parms,
284 					   int type)
285 {
286 	__be32 remote = parms->iph.daddr;
287 	__be32 local = parms->iph.saddr;
288 	__be32 key = parms->i_key;
289 	struct ip_tunnel *t, **tp;
290 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
291 
292 	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
293 		if (local == t->parms.iph.saddr &&
294 		    remote == t->parms.iph.daddr &&
295 		    key == t->parms.i_key &&
296 		    type == t->dev->type)
297 			break;
298 
299 	return t;
300 }
301 
302 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
303 		struct ip_tunnel_parm *parms, int create)
304 {
305 	struct ip_tunnel *t, *nt;
306 	struct net_device *dev;
307 	char name[IFNAMSIZ];
308 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
309 
310 	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
311 	if (t || !create)
312 		return t;
313 
314 	if (parms->name[0])
315 		strlcpy(name, parms->name, IFNAMSIZ);
316 	else
317 		sprintf(name, "gre%%d");
318 
319 	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
320 	if (!dev)
321 	  return NULL;
322 
323 	dev_net_set(dev, net);
324 
325 	if (strchr(name, '%')) {
326 		if (dev_alloc_name(dev, name) < 0)
327 			goto failed_free;
328 	}
329 
330 	nt = netdev_priv(dev);
331 	nt->parms = *parms;
332 	dev->rtnl_link_ops = &ipgre_link_ops;
333 
334 	dev->mtu = ipgre_tunnel_bind_dev(dev);
335 
336 	if (register_netdevice(dev) < 0)
337 		goto failed_free;
338 
339 	dev_hold(dev);
340 	ipgre_tunnel_link(ign, nt);
341 	return nt;
342 
343 failed_free:
344 	free_netdev(dev);
345 	return NULL;
346 }
347 
/* dev->uninit hook: take the tunnel out of the hash table and drop the
 * device reference held while it was linked (see dev_hold() in
 * ipgre_tunnel_locate()). */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
356 
357 
/* ICMP error handler for IPPROTO_GRE: record relevant errors against
 * the matching tunnel so the transmit path can relay them later. */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	/* skb->data points at the outer IP header of the offending packet
	 * echoed back inside the ICMP error. */
	struct iphdr *iph = (struct iphdr*)skb->data;
	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;	/* IP header + GRE base header */
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		/* Key is only reachable past an optional checksum word. */
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	read_lock(&ipgre_lock);
	/* Swap saddr/daddr: we look up from the sender's perspective. */
	t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	/* TTL 0 means "inherit from payload"; TTL exceeded is expected. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	/* Error-rate bookkeeping consumed by ipgre_tunnel_xmit(). */
	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	read_unlock(&ipgre_lock);
	return;
}
446 
447 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
448 {
449 	if (INET_ECN_is_ce(iph->tos)) {
450 		if (skb->protocol == htons(ETH_P_IP)) {
451 			IP_ECN_set_ce(ip_hdr(skb));
452 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
453 			IP6_ECN_set_ce(ipv6_hdr(skb));
454 		}
455 	}
456 }
457 
458 static inline u8
459 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
460 {
461 	u8 inner = 0;
462 	if (skb->protocol == htons(ETH_P_IP))
463 		inner = old_iph->tos;
464 	else if (skb->protocol == htons(ETH_P_IPV6))
465 		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
466 	return INET_ECN_encapsulate(tos, inner);
467 }
468 
/* Receive path: parse the GRE header, find the owning tunnel, validate
 * checksum/sequence requirements and hand the decapsulated packet back
 * to the stack.  Unmatched packets get an ICMP port-unreachable. */
static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;		/* start of GRE header */
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* GRE base header: flags + protocol */
	__be16 gre_proto;
	unsigned int len;

	/* Enough linear data for the largest fixed GRE header we parse. */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct net_device_stats *stats = &tunnel->dev->stats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* Top nibble 4 => inner IPv4 starts here (WCCPv1);
			 * otherwise skip the WCCPv2 redirect header. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		/* Strip outer IP + GRE; the old network header becomes the
		 * "MAC header" for header_ops users. */
		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (skb->rtable->fl.iif == 0)
				goto drop;
			stats->multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Drop on bad checksum, or when the tunnel requires a
		 * checksum the packet did not carry. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			stats->rx_crc_errors++;
			stats->rx_errors++;
			goto drop;
		}
		/* Enforce in-order sequence numbers when configured. */
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				stats->rx_fifo_errors++;
				stats->rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		len = skb->len;

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				stats->rx_length_errors++;
				stats->rx_errors++;
				goto drop;
			}

			/* Re-fetch the outer header after the pull. */
			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		stats->rx_packets++;
		stats->rx_bytes += len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		skb->dst = NULL;
		nf_reset(skb);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);
		read_unlock(&ipgre_lock);
		return(0);
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	read_unlock(&ipgre_lock);
drop_nolock:
	kfree_skb(skb);
	return(0);
}
608 
/* Transmit path: encapsulate skb in an outer IPv4 + GRE header and
 * send it through the route to the tunnel endpoint. */
static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->dev->stats;
	struct iphdr  *old_iph = ip_hdr(skb);
	struct iphdr  *tiph;
	u8     tos;
	__be16 df;
	struct rtable *rt;     			/* Route to the other host */
	struct net_device *tdev;			/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	/* Break local dead loops; see the discussion at the top of the file. */
	if (tunnel->recursion++) {
		stats->collisions++;
		goto tx_error;
	}

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	/* With header_ops the outer header was already built by
	 * ipgre_header() and sits at skb->data. */
	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (struct iphdr*)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel: derive the endpoint from the payload route. */

		if (skb->dst == NULL) {
			stats->tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb->rtable;
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb->dst->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr*)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only v4-compatible IPv6 addresses embed an IPv4
			 * endpoint in the low 32 bits. */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	/* tos bit 0 set means "inherit TOS from an inner IPv4 header". */
	tos = tiph->tos;
	if (tos&1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		tos &= ~1;
	}

	{
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = dst,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
			stats->tx_carrier_errors++;
			goto tx_error;
		}
	}
	tdev = rt->u.dst.dev;

	/* Routing straight back through ourselves would recurse. */
	if (tdev == dev) {
		ip_rt_put(rt);
		stats->collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;

	if (skb->dst)
		skb->dst->ops->update_pmtu(skb->dst, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		/* Copy the inner DF bit outward; reject oversized DF
		 * packets with a frag-needed error. */
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

		/* Record the smaller tunnel MTU on the IPv6 route for
		 * point-to-point destinations. */
		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->metrics[RTAX_MTU-1] = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	/* Relay recently recorded ICMP errors (see ipgre_err()) back to
	 * the sender, decaying the counter over time. */
	if (tunnel->err_count > 0) {
		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

	/* Reallocate if there is no room for the new headers, or the data
	 * is shared/cloned and cannot be written in place. */
	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			stats->tx_dropped++;
			dev_kfree_skb(skb);
			tunnel->recursion--;
			return 0;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the IPIP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	rt->rt_dst;
	iph->saddr		=	rt->rt_src;

	/* Configured TTL 0 means "inherit from the payload". */
	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
#endif
		else
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
	}

	/* GRE flags and protocol words follow the IP header directly. */
	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	/* Optional fields are filled back-to-front: seq, key, checksum. */
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	tunnel->recursion--;
	return 0;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	tunnel->recursion--;
	return 0;
}
846 
/* Resolve the underlying output device for this tunnel, precompute
 * tunnel->hlen (IP + GRE header length) and dev->needed_headroom, and
 * return a suitable MTU (never below 68). */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + GRE base header */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route yet: fall back to the explicitly bound link, if any. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	/* NOTE(review): "- addend" (rather than "+ addend") only yields a
	 * correct MTU if dev->hard_header_len already includes the GRE
	 * header for this device type -- verify against the setup code. */
	mtu -= dev->hard_header_len - addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
906 
/* SIOC{GET,ADD,CHG,DEL}TUNNEL handler.  GET is unprivileged; the
 * mutating commands require CAP_NET_ADMIN.  Parameters travel as a
 * struct ip_tunnel_parm in user memory at ifr->ifr_ifru.ifru_data. */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		/* On the fallback device, report the tunnel described by
		 * the user-supplied parameters; otherwise report "dev". */
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Validate the outer-header template and GRE flags. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		/* A fixed TTL forces DF (see the PMTU discussion above). */
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* Parameters already belong to another device. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags=0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* Changing addresses may not flip the
				 * broadcast/point-to-point nature of dev. */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-hash under the new addresses and keys. */
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			/* The fallback device itself cannot be deleted. */
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1036 
1037 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1038 {
1039 	struct ip_tunnel *tunnel = netdev_priv(dev);
1040 	if (new_mtu < 68 ||
1041 	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1042 		return -EINVAL;
1043 	dev->mtu = new_mtu;
1044 	return 0;
1045 }
1046 
1047 /* Nice toy. Unfortunately, useless in real life :-)
1048    It allows to construct virtual multiprotocol broadcast "LAN"
1049    over the Internet, provided multicast routing is tuned.
1050 
1051 
   I have no idea whether this bicycle was invented before me,
1053    so that I had to set ARPHRD_IPGRE to a random value.
1054    I have an impression, that Cisco could make something similar,
1055    but this feature is apparently missing in IOS<=11.2(8).
1056 
1057    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1058    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1059 
1060    ping -t 255 224.66.66.66
1061 
1062    If nobody answers, mbone does not work.
1063 
1064    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1065    ip addr add 10.66.66.<somewhat>/24 dev Universe
1066    ifconfig Universe up
1067    ifconfig Universe add fe80::<Your_real_addr>/10
1068    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1069    ftp 10.66.66.66
1070    ...
1071    ftp fec0:6666:6666::193.233.7.65
1072    ...
1073 
1074  */
1075 
/* header_ops->create for ARPHRD_IPGRE devices: prepend the tunnel's
 * outer IP header template plus the GRE flags/protocol words.  Returns
 * the pushed length when the destination address is known, and the
 * negated length otherwise (no destination could be filled in). */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);	/* GRE flags + protocol words */

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);

	if (daddr) {
		memcpy(&iph->daddr, daddr, 4);
		return t->hlen;
	}
	/* Preconfigured unicast destination also counts as resolved. */
	if (iph->daddr && !ipv4_is_multicast(iph->daddr))
		return t->hlen;

	return -t->hlen;
}
1104 
1105 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1106 {
1107 	struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
1108 	memcpy(haddr, &iph->saddr, 4);
1109 	return 4;
1110 }
1111 
/* Link-layer header operations for GRE devices that build headers. */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
1116 
1117 #ifdef CONFIG_NET_IPGRE_BROADCAST
1118 static int ipgre_open(struct net_device *dev)
1119 {
1120 	struct ip_tunnel *t = netdev_priv(dev);
1121 
1122 	if (ipv4_is_multicast(t->parms.iph.daddr)) {
1123 		struct flowi fl = { .oif = t->parms.link,
1124 				    .nl_u = { .ip4_u =
1125 					      { .daddr = t->parms.iph.daddr,
1126 						.saddr = t->parms.iph.saddr,
1127 						.tos = RT_TOS(t->parms.iph.tos) } },
1128 				    .proto = IPPROTO_GRE };
1129 		struct rtable *rt;
1130 		if (ip_route_output_key(dev_net(dev), &rt, &fl))
1131 			return -EADDRNOTAVAIL;
1132 		dev = rt->u.dst.dev;
1133 		ip_rt_put(rt);
1134 		if (__in_dev_get_rtnl(dev) == NULL)
1135 			return -EADDRNOTAVAIL;
1136 		t->mlink = dev->ifindex;
1137 		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1138 	}
1139 	return 0;
1140 }
1141 
1142 static int ipgre_close(struct net_device *dev)
1143 {
1144 	struct ip_tunnel *t = netdev_priv(dev);
1145 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1146 		struct in_device *in_dev;
1147 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1148 		if (in_dev) {
1149 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1150 			in_dev_put(in_dev);
1151 		}
1152 	}
1153 	return 0;
1154 }
1155 
1156 #endif
1157 
/*
 * net_device setup for layer-3 GRE tunnel devices (used by both the
 * per-namespace fallback device and rtnl-created "gre" links).
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->init		= ipgre_tunnel_init;
	dev->uninit		= ipgre_tunnel_uninit;
	dev->destructor 	= free_netdev;
	dev->hard_start_xmit	= ipgre_tunnel_xmit;
	dev->do_ioctl		= ipgre_tunnel_ioctl;
	dev->change_mtu		= ipgre_tunnel_change_mtu;

	dev->type		= ARPHRD_IPGRE;
	/* Room for link header + outer IP header + base GRE header. */
	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* addresses are raw IPv4 addresses */
	/* Tunnel devices stay in the namespace they were created in. */
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1175 
/*
 * dev->init for layer-3 GRE devices: copy the tunnel endpoints into the
 * device "hardware" addresses and choose header_ops depending on the
 * tunnel mode (multicast "broadcast LAN" vs. NBMA vs. point-to-point).
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* addr_len == 4: dev_addr/broadcast hold the IPv4 endpoints. */
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Multicast mode requires a bound local address. */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
			/* open/close join and leave the multicast group. */
			dev->open = ipgre_open;
			dev->stop = ipgre_close;
		}
#endif
	} else
		/* No fixed remote (NBMA): destination comes per-packet. */
		dev->header_ops = &ipgre_header_ops;

	return 0;
}
1206 
1207 static int ipgre_fb_tunnel_init(struct net_device *dev)
1208 {
1209 	struct ip_tunnel *tunnel = netdev_priv(dev);
1210 	struct iphdr *iph = &tunnel->parms.iph;
1211 	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1212 
1213 	tunnel->dev = dev;
1214 	strcpy(tunnel->parms.name, dev->name);
1215 
1216 	iph->version		= 4;
1217 	iph->protocol		= IPPROTO_GRE;
1218 	iph->ihl		= 5;
1219 	tunnel->hlen		= sizeof(struct iphdr) + 4;
1220 
1221 	dev_hold(dev);
1222 	ign->tunnels_wc[0]	= tunnel;
1223 	return 0;
1224 }
1225 
1226 
/* Inet protocol hooks for IPPROTO_GRE (receive + ICMP error handling). */
static struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,
	.err_handler	=	ipgre_err,
	.netns_ok	=	1,
};
1232 
1233 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1234 {
1235 	int prio;
1236 
1237 	for (prio = 0; prio < 4; prio++) {
1238 		int h;
1239 		for (h = 0; h < HASH_SIZE; h++) {
1240 			struct ip_tunnel *t;
1241 			while ((t = ign->tunnels[prio][h]) != NULL)
1242 				unregister_netdevice(t->dev);
1243 		}
1244 	}
1245 }
1246 
/*
 * Per-namespace init: allocate the ipgre_net state, attach it to the
 * namespace's generic pointer array, and register the always-present
 * fallback device "gre0".  Unwinds via the goto chain on failure.
 */
static int ipgre_init_net(struct net *net)
{
	int err;
	struct ipgre_net *ign;

	err = -ENOMEM;
	ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
	if (ign == NULL)
		goto err_alloc;

	err = net_assign_generic(net, ipgre_net_id, ign);
	if (err < 0)
		goto err_assign;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}

	/* The fallback device uses its own init (wildcard tunnel slot). */
	ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
	dev_net_set(ign->fb_tunnel_dev, net);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	return 0;

err_reg_dev:
	free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
	/* nothing */
err_assign:
	kfree(ign);
err_alloc:
	return err;
}
1286 
1287 static void ipgre_exit_net(struct net *net)
1288 {
1289 	struct ipgre_net *ign;
1290 
1291 	ign = net_generic(net, ipgre_net_id);
1292 	rtnl_lock();
1293 	ipgre_destroy_tunnels(ign);
1294 	rtnl_unlock();
1295 	kfree(ign);
1296 }
1297 
/* Per-network-namespace lifecycle hooks for the GRE module. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
};
1302 
1303 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1304 {
1305 	__be16 flags;
1306 
1307 	if (!data)
1308 		return 0;
1309 
1310 	flags = 0;
1311 	if (data[IFLA_GRE_IFLAGS])
1312 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1313 	if (data[IFLA_GRE_OFLAGS])
1314 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1315 	if (flags & (GRE_VERSION|GRE_ROUTING))
1316 		return -EINVAL;
1317 
1318 	return 0;
1319 }
1320 
1321 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1322 {
1323 	__be32 daddr;
1324 
1325 	if (tb[IFLA_ADDRESS]) {
1326 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1327 			return -EINVAL;
1328 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1329 			return -EADDRNOTAVAIL;
1330 	}
1331 
1332 	if (!data)
1333 		goto out;
1334 
1335 	if (data[IFLA_GRE_REMOTE]) {
1336 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1337 		if (!daddr)
1338 			return -EINVAL;
1339 	}
1340 
1341 out:
1342 	return ipgre_tunnel_validate(tb, data);
1343 }
1344 
/*
 * Translate IFLA_GRE_* netlink attributes into an ip_tunnel_parm.
 * Absent attributes leave the zeroed defaults, except path-MTU
 * discovery, which defaults to on (DF set in the outer header).
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	/* No attributes at all: keep zeroed defaults (note: DF stays
	 * unset on this path). */
	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* PMTU discovery is on unless explicitly disabled. */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1385 
1386 static int ipgre_tap_init(struct net_device *dev)
1387 {
1388 	struct ip_tunnel *tunnel;
1389 
1390 	tunnel = netdev_priv(dev);
1391 
1392 	tunnel->dev = dev;
1393 	strcpy(tunnel->parms.name, dev->name);
1394 
1395 	ipgre_tunnel_bind_dev(dev);
1396 
1397 	return 0;
1398 }
1399 
/*
 * net_device setup for Ethernet-over-GRE ("gretap") devices: start from
 * a standard Ethernet device and override the tunnel-specific hooks.
 */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->init		= ipgre_tap_init;
	dev->uninit		= ipgre_tunnel_uninit;
	dev->destructor 	= free_netdev;
	dev->hard_start_xmit	= ipgre_tunnel_xmit;
	dev->change_mtu		= ipgre_tunnel_change_mtu;

	dev->iflink		= 0;
	/* Tunnel devices stay in the namespace they were created in. */
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1414 
/*
 * rtnl newlink handler shared by "gre" and "gretap": parse attributes,
 * reject duplicates, register the device, and link the new tunnel into
 * the per-namespace hash table.
 */
static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	/* Refuse to create a second tunnel with identical parameters. */
	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	/* gretap devices get a random MAC unless one was supplied. */
	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	err = register_netdevice(dev);
	if (err)
		goto out;

	/* Reference for the hash-table linkage. */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1447 
/*
 * rtnl changelink handler: update an existing tunnel's parameters.
 * If the new parameters match another tunnel, fail with -EEXIST;
 * otherwise re-key this tunnel (unlink, update endpoints/i_key,
 * relink) and apply the remaining per-tunnel settings.
 *
 * NOTE(review): i_flags/o_flags from the request are not copied onto
 * the tunnel here — confirm whether that is intentional.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	/* The fallback device's parameters may not be changed. */
	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* Another device already uses these parameters. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		unsigned nflags = 0;

		t = nt;

		/* The new addressing must not change the device's
		 * fundamental mode (broadcast vs point-to-point). */
		if (ipv4_is_multicast(p.iph.daddr))
			nflags = IFF_BROADCAST;
		else if (p.iph.daddr)
			nflags = IFF_POINTOPOINT;

		if ((dev->flags ^ nflags) &
		    (IFF_POINTOPOINT | IFF_BROADCAST))
			return -EINVAL;

		/* Re-hash: endpoints and input key determine the bucket. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		memcpy(dev->dev_addr, &p.iph.saddr, 4);
		memcpy(dev->broadcast, &p.iph.daddr, 4);
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	/* A new underlying link may imply a different usable MTU. */
	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1507 
1508 static size_t ipgre_get_size(const struct net_device *dev)
1509 {
1510 	return
1511 		/* IFLA_GRE_LINK */
1512 		nla_total_size(4) +
1513 		/* IFLA_GRE_IFLAGS */
1514 		nla_total_size(2) +
1515 		/* IFLA_GRE_OFLAGS */
1516 		nla_total_size(2) +
1517 		/* IFLA_GRE_IKEY */
1518 		nla_total_size(4) +
1519 		/* IFLA_GRE_OKEY */
1520 		nla_total_size(4) +
1521 		/* IFLA_GRE_LOCAL */
1522 		nla_total_size(4) +
1523 		/* IFLA_GRE_REMOTE */
1524 		nla_total_size(4) +
1525 		/* IFLA_GRE_TTL */
1526 		nla_total_size(1) +
1527 		/* IFLA_GRE_TOS */
1528 		nla_total_size(1) +
1529 		/* IFLA_GRE_PMTUDISC */
1530 		nla_total_size(1) +
1531 		0;
1532 }
1533 
/*
 * Dump the tunnel configuration as IFLA_GRE_* attributes.
 * The NLA_PUT_* macros jump to nla_put_failure when the skb runs out
 * of room, hence the label-based error path.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* PMTU discovery is reported as a boolean derived from DF. */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1555 
/* Netlink attribute policy for IFLA_GRE_*, shared by gre and gretap. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1568 
/* rtnetlink ops for layer-3 "gre" tunnel devices. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1581 
/* rtnetlink ops for Ethernet-over-GRE "gretap" devices. */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1594 
1595 /*
1596  *	And now the modules code and kernel interface.
1597  */
1598 
/*
 * Module init: register the GRE protocol handler, the per-namespace
 * state, and the two rtnl link types.  Each failure unwinds every
 * earlier registration via the goto chain, in reverse order.
 */
static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		return -EAGAIN;
	}

	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
	if (err < 0)
		goto gen_device_failed;

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
gen_device_failed:
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
	goto out;
}
1633 
/* Module exit: tear everything down in reverse order of ipgre_init(). */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
}
1642 
1643 module_init(ipgre_init);
1644 module_exit(ipgre_fini);
1645 MODULE_LICENSE("GPL");
1646 MODULE_ALIAS_RTNL_LINK("gre");
1647 MODULE_ALIAS_RTNL_LINK("gretap");
1648