xref: /linux/net/ipv4/ip_gre.c (revision 913df4453f85f1fe79b35ecf3c9a0c0b707d22a2)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32 
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46 
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52 
53 /*
54    Problems & solutions
55    --------------------
56 
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61 
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
   solution, but it supposes maintaining a new variable in ALL
67    skb, even if no tunneling is used.
68 
69    Current solution: HARD_TX_LOCK lock breaks dead loops.
70 
71 
72 
73    2. Networking dead loops would not kill routers, but would really
74    kill network. IP hop limit plays role of "t->recursion" in this case,
75    if we copy it from packet being encapsulated to upper header.
76    It is very good solution, but it introduces two problems:
77 
78    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
79      do not work over tunnels.
80    - traceroute does not work. I planned to relay ICMP from tunnel,
81      so that this problem would be solved and traceroute output
82      would even more informative. This idea appeared to be wrong:
83      only Linux complies to rfc1812 now (yes, guys, Linux is the only
84      true router now :-)), all routers (at least, in neighbourhood of mine)
85      return only 8 bytes of payload. It is the end.
86 
87    Hence, if we want that OSPF worked or traceroute said something reasonable,
88    we should search for another solution.
89 
90    One of them is to parse packet trying to detect inner encapsulation
91    made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, it is not a solution at all.
93 
94    Current solution: The solution was UNEXPECTEDLY SIMPLE.
95    We force DF flag on tunnels with preconfigured hop limit,
96    that is ALL. :-) Well, it does not remove the problem completely,
97    but exponential growth of network traffic is changed to linear
98    (branches, that exceed pmtu are pruned) and tunnel mtu
99    fastly degrades to value <68, where looping stops.
100    Yes, it is not good if there exists a router in the loop,
101    which does not force DF, even when encapsulating packets have DF set.
102    But it is not our problem! Nobody could accuse us, we made
103    all that we could make. Even if it is your gated who injected
104    fatal route to network, even if it were you who configured
105    fatal static route: you are innocent. :-)
106 
107 
108 
109    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
110    practically identical code. It would be good to glue them
111    together, but it is not very evident, how to make them modular.
112    sit is integral part of IPv6, ipip and gre are naturally modular.
113    We could extract common parts (hash table, ioctl etc)
114    to a separate module (ip_tunnel.c).
115 
116    Alexey Kuznetsov.
117  */
118 
/* Forward declarations for callbacks defined later in this file. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);
123 
/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

/* Key for net_generic(); one struct ipgre_net per network namespace. */
static int ipgre_net_id;
struct ipgre_net {
	/* [prio][hash] chains; prio encodes which of remote/local are
	 * configured — see the table comment and __ipgre_bucket() below. */
	struct ip_tunnel *tunnels[4][HASH_SIZE];

	/* The wildcard fallback device ("gre0") for this namespace. */
	struct net_device *fb_tunnel_dev;
};
134 
135 /* Tunnel hash table */
136 
137 /*
138    4 hash tables:
139 
140    3: (remote,local)
141    2: (remote,*)
142    1: (*,local)
143    0: (*,*)
144 
145    We require exact key match i.e. if a key is present in packet
146    it will match only tunnel with the same key; if it is not present,
147    it will match only keyless tunnel.
148 
149    All keysless packets, if not matched configured keyless tunnels
150    will match fallback tunnel.
151  */
152 
/* Fold a 32-bit address (or GRE key) into a 4-bit bucket index. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

#define tunnels_r_l	tunnels[3]	/* (remote, local) */
#define tunnels_r	tunnels[2]	/* (remote, *)     */
#define tunnels_l	tunnels[1]	/* (*, local)      */
#define tunnels_wc	tunnels[0]	/* (*, *) wildcard */

/* Protects the hash chains: read-locked on the packet paths,
 * write-locked by link/unlink (whose callers serialize via RTNL). */
static DEFINE_RWLOCK(ipgre_lock);
161 
162 /* Given src, dst and key, find appropriate for input tunnel. */
163 
164 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
165 					      __be32 remote, __be32 local,
166 					      __be32 key, __be16 gre_proto)
167 {
168 	struct net *net = dev_net(dev);
169 	int link = dev->ifindex;
170 	unsigned h0 = HASH(remote);
171 	unsigned h1 = HASH(key);
172 	struct ip_tunnel *t, *cand = NULL;
173 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
174 	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
175 		       ARPHRD_ETHER : ARPHRD_IPGRE;
176 	int score, cand_score = 4;
177 
178 	for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
179 		if (local != t->parms.iph.saddr ||
180 		    remote != t->parms.iph.daddr ||
181 		    key != t->parms.i_key ||
182 		    !(t->dev->flags & IFF_UP))
183 			continue;
184 
185 		if (t->dev->type != ARPHRD_IPGRE &&
186 		    t->dev->type != dev_type)
187 			continue;
188 
189 		score = 0;
190 		if (t->parms.link != link)
191 			score |= 1;
192 		if (t->dev->type != dev_type)
193 			score |= 2;
194 		if (score == 0)
195 			return t;
196 
197 		if (score < cand_score) {
198 			cand = t;
199 			cand_score = score;
200 		}
201 	}
202 
203 	for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
204 		if (remote != t->parms.iph.daddr ||
205 		    key != t->parms.i_key ||
206 		    !(t->dev->flags & IFF_UP))
207 			continue;
208 
209 		if (t->dev->type != ARPHRD_IPGRE &&
210 		    t->dev->type != dev_type)
211 			continue;
212 
213 		score = 0;
214 		if (t->parms.link != link)
215 			score |= 1;
216 		if (t->dev->type != dev_type)
217 			score |= 2;
218 		if (score == 0)
219 			return t;
220 
221 		if (score < cand_score) {
222 			cand = t;
223 			cand_score = score;
224 		}
225 	}
226 
227 	for (t = ign->tunnels_l[h1]; t; t = t->next) {
228 		if ((local != t->parms.iph.saddr &&
229 		     (local != t->parms.iph.daddr ||
230 		      !ipv4_is_multicast(local))) ||
231 		    key != t->parms.i_key ||
232 		    !(t->dev->flags & IFF_UP))
233 			continue;
234 
235 		if (t->dev->type != ARPHRD_IPGRE &&
236 		    t->dev->type != dev_type)
237 			continue;
238 
239 		score = 0;
240 		if (t->parms.link != link)
241 			score |= 1;
242 		if (t->dev->type != dev_type)
243 			score |= 2;
244 		if (score == 0)
245 			return t;
246 
247 		if (score < cand_score) {
248 			cand = t;
249 			cand_score = score;
250 		}
251 	}
252 
253 	for (t = ign->tunnels_wc[h1]; t; t = t->next) {
254 		if (t->parms.i_key != key ||
255 		    !(t->dev->flags & IFF_UP))
256 			continue;
257 
258 		if (t->dev->type != ARPHRD_IPGRE &&
259 		    t->dev->type != dev_type)
260 			continue;
261 
262 		score = 0;
263 		if (t->parms.link != link)
264 			score |= 1;
265 		if (t->dev->type != dev_type)
266 			score |= 2;
267 		if (score == 0)
268 			return t;
269 
270 		if (score < cand_score) {
271 			cand = t;
272 			cand_score = score;
273 		}
274 	}
275 
276 	if (cand != NULL)
277 		return cand;
278 
279 	if (ign->fb_tunnel_dev->flags & IFF_UP)
280 		return netdev_priv(ign->fb_tunnel_dev);
281 
282 	return NULL;
283 }
284 
285 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
286 		struct ip_tunnel_parm *parms)
287 {
288 	__be32 remote = parms->iph.daddr;
289 	__be32 local = parms->iph.saddr;
290 	__be32 key = parms->i_key;
291 	unsigned h = HASH(key);
292 	int prio = 0;
293 
294 	if (local)
295 		prio |= 1;
296 	if (remote && !ipv4_is_multicast(remote)) {
297 		prio |= 2;
298 		h ^= HASH(remote);
299 	}
300 
301 	return &ign->tunnels[prio][h];
302 }
303 
304 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
305 		struct ip_tunnel *t)
306 {
307 	return __ipgre_bucket(ign, &t->parms);
308 }
309 
310 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
311 {
312 	struct ip_tunnel **tp = ipgre_bucket(ign, t);
313 
314 	t->next = *tp;
315 	write_lock_bh(&ipgre_lock);
316 	*tp = t;
317 	write_unlock_bh(&ipgre_lock);
318 }
319 
320 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
321 {
322 	struct ip_tunnel **tp;
323 
324 	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
325 		if (t == *tp) {
326 			write_lock_bh(&ipgre_lock);
327 			*tp = t->next;
328 			write_unlock_bh(&ipgre_lock);
329 			break;
330 		}
331 	}
332 }
333 
334 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
335 					   struct ip_tunnel_parm *parms,
336 					   int type)
337 {
338 	__be32 remote = parms->iph.daddr;
339 	__be32 local = parms->iph.saddr;
340 	__be32 key = parms->i_key;
341 	int link = parms->link;
342 	struct ip_tunnel *t, **tp;
343 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
344 
345 	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
346 		if (local == t->parms.iph.saddr &&
347 		    remote == t->parms.iph.daddr &&
348 		    key == t->parms.i_key &&
349 		    link == t->parms.link &&
350 		    type == t->dev->type)
351 			break;
352 
353 	return t;
354 }
355 
356 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
357 		struct ip_tunnel_parm *parms, int create)
358 {
359 	struct ip_tunnel *t, *nt;
360 	struct net_device *dev;
361 	char name[IFNAMSIZ];
362 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
363 
364 	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
365 	if (t || !create)
366 		return t;
367 
368 	if (parms->name[0])
369 		strlcpy(name, parms->name, IFNAMSIZ);
370 	else
371 		sprintf(name, "gre%%d");
372 
373 	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
374 	if (!dev)
375 	  return NULL;
376 
377 	dev_net_set(dev, net);
378 
379 	if (strchr(name, '%')) {
380 		if (dev_alloc_name(dev, name) < 0)
381 			goto failed_free;
382 	}
383 
384 	nt = netdev_priv(dev);
385 	nt->parms = *parms;
386 	dev->rtnl_link_ops = &ipgre_link_ops;
387 
388 	dev->mtu = ipgre_tunnel_bind_dev(dev);
389 
390 	if (register_netdevice(dev) < 0)
391 		goto failed_free;
392 
393 	dev_hold(dev);
394 	ipgre_tunnel_link(ign, nt);
395 	return nt;
396 
397 failed_free:
398 	free_netdev(dev);
399 	return NULL;
400 }
401 
402 static void ipgre_tunnel_uninit(struct net_device *dev)
403 {
404 	struct net *net = dev_net(dev);
405 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
406 
407 	ipgre_tunnel_unlink(ign, netdev_priv(dev));
408 	dev_put(dev);
409 }
410 
411 
412 static void ipgre_err(struct sk_buff *skb, u32 info)
413 {
414 
415 /* All the routers (except for Linux) return only
416    8 bytes of packet payload. It means, that precise relaying of
417    ICMP in the real Internet is absolutely infeasible.
418 
419    Moreover, Cisco "wise men" put GRE key to the third word
420    in GRE header. It makes impossible maintaining even soft state for keyed
421    GRE tunnels with enabled checksum. Tell them "thank you".
422 
423    Well, I wonder, rfc1812 was written by Cisco employee,
424    what the hell these idiots break standrads established
425    by themself???
426  */
427 
428 	struct iphdr *iph = (struct iphdr *)skb->data;
429 	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
430 	int grehlen = (iph->ihl<<2) + 4;
431 	const int type = icmp_hdr(skb)->type;
432 	const int code = icmp_hdr(skb)->code;
433 	struct ip_tunnel *t;
434 	__be16 flags;
435 
436 	flags = p[0];
437 	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
438 		if (flags&(GRE_VERSION|GRE_ROUTING))
439 			return;
440 		if (flags&GRE_KEY) {
441 			grehlen += 4;
442 			if (flags&GRE_CSUM)
443 				grehlen += 4;
444 		}
445 	}
446 
447 	/* If only 8 bytes returned, keyed message will be dropped here */
448 	if (skb_headlen(skb) < grehlen)
449 		return;
450 
451 	switch (type) {
452 	default:
453 	case ICMP_PARAMETERPROB:
454 		return;
455 
456 	case ICMP_DEST_UNREACH:
457 		switch (code) {
458 		case ICMP_SR_FAILED:
459 		case ICMP_PORT_UNREACH:
460 			/* Impossible event. */
461 			return;
462 		case ICMP_FRAG_NEEDED:
463 			/* Soft state for pmtu is maintained by IP core. */
464 			return;
465 		default:
466 			/* All others are translated to HOST_UNREACH.
467 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
468 			   I believe they are just ether pollution. --ANK
469 			 */
470 			break;
471 		}
472 		break;
473 	case ICMP_TIME_EXCEEDED:
474 		if (code != ICMP_EXC_TTL)
475 			return;
476 		break;
477 	}
478 
479 	read_lock(&ipgre_lock);
480 	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
481 				flags & GRE_KEY ?
482 				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
483 				p[1]);
484 	if (t == NULL || t->parms.iph.daddr == 0 ||
485 	    ipv4_is_multicast(t->parms.iph.daddr))
486 		goto out;
487 
488 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
489 		goto out;
490 
491 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
492 		t->err_count++;
493 	else
494 		t->err_count = 1;
495 	t->err_time = jiffies;
496 out:
497 	read_unlock(&ipgre_lock);
498 	return;
499 }
500 
501 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
502 {
503 	if (INET_ECN_is_ce(iph->tos)) {
504 		if (skb->protocol == htons(ETH_P_IP)) {
505 			IP_ECN_set_ce(ip_hdr(skb));
506 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
507 			IP6_ECN_set_ce(ipv6_hdr(skb));
508 		}
509 	}
510 }
511 
512 static inline u8
513 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
514 {
515 	u8 inner = 0;
516 	if (skb->protocol == htons(ETH_P_IP))
517 		inner = old_iph->tos;
518 	else if (skb->protocol == htons(ETH_P_IPV6))
519 		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
520 	return INET_ECN_encapsulate(tos, inner);
521 }
522 
/*
 * Receive handler for IPPROTO_GRE.  On entry skb->data points at the
 * GRE header (the outer IP header is still reachable via ip_hdr()).
 * Parses the optional checksum/key/sequence fields, finds the matching
 * tunnel and re-injects the inner packet on the tunnel device.
 * Always consumes the skb and returns 0.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;		/* bytes of GRE header consumed so far */
	__be16 gre_proto;
	unsigned int len;

	/* 16 = 4-byte base header + worst case csum+key+seq (3 * 4). */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			/* Verify the GRE checksum over the whole packet;
			 * csum != 0 means verification failed (checked
			 * further down, after tunnel lookup). */
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct net_device_stats *stats = &tunnel->dev->stats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* High nibble 4 => inner IPv4 starts here (WCCPv1);
			 * otherwise skip the WCCPv2 redirect header. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		/* Strip the GRE header; mac_header keeps the outer IP hdr. */
		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (skb_rtable(skb)->fl.iif == 0)
				goto drop;
			stats->multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Drop on bad checksum, or when the tunnel requires a
		 * checksum and the packet carried none. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			stats->rx_crc_errors++;
			stats->rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			/* Signed wrap-safe "seqno went backwards" check. */
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				stats->rx_fifo_errors++;
				stats->rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		len = skb->len;

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				stats->rx_length_errors++;
				stats->rx_errors++;
				goto drop;
			}

			/* Re-read after the pull; old pointers are stale. */
			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		stats->rx_packets++;
		stats->rx_bytes += len;
		skb->dev = tunnel->dev;
		skb_dst_drop(skb);
		nf_reset(skb);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);
		read_unlock(&ipgre_lock);
		return(0);
	}
	/* No tunnel matched: behave like an unreachable protocol port. */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	read_unlock(&ipgre_lock);
drop_nolock:
	kfree_skb(skb);
	return(0);
}
661 
/*
 * Transmit path: route to the tunnel endpoint, prepend the outer
 * IP + GRE headers and hand the packet to the IP layer via
 * IPTUNNEL_XMIT().  Always returns NETDEV_TX_OK; on any error the skb
 * is freed and a tx_* counter is bumped.
 */
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->dev->stats;
	struct iphdr  *old_iph = ip_hdr(skb);
	struct iphdr  *tiph;			/* Template for the outer header */
	u8     tos;
	__be16 df;
	struct rtable *rt;     			/* Route to the other host */
	struct net_device *tdev;			/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	/* When header_ops is set on an ARPHRD_IPGRE device, ipgre_header()
	 * already built the outer header at skb->data; use it as the
	 * template and push no additional bytes. */
	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			stats->tx_fifo_errors++;
			goto tx_error;
		}

		/* Derive the endpoint from the inner packet's next hop. */
		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb_dst(skb)->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible addresses carry a usable
			 * IPv4 endpoint in the low 32 bits. */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	/* tos == 1 is this driver's "inherit from inner IPv4 header". */
	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
	}

	{
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = dst,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
			stats->tx_carrier_errors++;
			goto tx_error;
		}
	}
	tdev = rt->u.dst.dev;

	/* Routing back to ourselves would recurse; see the "dead loops"
	 * discussion at the top of this file. */
	if (tdev == dev) {
		ip_rt_put(rt);
		stats->collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		/* Honour the inner DF bit and report PMTU to the sender. */
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the reduced tunnel MTU on host routes so future
		 * packets are sized correctly. */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	/* Rate-limited replay of ICMP errors recorded by ipgre_err(). */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

	/* Reallocate when there is no room for the outer headers or the
	 * skb cannot be written in place. */
	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			stats->tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->u.dst);

	/*
	 *	Push down and install the IPIP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	rt->rt_dst;
	iph->saddr		=	rt->rt_src;

	/* ttl == 0 on the tunnel means "inherit from the inner packet". */
	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
	}

	/* Base GRE header: flags + protocol. */
	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	/* Optional fields are filled from the back of the header (seq is
	 * last on the wire, then key, then checksum). */
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	return NETDEV_TX_OK;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}
892 
/*
 * Derive the tunnel device's MTU and needed_headroom from the device
 * the encapsulated traffic will leave through.  Returns the MTU the
 * caller should assign to dev->mtu (clamped to >= 68, the minimum
 * IPv4 MTU); also sets tunnel->hlen to the outer IP+GRE header size.
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + base GRE hdr */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No usable route: fall back to the explicitly configured link. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
952 
/*
 * Handle the SIOC{GET,ADD,CHG,DEL}TUNNEL device ioctls.  The caller is
 * expected to hold RTNL (unregister_netdevice() below requires it).
 * Tunnel parameters are exchanged with userspace as a struct
 * ip_tunnel_parm via ifr->ifr_ifru.ifru_data.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		/* On the fallback device, look up the tunnel described by
		 * the user-supplied parameters; otherwise report ours. */
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Sanity-check the outer header template and GRE flags. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		/* Fixed TTL implies DF (see "Problems & solutions" above). */
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		/* Keys are meaningful only when the KEY flag is set. */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* The new parameters collide with another
				 * existing tunnel. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags = 0;

				t = netdev_priv(dev);

				/* The device's broadcast/p-t-p nature is
				 * fixed at creation and must not change. */
				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-hash under the new endpoint/key. */
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			/* The fallback tunnel itself may not be deleted. */
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1082 
1083 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1084 {
1085 	struct ip_tunnel *tunnel = netdev_priv(dev);
1086 	if (new_mtu < 68 ||
1087 	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1088 		return -EINVAL;
1089 	dev->mtu = new_mtu;
1090 	return 0;
1091 }
1092 
1093 /* Nice toy. Unfortunately, useless in real life :-)
1094    It allows to construct virtual multiprotocol broadcast "LAN"
1095    over the Internet, provided multicast routing is tuned.
1096 
1097 
   I have no idea whether this bicycle was invented before me,
1099    so that I had to set ARPHRD_IPGRE to a random value.
1100    I have an impression, that Cisco could make something similar,
1101    but this feature is apparently missing in IOS<=11.2(8).
1102 
1103    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1104    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1105 
1106    ping -t 255 224.66.66.66
1107 
1108    If nobody answers, mbone does not work.
1109 
1110    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1111    ip addr add 10.66.66.<somewhat>/24 dev Universe
1112    ifconfig Universe up
1113    ifconfig Universe add fe80::<Your_real_addr>/10
1114    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1115    ftp 10.66.66.66
1116    ...
1117    ftp fec0:6666:6666::193.233.7.65
1118    ...
1119 
1120  */
1121 
/*
 * header_ops->create handler: build the outer IP + GRE header in front
 * of the payload (used by the broadcast/NBMA "virtual LAN" mode above).
 *
 * Pushes t->hlen bytes, copies the template IP header from the tunnel
 * parameters and fills in the GRE flags and protocol type.  saddr and
 * daddr, when non-NULL, are 4-byte IPv4 addresses that override the
 * template endpoints.
 *
 * Returns t->hlen when the destination is resolved, or -t->hlen when
 * the (multicast or unset) destination must still be filled in before
 * transmission.
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);	/* GRE header starts right after the IP header */

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;	/* GRE flags word */
	p[1]		= htons(type);		/* encapsulated protocol */

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);

	if (daddr) {
		memcpy(&iph->daddr, daddr, 4);
		return t->hlen;
	}
	/* Fixed unicast peer from the tunnel config: header is complete. */
	if (iph->daddr && !ipv4_is_multicast(iph->daddr))
		return t->hlen;

	return -t->hlen;
}
1150 
1151 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1152 {
1153 	struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1154 	memcpy(haddr, &iph->saddr, 4);
1155 	return 4;
1156 }
1157 
/* Link-layer header ops for broadcast/NBMA GRE devices, where the
 * "hardware" address is the 4-byte IPv4 tunnel endpoint. */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
1162 
1163 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ndo_open for broadcast-mode GRE: when the tunnel destination is
 * multicast, route it to find the underlying device and join the group
 * there, remembering the ifindex so ipgre_close() can leave it again.
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		/* Resolve the output route for the multicast destination. */
		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->u.dst.dev;	/* switch to the underlying device */
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;	/* remembered for ipgre_close() */
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1187 
1188 static int ipgre_close(struct net_device *dev)
1189 {
1190 	struct ip_tunnel *t = netdev_priv(dev);
1191 
1192 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1193 		struct in_device *in_dev;
1194 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1195 		if (in_dev) {
1196 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1197 			in_dev_put(in_dev);
1198 		}
1199 	}
1200 	return 0;
1201 }
1202 
1203 #endif
1204 
/* Device ops for plain (layer-3) GRE tunnel devices; the open/stop
 * hooks are only needed for the multicast broadcast mode. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1216 
/* Base setup for plain GRE tunnel devices.  Headroom and MTU are sized
 * for an outer IP header plus the minimal 4-byte GRE header; they are
 * refined later once tunnel parameters are known. */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor 	= free_netdev;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* "hardware" address is an IPv4 address */
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
1231 
/*
 * ndo_init for plain GRE devices: record the device back-pointer and
 * name, expose the tunnel endpoints as the device hw/broadcast address,
 * and install header_ops for the NBMA and multicast-broadcast modes.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* Local/remote IPv4 endpoints double as dev/broadcast addresses. */
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Broadcast "LAN" mode requires a fixed local
			 * address to join the multicast group. */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		/* No fixed peer (NBMA): destination supplied per packet
		 * via header_ops->create. */
		dev->header_ops = &ipgre_header_ops;

	return 0;
}
1260 
/* Initialize the per-namespace fallback "gre0" device, installed as the
 * wildcard (keyless, addressless) entry of the tunnel hash table. */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;	/* minimal GRE header */

	/* Reference owned by the wildcard hash slot. */
	dev_hold(dev);
	ign->tunnels_wc[0]	= tunnel;
}
1278 
1279 
/* Inet protocol hook: receives all IPPROTO_GRE packets and ICMP errors
 * for them; safe to use from any network namespace. */
static const struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,
	.err_handler	=	ipgre_err,
	.netns_ok	=	1,
};
1285 
1286 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1287 {
1288 	int prio;
1289 
1290 	for (prio = 0; prio < 4; prio++) {
1291 		int h;
1292 		for (h = 0; h < HASH_SIZE; h++) {
1293 			struct ip_tunnel *t;
1294 			while ((t = ign->tunnels[prio][h]) != NULL)
1295 				unregister_netdevice(t->dev);
1296 		}
1297 	}
1298 }
1299 
/*
 * Per-namespace init: allocate the ipgre_net state, publish it via the
 * generic net pointer array, then create and register the fallback
 * "gre0" device.  Failures unwind through the goto chain below.
 */
static int ipgre_init_net(struct net *net)
{
	int err;
	struct ipgre_net *ign;

	err = -ENOMEM;
	ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
	if (ign == NULL)
		goto err_alloc;

	err = net_assign_generic(net, ipgre_net_id, ign);
	if (err < 0)
		goto err_assign;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	return 0;

err_reg_dev:
	free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
	/* nothing */
err_assign:
	kfree(ign);
err_alloc:
	return err;
}
1339 
1340 static void ipgre_exit_net(struct net *net)
1341 {
1342 	struct ipgre_net *ign;
1343 
1344 	ign = net_generic(net, ipgre_net_id);
1345 	rtnl_lock();
1346 	ipgre_destroy_tunnels(ign);
1347 	rtnl_unlock();
1348 	kfree(ign);
1349 }
1350 
/* Network-namespace lifecycle hooks for the GRE module. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
};
1355 
1356 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1357 {
1358 	__be16 flags;
1359 
1360 	if (!data)
1361 		return 0;
1362 
1363 	flags = 0;
1364 	if (data[IFLA_GRE_IFLAGS])
1365 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1366 	if (data[IFLA_GRE_OFLAGS])
1367 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1368 	if (flags & (GRE_VERSION|GRE_ROUTING))
1369 		return -EINVAL;
1370 
1371 	return 0;
1372 }
1373 
/*
 * Netlink validation for "gretap" links: check the optional Ethernet
 * address attribute, reject a zero remote address, then apply the
 * common GRE flag checks.
 */
static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		/* gretap needs a concrete peer to send to. */
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}
1397 
/*
 * Translate IFLA_GRE_* netlink attributes into an ip_tunnel_parm.
 * Absent attributes keep their zeroed defaults, except path MTU
 * discovery, which defaults to on (DF set) unless IFLA_GRE_PMTUDISC
 * explicitly disables it.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* PMTU discovery is on by default. */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1438 
1439 static int ipgre_tap_init(struct net_device *dev)
1440 {
1441 	struct ip_tunnel *tunnel;
1442 
1443 	tunnel = netdev_priv(dev);
1444 
1445 	tunnel->dev = dev;
1446 	strcpy(tunnel->parms.name, dev->name);
1447 
1448 	ipgre_tunnel_bind_dev(dev);
1449 
1450 	return 0;
1451 }
1452 
/* Device ops for "gretap" (Ethernet-over-GRE) devices. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address 	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1461 
1462 static void ipgre_tap_setup(struct net_device *dev)
1463 {
1464 
1465 	ether_setup(dev);
1466 
1467 	dev->netdev_ops		= &ipgre_netdev_ops;
1468 	dev->destructor 	= free_netdev;
1469 
1470 	dev->iflink		= 0;
1471 	dev->features		|= NETIF_F_NETNS_LOCAL;
1472 }
1473 
/*
 * rtnl newlink handler shared by "gre" and "gretap": parse attributes,
 * refuse a duplicate tunnel, bind to the underlying device to size the
 * MTU, then register the netdev and link it into the per-net hash.
 */
static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	/* gretap without an explicit MAC address gets a random one. */
	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;	/* no user-specified MTU: use computed one */

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);	/* reference owned by the tunnel hash table */
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1506 
/*
 * rtnl changelink handler: reconfigure an existing tunnel.  The per-net
 * fallback device cannot be edited.  If the new parameters match some
 * other tunnel, fail with -EEXIST; otherwise re-hash this tunnel under
 * its new addresses/ikey, then apply the non-hashed fields in place.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* Parameters already in use: OK only if by this device. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		unsigned nflags = 0;

		t = nt;

		if (ipv4_is_multicast(p.iph.daddr))
			nflags = IFF_BROADCAST;
		else if (p.iph.daddr)
			nflags = IFF_POINTOPOINT;

		/* The broadcast/point-to-point mode cannot change here. */
		if ((dev->flags ^ nflags) &
		    (IFF_POINTOPOINT | IFF_BROADCAST))
			return -EINVAL;

		/* Addresses and i_key are part of the hash key: unlink,
		 * update, relink. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		memcpy(dev->dev_addr, &p.iph.saddr, 4);
		memcpy(dev->broadcast, &p.iph.daddr, 4);
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	/* Fields that do not affect hashing are updated in place. */
	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		/* New underlying device: recompute headroom and MTU. */
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1566 
/* Worst-case netlink attribute space needed by ipgre_fill_info(). */
static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}
1592 
/*
 * Dump the tunnel configuration as IFLA_GRE_* attributes.  The NLA_PUT*
 * macros jump to nla_put_failure when the skb runs out of room, in
 * which case -EMSGSIZE tells the caller to retry with a bigger buffer.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* PMTU discovery reported as a boolean derived from the DF bit. */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1614 
/* Netlink attribute policy for IFLA_GRE_*; the address attributes are
 * checked by payload length only. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1627 
/* rtnl_link_ops for plain layer-3 "gre" tunnel devices. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1640 
/* rtnl_link_ops for "gretap" (Ethernet-over-GRE) devices. */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1653 
1654 /*
1655  *	And now the modules code and kernel interface.
1656  */
1657 
/*
 * Module init: register the IPPROTO_GRE protocol handler, the per-net
 * operations, and both rtnl link ops; unwind in reverse order through
 * the goto chain on any failure.
 */
static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		return -EAGAIN;
	}

	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
	if (err < 0)
		goto gen_device_failed;

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
gen_device_failed:
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
	goto out;
}
1692 
/* Module exit: undo ipgre_init() in reverse registration order. */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
}
1701 
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Auto-load this module when a "gre" or "gretap" rtnl link is created. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
1707