/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it requires maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter; it is safe because cpu migration is forbidden once we enter
   the first ndo_xmit(). We force an exit if this counter reaches
   RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but they would
   really kill the network. The IP hop limit plays the role of
   "t->recursion" in this case, if we copy it from the packet being
   encapsulated to the upper header.
   It is a very good solution, but it introduces two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least in my
     neighbourhood) return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect an inner
   encapsulation made by our node. This is difficult or even impossible,
   especially taking fragmentation into account. In short, ttl is not
   a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value < 68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF even when the packets being encapsulated
   have DF set. But it is not our problem! Nobody could accuse us;
   we did all that we could. Even if it is your gated that injected
   the fatal route into the network, even if it was you who configured
   the fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not obvious how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctl, etc.)
   into a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */
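
/*
 * Illustrative sketch of the xmit_recursion guard described in (1)
 * above.  The real counter and RECURSION_LIMIT live in net/core/dev.c;
 * every name below is made up for exposition, and the block is not
 * built.
 */
#if 0
static DEFINE_PER_CPU(int, example_xmit_recursion);
#define EXAMPLE_RECURSION_LIMIT 10

static int example_dev_xmit(struct sk_buff *skb)
{
	/* A percpu counter is safe here: cpu migration is forbidden
	 * once we are inside ndo_start_xmit().
	 */
	if (__this_cpu_read(example_xmit_recursion) > EXAMPLE_RECURSION_LIMIT) {
		kfree_skb(skb);		/* break the local dead loop */
		return NET_XMIT_DROP;
	}
	__this_cpu_inc(example_xmit_recursion);
	/* ... hand the packet to the underlying device's xmit ... */
	__this_cpu_dec(example_xmit_recursion);
	return NET_XMIT_SUCCESS;
}
#endif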

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
 */

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
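
/*
 * Illustrative only (not built): HASH() folds the two low nibbles of
 * the 32-bit value into a 0..15 bucket index.  E.g. on a big-endian
 * host the __be32 for 192.0.2.1 reads as 0xc0000201, and
 * (0xc0000201 ^ 0x0c000020) & 0xF = 0x1, i.e. bucket 1.
 */
#if 0
static unsigned int example_hash_bucket(__be32 remote, __be32 key)
{
	/* Keyed (remote,local) tunnels combine both nibble folds,
	 * matching the h0 ^ h1 probes in ipgre_tunnel_lookup().
	 */
	return HASH(remote) ^ HASH(key);
}
#endif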

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
/*
 * Locking: hash tables are protected by RCU and RTNL
 */

#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
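
/*
 * NB: for_each_ip_tunnel_rcu() expects a local variable t in scope and
 * an RCU read-side critical section; writers walk the buckets with
 * rtnl_dereference() under RTNL instead.
 */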

/* often modified stats are per cpu, others are shared (netdev->stats) */
struct pcpu_tstats {
	u64	rx_packets;
	u64	rx_bytes;
	u64	tx_packets;
	u64	tx_bytes;
	struct u64_stats_sync	syncp;
};
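
/*
 * Readers of these counters (ipgre_get_stats64() below) loop on syncp:
 * u64_stats_fetch_begin_bh() snapshots the sequence count and
 * u64_stats_fetch_retry_bh() rereads until no writer raced with the
 * snapshot.  This is what keeps 64-bit counters cheap on 32-bit SMP.
 */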

static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
						   struct rtnl_link_stats64 *tot)
{
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_bh(&tstats->syncp);
			rx_packets = tstats->rx_packets;
			tx_packets = tstats->tx_packets;
			rx_bytes = tstats->rx_bytes;
			tx_bytes = tstats->tx_bytes;
		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes   += rx_bytes;
		tot->tx_bytes   += tx_bytes;
	}

	tot->multicast = dev->stats.multicast;
	tot->rx_crc_errors = dev->stats.rx_crc_errors;
	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_errors = dev->stats.rx_errors;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
	tot->tx_errors = dev->stats.tx_errors;

	return tot;
}

/* Given src, dst and key, find the appropriate tunnel for input. */
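/*
 * The lookup walks the four tables from most to least specific.  Within
 * a table an exact match on underlay link and device type wins
 * immediately (score 0); otherwise the best near miss is remembered:
 * bit 0 of the score marks a link mismatch, bit 1 a device type
 * mismatch, and the lowest-scoring candidate wins at the end.
 */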

static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;

	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}

static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned int h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}
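
/*
 * Example (illustrative): a keyed tunnel with both endpoints set and a
 * unicast remote gets prio 3 (the (remote,local) table) and lands in
 * bucket HASH(key) ^ HASH(remote); a wildcard tunnel with neither
 * endpoint set gets prio 0 and bucket HASH(key).
 */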

static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t;
	struct ip_tunnel __rcu **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}

static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}


static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, the Cisco "wise men" put the GRE key in the third word
   of the GRE header. That makes it impossible to maintain even soft
   state for keyed GRE tunnels with checksums enabled. Tell them
   "thank you".

   Well, I wonder: rfc1812 was written by a Cisco employee; why the
   hell do these idiots break the standards they established
   themselves???
 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes were returned, the keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH;
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	rcu_read_lock();
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

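	/* Record the error, rate-limited by IPTUNNEL_ERR_TIMEO;
	 * ipgre_tunnel_xmit() replays a pending error to the sender
	 * via dst_link_failure().
	 */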
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}

static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));
		}
	}
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}

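/*
 * GRE wire format parsed below (RFC 1701/2784, version 0):
 *
 *	2 bytes flags/version, 2 bytes protocol	(always present)
 *	4 bytes checksum + reserved		(iff GRE_CSUM)
 *	4 bytes key				(iff GRE_KEY)
 *	4 bytes sequence number			(iff GRE_SEQ)
 *
 * Hence the pskb_may_pull(skb, 16) below: a 4-byte base header plus at
 * most three optional 4-byte fields.
 */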
static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;
	__be16 gre_proto;

	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}

static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr  *old_iph = ip_hdr(skb);
	const struct iphdr  *tiph;
	struct flowi4 fl4;
	u8     tos;
	__be16 df;
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (const struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt->rt_gateway;
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;
	}

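	/* tos == 1 is the "inherit" magic value (cf. "ip tunnel ... tos
	 * inherit"): take the tos bits from the inner packet instead of
	 * the tunnel configuration.
	 */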
	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	}

	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the new outer IP header.
	 */

	iph			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	fl4.daddr;
	iph->saddr		=	fl4.saddr;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	}

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;
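
	/* The optional fields follow the base header in the order
	 * checksum, key, sequence number (RFC 1701).  ptr starts at the
	 * last word and walks backwards, so the checksum, computed over
	 * everything behind the IP header, is filled in last.
	 */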
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

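/*
 * Worked example (illustrative): for a plain gre device over an
 * Ethernet underlay, addend = 20 (outer iph) + 4 (base GRE header),
 * giving mtu = 1500 - 24 = 1476; GRE_CSUM, GRE_KEY and GRE_SEQ each
 * add 4 more bytes (e.g. csum + key gives 1500 - 32 = 1468).
 */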
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess the output device to choose a reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}

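/*
 * SIOC{GET,ADD,CHG,DEL}TUNNEL make up the legacy interface driven by
 * "ip tunnel"; the rtnl_link_ops further down (ipgre_newlink() and
 * friends) are the modern netlink equivalent.
 */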
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

/* A nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   which is why I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16 *)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
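
	/* Per the hard_header convention, a positive return means the
	 * header is complete; a negative length marks it as provisional
	 * (NBMA tunnel, destination not known yet).
	 */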
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}

#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
}


static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ign->tunnels[prio][h]);

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}

static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;
	LIST_HEAD(list);

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}

static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}

static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

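/*
 * Userspace view (illustrative): both rtnl_link_ops registered below
 * are driven with iproute2, e.g.
 *
 *	ip link add gre1 type gre local 198.51.100.2 remote 203.0.113.1 \
 *		ttl 64 key 42
 *	ip link add tap1 type gretap local 198.51.100.2 remote 203.0.113.1
 *
 * The attributes map onto the IFLA_GRE_* values parsed in
 * ipgre_netlink_parms() above.
 */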
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

/*
 *	And now the module code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");