xref: /linux/net/ipv4/ip_gre.c (revision b889fcf63cb62e7fdb7816565e28f44dbe4a76a5)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/mroute.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
35 
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ipip.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
50 
51 #if IS_ENABLED(CONFIG_IPV6)
52 #include <net/ipv6.h>
53 #include <net/ip6_fib.h>
54 #include <net/ip6_route.h>
55 #endif
56 
57 /*
58    Problems & solutions
59    --------------------
60 
61    1. The most important issue is detecting local dead loops.
62    They would cause a complete host lockup in transmit, which
63    would be "resolved" by stack overflow or, if queueing is enabled,
64    by infinite looping in net_bh.
65 
66    We cannot track such dead loops during route installation;
67    it is an infeasible task. The most general solution would be
68    to keep an skb->encapsulation counter (a sort of local ttl)
69    and silently drop the packet when it expires. It is a good
70    solution, but it supposes maintaining a new variable in ALL
71    skbs, even when no tunneling is used.
72 
73    Current solution: xmit_recursion breaks dead loops. This is a percpu
74    counter, since when we enter the first ndo_start_xmit(), cpu migration
75    is forbidden. We exit if it reaches RECURSION_LIMIT (sketch below).
76 
77    2. Networking dead loops would not kill routers, but would really
78    kill the network. The IP hop limit plays the role of "t->recursion"
79    here, if we copy it from the packet being encapsulated to the upper
80    header. It is a very good solution, but it introduces two problems:
81 
82    - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
83      do not work over tunnels.
84    - traceroute does not work. I planned to relay ICMP from the tunnel,
85      so that this problem would be solved and traceroute output would
86      be even more informative. This idea appeared to be wrong: only
87      Linux complies with rfc1812 now (yes, guys, Linux is the only
88      true router now :-)); all routers (at least in my neighbourhood)
89      return only 8 bytes of payload. That is the end of it.
90 
91    Hence, if we want OSPF to work or traceroute to say something
92    reasonable, we should search for another solution.
93 
94    One of them is to parse the packet, trying to detect an inner
95    encapsulation made by our node. That is difficult or even impossible,
96    especially with fragmentation. In short, ttl is no solution at all.
97 
98    Current solution: the solution was UNEXPECTEDLY SIMPLE.
99    We force the DF flag on tunnels with a preconfigured hop limit;
100    that is ALL. :-) Well, it does not remove the problem completely,
101    but the exponential growth of network traffic is changed to linear
102    (branches that exceed the pmtu are pruned) and the tunnel mtu
103    rapidly degrades to a value < 68, where the looping stops.
104    Yes, it is not good if there is a router in the loop which does
105    not force DF, even when the encapsulated packets have DF set.
106    But that is not our problem! Nobody could accuse us; we did
107    all that we could. Even if it was your gated that injected the
108    fatal route into the network, even if it was you who configured the
109    fatal static route: you are innocent. :-)
110 
111 
112 
113    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
114    practically identical code. It would be good to glue them
115    together, but it is not obvious how to make them modular.
116    sit is an integral part of IPv6; ipip and gre are naturally modular.
117    We could extract the common parts (hash table, ioctl etc.)
118    into a separate module (ip_tunnel.c).
119 
120    Alexey Kuznetsov.
121  */
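
/* A condensed sketch of the xmit_recursion guard mentioned in point 1
 * above. It lives in the core stack (net/core/dev.c in kernels of this
 * era), not in this file; the helper below is hypothetical and only
 * illustrates the idea, with locking and error paths elided.
 */
#if 0	/* illustrative only -- the real counter is in net/core/dev.c */
#define RECURSION_LIMIT 10
static DEFINE_PER_CPU(int, xmit_recursion);

static int xmit_recursion_guarded_xmit(struct sk_buff *skb,
				       struct net_device *dev)
{
	int rc;

	/* CPU migration is disabled while ndo_start_xmit() runs, so a
	 * per-cpu counter measures the depth of the current tunnel
	 * transmit call chain on this CPU.
	 */
	if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) {
		net_crit_ratelimited("Dead loop on virtual device %s\n",
				     dev->name);
		kfree_skb(skb);
		return NET_XMIT_DROP;
	}

	__this_cpu_inc(xmit_recursion);
	rc = dev->netdev_ops->ndo_start_xmit(skb, dev);
	__this_cpu_dec(xmit_recursion);
	return rc;
}
#endif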
122 
123 static bool log_ecn_error = true;
124 module_param(log_ecn_error, bool, 0644);
125 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126 
127 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
128 static int ipgre_tunnel_init(struct net_device *dev);
129 static void ipgre_tunnel_setup(struct net_device *dev);
130 static int ipgre_tunnel_bind_dev(struct net_device *dev);
131 
132 /* Fallback tunnel: no source, no destination, no key, no options */
133 
134 #define HASH_SIZE  16
135 
136 static int ipgre_net_id __read_mostly;
137 struct ipgre_net {
138 	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
139 
140 	struct net_device *fb_tunnel_dev;
141 };
142 
143 /* Tunnel hash table */
144 
145 /*
146    4 hash tables:
147 
148    3: (remote,local)
149    2: (remote,*)
150    1: (*,local)
151    0: (*,*)
152 
153    We require an exact key match, i.e. if a key is present in the packet
154    it will match only a tunnel with the same key; if it is not present,
155    it will match only a keyless tunnel.
156 
157    All keyless packets that do not match a configured keyless tunnel
158    will match the fallback tunnel.
159  */
160 
161 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
162 
163 #define tunnels_r_l	tunnels[3]
164 #define tunnels_r	tunnels[2]
165 #define tunnels_l	tunnels[1]
166 #define tunnels_wc	tunnels[0]
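
/* Worked example for the tables above (addresses are illustrative):
 * a tunnel configured with local 192.0.2.1, unicast remote 198.51.100.7
 * and key K has prio 3 and is linked into
 * tunnels_r_l[HASH(K) ^ HASH(remote)]; drop the local address and it
 * moves to tunnels_r[HASH(K) ^ HASH(remote)] (prio 2). HASH() folds its
 * argument to 4 bits, so each table has HASH_SIZE == 16 buckets, and
 * collisions are chained via t->next (see __ipgre_bucket() and
 * ipgre_tunnel_lookup() below).
 */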
167 
168 static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
169 						   struct rtnl_link_stats64 *tot)
170 {
171 	int i;
172 
173 	for_each_possible_cpu(i) {
174 		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
175 		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
176 		unsigned int start;
177 
178 		do {
179 			start = u64_stats_fetch_begin_bh(&tstats->syncp);
180 			rx_packets = tstats->rx_packets;
181 			tx_packets = tstats->tx_packets;
182 			rx_bytes = tstats->rx_bytes;
183 			tx_bytes = tstats->tx_bytes;
184 		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
185 
186 		tot->rx_packets += rx_packets;
187 		tot->tx_packets += tx_packets;
188 		tot->rx_bytes   += rx_bytes;
189 		tot->tx_bytes   += tx_bytes;
190 	}
191 
192 	tot->multicast = dev->stats.multicast;
193 	tot->rx_crc_errors = dev->stats.rx_crc_errors;
194 	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
195 	tot->rx_length_errors = dev->stats.rx_length_errors;
196 	tot->rx_frame_errors = dev->stats.rx_frame_errors;
197 	tot->rx_errors = dev->stats.rx_errors;
198 
199 	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
200 	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
201 	tot->tx_dropped = dev->stats.tx_dropped;
202 	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
203 	tot->tx_errors = dev->stats.tx_errors;
204 
205 	return tot;
206 }
207 
208 /* Does the key in the tunnel parameters match the packet? */
209 static bool ipgre_key_match(const struct ip_tunnel_parm *p,
210 			    __be16 flags, __be32 key)
211 {
212 	if (p->i_flags & GRE_KEY) {
213 		if (flags & GRE_KEY)
214 			return key == p->i_key;
215 		else
216 			return false;	/* key expected, none present */
217 	} else
218 		return !(flags & GRE_KEY);
219 }
220 
221 /* Given src, dst and key, find the appropriate tunnel for input. */
222 
223 static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
224 					     __be32 remote, __be32 local,
225 					     __be16 flags, __be32 key,
226 					     __be16 gre_proto)
227 {
228 	struct net *net = dev_net(dev);
229 	int link = dev->ifindex;
230 	unsigned int h0 = HASH(remote);
231 	unsigned int h1 = HASH(key);
232 	struct ip_tunnel *t, *cand = NULL;
233 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
234 	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
235 		       ARPHRD_ETHER : ARPHRD_IPGRE;
236 	int score, cand_score = 4;
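
	/* Scoring used by all four loops below: bit 0 is set when the
	 * tunnel is bound to a different link, bit 1 when its device
	 * type differs from the wanted one (an ARPHRD_IPGRE tunnel may
	 * still carry an ETH_P_TEB lookup). A score of 0 is a perfect
	 * match and is returned immediately; otherwise the lowest-
	 * scoring candidate wins, with cand_score starting at 4, worse
	 * than any achievable score.
	 */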
237 
238 	for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
239 		if (local != t->parms.iph.saddr ||
240 		    remote != t->parms.iph.daddr ||
241 		    !(t->dev->flags & IFF_UP))
242 			continue;
243 
244 		if (!ipgre_key_match(&t->parms, flags, key))
245 			continue;
246 
247 		if (t->dev->type != ARPHRD_IPGRE &&
248 		    t->dev->type != dev_type)
249 			continue;
250 
251 		score = 0;
252 		if (t->parms.link != link)
253 			score |= 1;
254 		if (t->dev->type != dev_type)
255 			score |= 2;
256 		if (score == 0)
257 			return t;
258 
259 		if (score < cand_score) {
260 			cand = t;
261 			cand_score = score;
262 		}
263 	}
264 
265 	for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) {
266 		if (remote != t->parms.iph.daddr ||
267 		    !(t->dev->flags & IFF_UP))
268 			continue;
269 
270 		if (!ipgre_key_match(&t->parms, flags, key))
271 			continue;
272 
273 		if (t->dev->type != ARPHRD_IPGRE &&
274 		    t->dev->type != dev_type)
275 			continue;
276 
277 		score = 0;
278 		if (t->parms.link != link)
279 			score |= 1;
280 		if (t->dev->type != dev_type)
281 			score |= 2;
282 		if (score == 0)
283 			return t;
284 
285 		if (score < cand_score) {
286 			cand = t;
287 			cand_score = score;
288 		}
289 	}
290 
291 	for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) {
292 		if ((local != t->parms.iph.saddr &&
293 		     (local != t->parms.iph.daddr ||
294 		      !ipv4_is_multicast(local))) ||
295 		    !(t->dev->flags & IFF_UP))
296 			continue;
297 
298 		if (!ipgre_key_match(&t->parms, flags, key))
299 			continue;
300 
301 		if (t->dev->type != ARPHRD_IPGRE &&
302 		    t->dev->type != dev_type)
303 			continue;
304 
305 		score = 0;
306 		if (t->parms.link != link)
307 			score |= 1;
308 		if (t->dev->type != dev_type)
309 			score |= 2;
310 		if (score == 0)
311 			return t;
312 
313 		if (score < cand_score) {
314 			cand = t;
315 			cand_score = score;
316 		}
317 	}
318 
319 	for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) {
320 		if (t->parms.i_key != key ||
321 		    !(t->dev->flags & IFF_UP))
322 			continue;
323 
324 		if (t->dev->type != ARPHRD_IPGRE &&
325 		    t->dev->type != dev_type)
326 			continue;
327 
328 		score = 0;
329 		if (t->parms.link != link)
330 			score |= 1;
331 		if (t->dev->type != dev_type)
332 			score |= 2;
333 		if (score == 0)
334 			return t;
335 
336 		if (score < cand_score) {
337 			cand = t;
338 			cand_score = score;
339 		}
340 	}
341 
342 	if (cand != NULL)
343 		return cand;
344 
345 	dev = ign->fb_tunnel_dev;
346 	if (dev->flags & IFF_UP)
347 		return netdev_priv(dev);
348 
349 	return NULL;
350 }
351 
352 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
353 		struct ip_tunnel_parm *parms)
354 {
355 	__be32 remote = parms->iph.daddr;
356 	__be32 local = parms->iph.saddr;
357 	__be32 key = parms->i_key;
358 	unsigned int h = HASH(key);
359 	int prio = 0;
360 
361 	if (local)
362 		prio |= 1;
363 	if (remote && !ipv4_is_multicast(remote)) {
364 		prio |= 2;
365 		h ^= HASH(remote);
366 	}
367 
368 	return &ign->tunnels[prio][h];
369 }
370 
371 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
372 		struct ip_tunnel *t)
373 {
374 	return __ipgre_bucket(ign, &t->parms);
375 }
376 
377 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
378 {
379 	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
380 
381 	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
382 	rcu_assign_pointer(*tp, t);
383 }
384 
385 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
386 {
387 	struct ip_tunnel __rcu **tp;
388 	struct ip_tunnel *iter;
389 
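	/* Classic pointer-to-pointer unlink: *tp is either the bucket
	 * head or some predecessor's ->next, so the matching entry can
	 * be spliced out without tracking a prev pointer. Writers are
	 * serialized by RTNL; rcu_assign_pointer() keeps concurrent
	 * readers safe.
	 */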
390 	for (tp = ipgre_bucket(ign, t);
391 	     (iter = rtnl_dereference(*tp)) != NULL;
392 	     tp = &iter->next) {
393 		if (t == iter) {
394 			rcu_assign_pointer(*tp, t->next);
395 			break;
396 		}
397 	}
398 }
399 
400 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
401 					   struct ip_tunnel_parm *parms,
402 					   int type)
403 {
404 	__be32 remote = parms->iph.daddr;
405 	__be32 local = parms->iph.saddr;
406 	__be32 key = parms->i_key;
407 	int link = parms->link;
408 	struct ip_tunnel *t;
409 	struct ip_tunnel __rcu **tp;
410 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
411 
412 	for (tp = __ipgre_bucket(ign, parms);
413 	     (t = rtnl_dereference(*tp)) != NULL;
414 	     tp = &t->next)
415 		if (local == t->parms.iph.saddr &&
416 		    remote == t->parms.iph.daddr &&
417 		    key == t->parms.i_key &&
418 		    link == t->parms.link &&
419 		    type == t->dev->type)
420 			break;
421 
422 	return t;
423 }
424 
425 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
426 		struct ip_tunnel_parm *parms, int create)
427 {
428 	struct ip_tunnel *t, *nt;
429 	struct net_device *dev;
430 	char name[IFNAMSIZ];
431 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
432 
433 	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
434 	if (t || !create)
435 		return t;
436 
437 	if (parms->name[0])
438 		strlcpy(name, parms->name, IFNAMSIZ);
439 	else
440 		strcpy(name, "gre%d");
441 
442 	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
443 	if (!dev)
444 		return NULL;
445 
446 	dev_net_set(dev, net);
447 
448 	nt = netdev_priv(dev);
449 	nt->parms = *parms;
450 	dev->rtnl_link_ops = &ipgre_link_ops;
451 
452 	dev->mtu = ipgre_tunnel_bind_dev(dev);
453 
454 	if (register_netdevice(dev) < 0)
455 		goto failed_free;
456 
457 	/* Can use a lockless transmit, unless we generate output sequences */
458 	if (!(nt->parms.o_flags & GRE_SEQ))
459 		dev->features |= NETIF_F_LLTX;
460 
461 	dev_hold(dev);
462 	ipgre_tunnel_link(ign, nt);
463 	return nt;
464 
465 failed_free:
466 	free_netdev(dev);
467 	return NULL;
468 }
469 
470 static void ipgre_tunnel_uninit(struct net_device *dev)
471 {
472 	struct net *net = dev_net(dev);
473 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
474 
475 	ipgre_tunnel_unlink(ign, netdev_priv(dev));
476 	dev_put(dev);
477 }
478 
479 
480 static void ipgre_err(struct sk_buff *skb, u32 info)
481 {
482 
483 /* All the routers (except for Linux) return only
484    8 bytes of packet payload. It means that precise relaying of
485    ICMP in the real Internet is absolutely infeasible.
486 
487    Moreover, Cisco "wise men" put the GRE key in the third word of
488    the GRE header, which makes it impossible to maintain even soft state
489    for keyed GRE tunnels with checksums enabled. Tell them "thank you".
490 
491    Well, I wonder: rfc1812 was written by a Cisco employee, so
492    why the hell do these idiots break standards established
493    by themselves???
494  */
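
/* For reference while reading the parsing below, the GRE header as per
 * RFC 1701/2784 (32 bits per row, optional rows flagged):
 *
 *    |C|R|K|S|s|Recur|  Flags  | Ver |      Protocol Type      |
 *    |  Checksum (if C or R)   |      Offset (if C or R)       |
 *    |                     Key (if K)                          |
 *    |               Sequence Number (if S)                    |
 *
 * So grehlen starts at iph->ihl*4 plus 4 (flags + protocol) and grows
 * by 4 for the key, plus another 4 when a checksum precedes it; the
 * key, when present, is thus always the last word: p[grehlen/4 - 1].
 */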
495 
496 	const struct iphdr *iph = (const struct iphdr *)skb->data;
497 	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
498 	int grehlen = (iph->ihl<<2) + 4;
499 	const int type = icmp_hdr(skb)->type;
500 	const int code = icmp_hdr(skb)->code;
501 	struct ip_tunnel *t;
502 	__be16 flags;
503 	__be32 key = 0;
504 
505 	flags = p[0];
506 	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
507 		if (flags&(GRE_VERSION|GRE_ROUTING))
508 			return;
509 		if (flags&GRE_KEY) {
510 			grehlen += 4;
511 			if (flags&GRE_CSUM)
512 				grehlen += 4;
513 		}
514 	}
515 
516 	/* If only 8 bytes were returned, a keyed message will be dropped here */
517 	if (skb_headlen(skb) < grehlen)
518 		return;
519 
520 	if (flags & GRE_KEY)
521 		key = *(((__be32 *)p) + (grehlen / 4) - 1);
522 
523 	switch (type) {
524 	default:
525 	case ICMP_PARAMETERPROB:
526 		return;
527 
528 	case ICMP_DEST_UNREACH:
529 		switch (code) {
530 		case ICMP_SR_FAILED:
531 		case ICMP_PORT_UNREACH:
532 			/* Impossible event. */
533 			return;
534 		default:
535 			/* All others are translated to HOST_UNREACH.
536 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
537 			   I believe they are just ether pollution. --ANK
538 			 */
539 			break;
540 		}
541 		break;
542 	case ICMP_TIME_EXCEEDED:
543 		if (code != ICMP_EXC_TTL)
544 			return;
545 		break;
546 
547 	case ICMP_REDIRECT:
548 		break;
549 	}
550 
551 	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
552 				flags, key, p[1]);
553 
554 	if (t == NULL)
555 		return;
556 
557 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
558 		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
559 				 t->parms.link, 0, IPPROTO_GRE, 0);
560 		return;
561 	}
562 	if (type == ICMP_REDIRECT) {
563 		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
564 			      IPPROTO_GRE, 0);
565 		return;
566 	}
567 	if (t->parms.iph.daddr == 0 ||
568 	    ipv4_is_multicast(t->parms.iph.daddr))
569 		return;
570 
571 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
572 		return;
573 
574 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
575 		t->err_count++;
576 	else
577 		t->err_count = 1;
578 	t->err_time = jiffies;
579 }
580 
581 static inline u8
582 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
583 {
584 	u8 inner = 0;
585 	if (skb->protocol == htons(ETH_P_IP))
586 		inner = old_iph->tos;
587 	else if (skb->protocol == htons(ETH_P_IPV6))
588 		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
589 	return INET_ECN_encapsulate(tos, inner);
590 }
591 
592 static int ipgre_rcv(struct sk_buff *skb)
593 {
594 	const struct iphdr *iph;
595 	u8     *h;
596 	__be16    flags;
597 	__sum16   csum = 0;
598 	__be32 key = 0;
599 	u32    seqno = 0;
600 	struct ip_tunnel *tunnel;
601 	int    offset = 4;
602 	__be16 gre_proto;
603 	int    err;
604 
605 	if (!pskb_may_pull(skb, 16))
606 		goto drop;
607 
608 	iph = ip_hdr(skb);
609 	h = skb->data;
610 	flags = *(__be16 *)h;
611 
612 	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
613 		/* - Version must be 0.
614 		   - We do not support routing headers.
615 		 */
616 		if (flags&(GRE_VERSION|GRE_ROUTING))
617 			goto drop;
618 
619 		if (flags&GRE_CSUM) {
620 			switch (skb->ip_summed) {
621 			case CHECKSUM_COMPLETE:
622 				csum = csum_fold(skb->csum);
623 				if (!csum)
624 					break;
625 				/* fall through */
626 			case CHECKSUM_NONE:
627 				skb->csum = 0;
628 				csum = __skb_checksum_complete(skb);
629 				skb->ip_summed = CHECKSUM_COMPLETE;
630 			}
631 			offset += 4;
632 		}
633 		if (flags&GRE_KEY) {
634 			key = *(__be32 *)(h + offset);
635 			offset += 4;
636 		}
637 		if (flags&GRE_SEQ) {
638 			seqno = ntohl(*(__be32 *)(h + offset));
639 			offset += 4;
640 		}
641 	}
642 
643 	gre_proto = *(__be16 *)(h + 2);
644 
645 	tunnel = ipgre_tunnel_lookup(skb->dev,
646 				     iph->saddr, iph->daddr, flags, key,
647 				     gre_proto);
648 	if (tunnel) {
649 		struct pcpu_tstats *tstats;
650 
651 		secpath_reset(skb);
652 
653 		skb->protocol = gre_proto;
654 		/* WCCP version 1 and 2 protocol decoding.
655 		 * - Change protocol to IP
656 		 * - For WCCPv2, skip the extra 4-byte redirect header, detected
657 		 *   by its first nibble not being IP version 4. */
658 		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
659 			skb->protocol = htons(ETH_P_IP);
660 			if ((*(h + offset) & 0xF0) != 0x40)
661 				offset += 4;
662 		}
663 
664 		skb->mac_header = skb->network_header;
665 		__pskb_pull(skb, offset);
666 		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
667 		skb->pkt_type = PACKET_HOST;
668 #ifdef CONFIG_NET_IPGRE_BROADCAST
669 		if (ipv4_is_multicast(iph->daddr)) {
670 			/* Looped back packet, drop it! */
671 			if (rt_is_output_route(skb_rtable(skb)))
672 				goto drop;
673 			tunnel->dev->stats.multicast++;
674 			skb->pkt_type = PACKET_BROADCAST;
675 		}
676 #endif
677 
678 		if (((flags&GRE_CSUM) && csum) ||
679 		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
680 			tunnel->dev->stats.rx_crc_errors++;
681 			tunnel->dev->stats.rx_errors++;
682 			goto drop;
683 		}
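		/* Serial-number arithmetic: casting the difference to
		 * s32 treats the 32-bit sequence space as circular, so
		 * "seqno behind i_seqno" is detected correctly even
		 * across a wraparound.
		 */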
684 		if (tunnel->parms.i_flags&GRE_SEQ) {
685 			if (!(flags&GRE_SEQ) ||
686 			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
687 				tunnel->dev->stats.rx_fifo_errors++;
688 				tunnel->dev->stats.rx_errors++;
689 				goto drop;
690 			}
691 			tunnel->i_seqno = seqno + 1;
692 		}
693 
694 		/* Warning: All skb pointers will be invalidated! */
695 		if (tunnel->dev->type == ARPHRD_ETHER) {
696 			if (!pskb_may_pull(skb, ETH_HLEN)) {
697 				tunnel->dev->stats.rx_length_errors++;
698 				tunnel->dev->stats.rx_errors++;
699 				goto drop;
700 			}
701 
702 			iph = ip_hdr(skb);
703 			skb->protocol = eth_type_trans(skb, tunnel->dev);
704 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
705 		}
706 
707 		__skb_tunnel_rx(skb, tunnel->dev);
708 
709 		skb_reset_network_header(skb);
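		/* Propagate ECN from the outer header (RFC 6040 style):
		 * a positive return is worth logging, and a value > 1
		 * (outer CE on a non-ECT inner packet) forces a drop.
		 */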
710 		err = IP_ECN_decapsulate(iph, skb);
711 		if (unlikely(err)) {
712 			if (log_ecn_error)
713 				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
714 						     &iph->saddr, iph->tos);
715 			if (err > 1) {
716 				++tunnel->dev->stats.rx_frame_errors;
717 				++tunnel->dev->stats.rx_errors;
718 				goto drop;
719 			}
720 		}
721 
722 		tstats = this_cpu_ptr(tunnel->dev->tstats);
723 		u64_stats_update_begin(&tstats->syncp);
724 		tstats->rx_packets++;
725 		tstats->rx_bytes += skb->len;
726 		u64_stats_update_end(&tstats->syncp);
727 
728 		gro_cells_receive(&tunnel->gro_cells, skb);
729 		return 0;
730 	}
731 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
732 
733 drop:
734 	kfree_skb(skb);
735 	return 0;
736 }
737 
738 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
739 {
740 	struct ip_tunnel *tunnel = netdev_priv(dev);
741 	const struct iphdr  *old_iph = ip_hdr(skb);
742 	const struct iphdr  *tiph;
743 	struct flowi4 fl4;
744 	u8     tos;
745 	__be16 df;
746 	struct rtable *rt;     			/* Route to the other host */
747 	struct net_device *tdev;		/* Device to other host */
748 	struct iphdr  *iph;			/* Our new IP header */
749 	unsigned int max_headroom;		/* The extra header space needed */
750 	int    gre_hlen;
751 	__be32 dst;
752 	int    mtu;
753 
754 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
755 	    skb_checksum_help(skb))
756 		goto tx_error;
757 
758 	if (dev->type == ARPHRD_ETHER)
759 		IPCB(skb)->flags = 0;
760 
761 	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
762 		gre_hlen = 0;
763 		tiph = (const struct iphdr *)skb->data;
764 	} else {
765 		gre_hlen = tunnel->hlen;
766 		tiph = &tunnel->parms.iph;
767 	}
768 
769 	if ((dst = tiph->daddr) == 0) {
770 		/* NBMA tunnel */
771 
772 		if (skb_dst(skb) == NULL) {
773 			dev->stats.tx_fifo_errors++;
774 			goto tx_error;
775 		}
776 
777 		if (skb->protocol == htons(ETH_P_IP)) {
778 			rt = skb_rtable(skb);
779 			dst = rt_nexthop(rt, old_iph->daddr);
780 		}
781 #if IS_ENABLED(CONFIG_IPV6)
782 		else if (skb->protocol == htons(ETH_P_IPV6)) {
783 			const struct in6_addr *addr6;
784 			struct neighbour *neigh;
785 			bool do_tx_error_icmp;
786 			int addr_type;
787 
788 			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
789 			if (neigh == NULL)
790 				goto tx_error;
791 
792 			addr6 = (const struct in6_addr *)&neigh->primary_key;
793 			addr_type = ipv6_addr_type(addr6);
794 
795 			if (addr_type == IPV6_ADDR_ANY) {
796 				addr6 = &ipv6_hdr(skb)->daddr;
797 				addr_type = ipv6_addr_type(addr6);
798 			}
799 
800 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
801 				do_tx_error_icmp = true;
802 			else {
803 				do_tx_error_icmp = false;
804 				dst = addr6->s6_addr32[3];
805 			}
806 			neigh_release(neigh);
807 			if (do_tx_error_icmp)
808 				goto tx_error_icmp;
809 		}
810 #endif
811 		else
812 			goto tx_error;
813 	}
814 
815 	tos = tiph->tos;
816 	if (tos == 1) {
817 		tos = 0;
818 		if (skb->protocol == htons(ETH_P_IP))
819 			tos = old_iph->tos;
820 		else if (skb->protocol == htons(ETH_P_IPV6))
821 			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
822 	}
823 
824 	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
825 				 tunnel->parms.o_key, RT_TOS(tos),
826 				 tunnel->parms.link);
827 	if (IS_ERR(rt)) {
828 		dev->stats.tx_carrier_errors++;
829 		goto tx_error;
830 	}
831 	tdev = rt->dst.dev;
832 
833 	if (tdev == dev) {
834 		ip_rt_put(rt);
835 		dev->stats.collisions++;
836 		goto tx_error;
837 	}
838 
839 	df = tiph->frag_off;
840 	if (df)
841 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
842 	else
843 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
844 
845 	if (skb_dst(skb))
846 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
847 
848 	if (skb->protocol == htons(ETH_P_IP)) {
849 		df |= (old_iph->frag_off&htons(IP_DF));
850 
851 		if ((old_iph->frag_off&htons(IP_DF)) &&
852 		    mtu < ntohs(old_iph->tot_len)) {
853 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
854 			ip_rt_put(rt);
855 			goto tx_error;
856 		}
857 	}
858 #if IS_ENABLED(CONFIG_IPV6)
859 	else if (skb->protocol == htons(ETH_P_IPV6)) {
860 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
861 
862 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
863 			if ((tunnel->parms.iph.daddr &&
864 			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
865 			    rt6->rt6i_dst.plen == 128) {
866 				rt6->rt6i_flags |= RTF_MODIFIED;
867 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
868 			}
869 		}
870 
871 		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
872 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
873 			ip_rt_put(rt);
874 			goto tx_error;
875 		}
876 	}
877 #endif
878 
879 	if (tunnel->err_count > 0) {
880 		if (time_before(jiffies,
881 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
882 			tunnel->err_count--;
883 
884 			dst_link_failure(skb);
885 		} else
886 			tunnel->err_count = 0;
887 	}
888 
889 	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
890 
891 	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
892 	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
893 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
894 		if (max_headroom > dev->needed_headroom)
895 			dev->needed_headroom = max_headroom;
896 		if (!new_skb) {
897 			ip_rt_put(rt);
898 			dev->stats.tx_dropped++;
899 			dev_kfree_skb(skb);
900 			return NETDEV_TX_OK;
901 		}
902 		if (skb->sk)
903 			skb_set_owner_w(new_skb, skb->sk);
904 		dev_kfree_skb(skb);
905 		skb = new_skb;
906 		old_iph = ip_hdr(skb);
907 	}
908 
909 	skb_reset_transport_header(skb);
910 	skb_push(skb, gre_hlen);
911 	skb_reset_network_header(skb);
912 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
913 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
914 			      IPSKB_REROUTED);
915 	skb_dst_drop(skb);
916 	skb_dst_set(skb, &rt->dst);
917 
918 	/*
919 	 *	Push down and install the new outer IP header.
920 	 */
921 
922 	iph 			=	ip_hdr(skb);
923 	iph->version		=	4;
924 	iph->ihl		=	sizeof(struct iphdr) >> 2;
925 	iph->frag_off		=	df;
926 	iph->protocol		=	IPPROTO_GRE;
927 	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
928 	iph->daddr		=	fl4.daddr;
929 	iph->saddr		=	fl4.saddr;
930 
931 	if ((iph->ttl = tiph->ttl) == 0) {
932 		if (skb->protocol == htons(ETH_P_IP))
933 			iph->ttl = old_iph->ttl;
934 #if IS_ENABLED(CONFIG_IPV6)
935 		else if (skb->protocol == htons(ETH_P_IPV6))
936 			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
937 #endif
938 		else
939 			iph->ttl = ip4_dst_hoplimit(&rt->dst);
940 	}
941 
942 	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
943 	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
944 				   htons(ETH_P_TEB) : skb->protocol;
945 
946 	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
947 		__be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
948 
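		/* The optional words are filled back to front: ptr
		 * starts at the last 32-bit word of the GRE header and
		 * steps backwards, yielding the on-wire order mandated
		 * by RFC 1701: checksum first, then key, then sequence
		 * number last.
		 */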
949 		if (tunnel->parms.o_flags&GRE_SEQ) {
950 			++tunnel->o_seqno;
951 			*ptr = htonl(tunnel->o_seqno);
952 			ptr--;
953 		}
954 		if (tunnel->parms.o_flags&GRE_KEY) {
955 			*ptr = tunnel->parms.o_key;
956 			ptr--;
957 		}
958 		if (tunnel->parms.o_flags&GRE_CSUM) {
959 			*ptr = 0;
960 			*(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
961 		}
962 	}
963 
964 	iptunnel_xmit(skb, dev);
965 	return NETDEV_TX_OK;
966 
967 #if IS_ENABLED(CONFIG_IPV6)
968 tx_error_icmp:
969 	dst_link_failure(skb);
970 #endif
971 tx_error:
972 	dev->stats.tx_errors++;
973 	dev_kfree_skb(skb);
974 	return NETDEV_TX_OK;
975 }
976 
977 static int ipgre_tunnel_bind_dev(struct net_device *dev)
978 {
979 	struct net_device *tdev = NULL;
980 	struct ip_tunnel *tunnel;
981 	const struct iphdr *iph;
982 	int hlen = LL_MAX_HEADER;
983 	int mtu = ETH_DATA_LEN;
984 	int addend = sizeof(struct iphdr) + 4;
985 
986 	tunnel = netdev_priv(dev);
987 	iph = &tunnel->parms.iph;
988 
989 	/* Guess output device to choose reasonable mtu and needed_headroom */
990 
991 	if (iph->daddr) {
992 		struct flowi4 fl4;
993 		struct rtable *rt;
994 
995 		rt = ip_route_output_gre(dev_net(dev), &fl4,
996 					 iph->daddr, iph->saddr,
997 					 tunnel->parms.o_key,
998 					 RT_TOS(iph->tos),
999 					 tunnel->parms.link);
1000 		if (!IS_ERR(rt)) {
1001 			tdev = rt->dst.dev;
1002 			ip_rt_put(rt);
1003 		}
1004 
1005 		if (dev->type != ARPHRD_ETHER)
1006 			dev->flags |= IFF_POINTOPOINT;
1007 	}
1008 
1009 	if (!tdev && tunnel->parms.link)
1010 		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
1011 
1012 	if (tdev) {
1013 		hlen = tdev->hard_header_len + tdev->needed_headroom;
1014 		mtu = tdev->mtu;
1015 	}
1016 	dev->iflink = tunnel->parms.link;
1017 
1018 	/* Precalculate GRE options length */
1019 	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
1020 		if (tunnel->parms.o_flags&GRE_CSUM)
1021 			addend += 4;
1022 		if (tunnel->parms.o_flags&GRE_KEY)
1023 			addend += 4;
1024 		if (tunnel->parms.o_flags&GRE_SEQ)
1025 			addend += 4;
1026 	}
1027 	dev->needed_headroom = addend + hlen;
1028 	mtu -= dev->hard_header_len + addend;
1029 
1030 	if (mtu < 68)
1031 		mtu = 68;
1032 
1033 	tunnel->hlen = addend;
1034 
1035 	return mtu;
1036 }
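
/* Worked example of the arithmetic above (illustrative numbers): with
 * GRE_CSUM, GRE_KEY and GRE_SEQ all set and a plain 1500-byte-MTU
 * Ethernet device underneath,
 *
 *	addend = sizeof(struct iphdr) + 4 + 4 + 4 + 4 = 36
 *	mtu    = 1500 - dev->hard_header_len - 36
 *
 * i.e. 1464 for an ARPHRD_IPGRE device (hard_header_len == 0); the
 * clamp to 68 only matters for pathologically small lower MTUs.
 */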
1037 
1038 static int
1039 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1040 {
1041 	int err = 0;
1042 	struct ip_tunnel_parm p;
1043 	struct ip_tunnel *t;
1044 	struct net *net = dev_net(dev);
1045 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1046 
1047 	switch (cmd) {
1048 	case SIOCGETTUNNEL:
1049 		t = NULL;
1050 		if (dev == ign->fb_tunnel_dev) {
1051 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1052 				err = -EFAULT;
1053 				break;
1054 			}
1055 			t = ipgre_tunnel_locate(net, &p, 0);
1056 		}
1057 		if (t == NULL)
1058 			t = netdev_priv(dev);
1059 		memcpy(&p, &t->parms, sizeof(p));
1060 		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1061 			err = -EFAULT;
1062 		break;
1063 
1064 	case SIOCADDTUNNEL:
1065 	case SIOCCHGTUNNEL:
1066 		err = -EPERM;
1067 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1068 			goto done;
1069 
1070 		err = -EFAULT;
1071 		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1072 			goto done;
1073 
1074 		err = -EINVAL;
1075 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1076 		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1077 		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1078 			goto done;
1079 		if (p.iph.ttl)
1080 			p.iph.frag_off |= htons(IP_DF);
1081 
1082 		if (!(p.i_flags&GRE_KEY))
1083 			p.i_key = 0;
1084 		if (!(p.o_flags&GRE_KEY))
1085 			p.o_key = 0;
1086 
1087 		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1088 
1089 		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1090 			if (t != NULL) {
1091 				if (t->dev != dev) {
1092 					err = -EEXIST;
1093 					break;
1094 				}
1095 			} else {
1096 				unsigned int nflags = 0;
1097 
1098 				t = netdev_priv(dev);
1099 
1100 				if (ipv4_is_multicast(p.iph.daddr))
1101 					nflags = IFF_BROADCAST;
1102 				else if (p.iph.daddr)
1103 					nflags = IFF_POINTOPOINT;
1104 
1105 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1106 					err = -EINVAL;
1107 					break;
1108 				}
1109 				ipgre_tunnel_unlink(ign, t);
1110 				synchronize_net();
1111 				t->parms.iph.saddr = p.iph.saddr;
1112 				t->parms.iph.daddr = p.iph.daddr;
1113 				t->parms.i_key = p.i_key;
1114 				t->parms.o_key = p.o_key;
1115 				memcpy(dev->dev_addr, &p.iph.saddr, 4);
1116 				memcpy(dev->broadcast, &p.iph.daddr, 4);
1117 				ipgre_tunnel_link(ign, t);
1118 				netdev_state_change(dev);
1119 			}
1120 		}
1121 
1122 		if (t) {
1123 			err = 0;
1124 			if (cmd == SIOCCHGTUNNEL) {
1125 				t->parms.iph.ttl = p.iph.ttl;
1126 				t->parms.iph.tos = p.iph.tos;
1127 				t->parms.iph.frag_off = p.iph.frag_off;
1128 				if (t->parms.link != p.link) {
1129 					t->parms.link = p.link;
1130 					dev->mtu = ipgre_tunnel_bind_dev(dev);
1131 					netdev_state_change(dev);
1132 				}
1133 			}
1134 			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1135 				err = -EFAULT;
1136 		} else
1137 			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1138 		break;
1139 
1140 	case SIOCDELTUNNEL:
1141 		err = -EPERM;
1142 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1143 			goto done;
1144 
1145 		if (dev == ign->fb_tunnel_dev) {
1146 			err = -EFAULT;
1147 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1148 				goto done;
1149 			err = -ENOENT;
1150 			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1151 				goto done;
1152 			err = -EPERM;
1153 			if (t == netdev_priv(ign->fb_tunnel_dev))
1154 				goto done;
1155 			dev = t->dev;
1156 		}
1157 		unregister_netdevice(dev);
1158 		err = 0;
1159 		break;
1160 
1161 	default:
1162 		err = -EINVAL;
1163 	}
1164 
1165 done:
1166 	return err;
1167 }
1168 
1169 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1170 {
1171 	struct ip_tunnel *tunnel = netdev_priv(dev);
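	/* 68 is the RFC 791 minimum IP MTU; 0xFFF8 is the maximum
	 * 16-bit IP total length rounded down to a multiple of 8, and
	 * the outer IP + GRE headers must still fit below it.
	 */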
1172 	if (new_mtu < 68 ||
1173 	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1174 		return -EINVAL;
1175 	dev->mtu = new_mtu;
1176 	return 0;
1177 }
1178 
1179 /* Nice toy. Unfortunately, useless in real life :-)
1180    It allows one to construct a virtual multiprotocol broadcast "LAN"
1181    over the Internet, provided multicast routing is tuned.
1182 
1183 
1184    I have no idea whether this bicycle was invented before me,
1185    so I had to set ARPHRD_IPGRE to a random value.
1186    I have the impression that Cisco could do something similar,
1187    but this feature is apparently missing in IOS<=11.2(8).
1188 
1189    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1190    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1191 
1192    ping -t 255 224.66.66.66
1193 
1194    If nobody answers, mbone does not work.
1195 
1196    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1197    ip addr add 10.66.66.<somewhat>/24 dev Universe
1198    ifconfig Universe up
1199    ifconfig Universe add fe80::<Your_real_addr>/10
1200    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1201    ftp 10.66.66.66
1202    ...
1203    ftp fec0:6666:6666::193.233.7.65
1204    ...
1205 
1206  */
1207 
1208 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1209 			unsigned short type,
1210 			const void *daddr, const void *saddr, unsigned int len)
1211 {
1212 	struct ip_tunnel *t = netdev_priv(dev);
1213 	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1214 	__be16 *p = (__be16 *)(iph+1);
1215 
1216 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1217 	p[0]		= t->parms.o_flags;
1218 	p[1]		= htons(type);
1219 
1220 	/*
1221 	 *	Set the source hardware address.
1222 	 */
1223 
1224 	if (saddr)
1225 		memcpy(&iph->saddr, saddr, 4);
1226 	if (daddr)
1227 		memcpy(&iph->daddr, daddr, 4);
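
	/* header_ops->create convention (cf. eth_header()): a positive
	 * return means the header is complete, while a negative return
	 * says |hlen| bytes were pushed but the destination is still
	 * unresolved, as in the NBMA case here when no daddr is known.
	 */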
1228 	if (iph->daddr)
1229 		return t->hlen;
1230 
1231 	return -t->hlen;
1232 }
1233 
1234 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1235 {
1236 	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1237 	memcpy(haddr, &iph->saddr, 4);
1238 	return 4;
1239 }
1240 
1241 static const struct header_ops ipgre_header_ops = {
1242 	.create	= ipgre_header,
1243 	.parse	= ipgre_header_parse,
1244 };
1245 
1246 #ifdef CONFIG_NET_IPGRE_BROADCAST
1247 static int ipgre_open(struct net_device *dev)
1248 {
1249 	struct ip_tunnel *t = netdev_priv(dev);
1250 
1251 	if (ipv4_is_multicast(t->parms.iph.daddr)) {
1252 		struct flowi4 fl4;
1253 		struct rtable *rt;
1254 
1255 		rt = ip_route_output_gre(dev_net(dev), &fl4,
1256 					 t->parms.iph.daddr,
1257 					 t->parms.iph.saddr,
1258 					 t->parms.o_key,
1259 					 RT_TOS(t->parms.iph.tos),
1260 					 t->parms.link);
1261 		if (IS_ERR(rt))
1262 			return -EADDRNOTAVAIL;
1263 		dev = rt->dst.dev;
1264 		ip_rt_put(rt);
1265 		if (__in_dev_get_rtnl(dev) == NULL)
1266 			return -EADDRNOTAVAIL;
1267 		t->mlink = dev->ifindex;
1268 		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1269 	}
1270 	return 0;
1271 }
1272 
1273 static int ipgre_close(struct net_device *dev)
1274 {
1275 	struct ip_tunnel *t = netdev_priv(dev);
1276 
1277 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1278 		struct in_device *in_dev;
1279 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1280 		if (in_dev)
1281 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1282 	}
1283 	return 0;
1284 }
1285 
1286 #endif
1287 
1288 static const struct net_device_ops ipgre_netdev_ops = {
1289 	.ndo_init		= ipgre_tunnel_init,
1290 	.ndo_uninit		= ipgre_tunnel_uninit,
1291 #ifdef CONFIG_NET_IPGRE_BROADCAST
1292 	.ndo_open		= ipgre_open,
1293 	.ndo_stop		= ipgre_close,
1294 #endif
1295 	.ndo_start_xmit		= ipgre_tunnel_xmit,
1296 	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
1297 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
1298 	.ndo_get_stats64	= ipgre_get_stats64,
1299 };
1300 
1301 static void ipgre_dev_free(struct net_device *dev)
1302 {
1303 	struct ip_tunnel *tunnel = netdev_priv(dev);
1304 
1305 	gro_cells_destroy(&tunnel->gro_cells);
1306 	free_percpu(dev->tstats);
1307 	free_netdev(dev);
1308 }
1309 
1310 #define GRE_FEATURES (NETIF_F_SG |		\
1311 		      NETIF_F_FRAGLIST |	\
1312 		      NETIF_F_HIGHDMA |		\
1313 		      NETIF_F_HW_CSUM)
1314 
1315 static void ipgre_tunnel_setup(struct net_device *dev)
1316 {
1317 	dev->netdev_ops		= &ipgre_netdev_ops;
1318 	dev->destructor 	= ipgre_dev_free;
1319 
1320 	dev->type		= ARPHRD_IPGRE;
1321 	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1322 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1323 	dev->flags		= IFF_NOARP;
1324 	dev->iflink		= 0;
1325 	dev->addr_len		= 4;
1326 	dev->features		|= NETIF_F_NETNS_LOCAL;
1327 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
1328 
1329 	dev->features		|= GRE_FEATURES;
1330 	dev->hw_features	|= GRE_FEATURES;
1331 }
1332 
1333 static int ipgre_tunnel_init(struct net_device *dev)
1334 {
1335 	struct ip_tunnel *tunnel;
1336 	struct iphdr *iph;
1337 	int err;
1338 
1339 	tunnel = netdev_priv(dev);
1340 	iph = &tunnel->parms.iph;
1341 
1342 	tunnel->dev = dev;
1343 	strcpy(tunnel->parms.name, dev->name);
1344 
1345 	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1346 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1347 
1348 	if (iph->daddr) {
1349 #ifdef CONFIG_NET_IPGRE_BROADCAST
1350 		if (ipv4_is_multicast(iph->daddr)) {
1351 			if (!iph->saddr)
1352 				return -EINVAL;
1353 			dev->flags = IFF_BROADCAST;
1354 			dev->header_ops = &ipgre_header_ops;
1355 		}
1356 #endif
1357 	} else
1358 		dev->header_ops = &ipgre_header_ops;
1359 
1360 	dev->tstats = alloc_percpu(struct pcpu_tstats);
1361 	if (!dev->tstats)
1362 		return -ENOMEM;
1363 
1364 	err = gro_cells_init(&tunnel->gro_cells, dev);
1365 	if (err) {
1366 		free_percpu(dev->tstats);
1367 		return err;
1368 	}
1369 
1370 	return 0;
1371 }
1372 
1373 static void ipgre_fb_tunnel_init(struct net_device *dev)
1374 {
1375 	struct ip_tunnel *tunnel = netdev_priv(dev);
1376 	struct iphdr *iph = &tunnel->parms.iph;
1377 
1378 	tunnel->dev = dev;
1379 	strcpy(tunnel->parms.name, dev->name);
1380 
1381 	iph->version		= 4;
1382 	iph->protocol		= IPPROTO_GRE;
1383 	iph->ihl		= 5;
1384 	tunnel->hlen		= sizeof(struct iphdr) + 4;
1385 
1386 	dev_hold(dev);
1387 }
1388 
1389 
1390 static const struct gre_protocol ipgre_protocol = {
1391 	.handler     = ipgre_rcv,
1392 	.err_handler = ipgre_err,
1393 };
1394 
1395 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1396 {
1397 	int prio;
1398 
1399 	for (prio = 0; prio < 4; prio++) {
1400 		int h;
1401 		for (h = 0; h < HASH_SIZE; h++) {
1402 			struct ip_tunnel *t;
1403 
1404 			t = rtnl_dereference(ign->tunnels[prio][h]);
1405 
1406 			while (t != NULL) {
1407 				unregister_netdevice_queue(t->dev, head);
1408 				t = rtnl_dereference(t->next);
1409 			}
1410 		}
1411 	}
1412 }
1413 
1414 static int __net_init ipgre_init_net(struct net *net)
1415 {
1416 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1417 	int err;
1418 
1419 	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1420 					   ipgre_tunnel_setup);
1421 	if (!ign->fb_tunnel_dev) {
1422 		err = -ENOMEM;
1423 		goto err_alloc_dev;
1424 	}
1425 	dev_net_set(ign->fb_tunnel_dev, net);
1426 
1427 	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1428 	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1429 
1430 	if ((err = register_netdev(ign->fb_tunnel_dev)))
1431 		goto err_reg_dev;
1432 
1433 	rcu_assign_pointer(ign->tunnels_wc[0],
1434 			   netdev_priv(ign->fb_tunnel_dev));
1435 	return 0;
1436 
1437 err_reg_dev:
1438 	ipgre_dev_free(ign->fb_tunnel_dev);
1439 err_alloc_dev:
1440 	return err;
1441 }
1442 
1443 static void __net_exit ipgre_exit_net(struct net *net)
1444 {
1445 	struct ipgre_net *ign;
1446 	LIST_HEAD(list);
1447 
1448 	ign = net_generic(net, ipgre_net_id);
1449 	rtnl_lock();
1450 	ipgre_destroy_tunnels(ign, &list);
1451 	unregister_netdevice_many(&list);
1452 	rtnl_unlock();
1453 }
1454 
1455 static struct pernet_operations ipgre_net_ops = {
1456 	.init = ipgre_init_net,
1457 	.exit = ipgre_exit_net,
1458 	.id   = &ipgre_net_id,
1459 	.size = sizeof(struct ipgre_net),
1460 };
1461 
1462 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1463 {
1464 	__be16 flags;
1465 
1466 	if (!data)
1467 		return 0;
1468 
1469 	flags = 0;
1470 	if (data[IFLA_GRE_IFLAGS])
1471 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1472 	if (data[IFLA_GRE_OFLAGS])
1473 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1474 	if (flags & (GRE_VERSION|GRE_ROUTING))
1475 		return -EINVAL;
1476 
1477 	return 0;
1478 }
1479 
1480 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1481 {
1482 	__be32 daddr;
1483 
1484 	if (tb[IFLA_ADDRESS]) {
1485 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1486 			return -EINVAL;
1487 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1488 			return -EADDRNOTAVAIL;
1489 	}
1490 
1491 	if (!data)
1492 		goto out;
1493 
1494 	if (data[IFLA_GRE_REMOTE]) {
1495 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1496 		if (!daddr)
1497 			return -EINVAL;
1498 	}
1499 
1500 out:
1501 	return ipgre_tunnel_validate(tb, data);
1502 }
1503 
1504 static void ipgre_netlink_parms(struct nlattr *data[],
1505 				struct ip_tunnel_parm *parms)
1506 {
1507 	memset(parms, 0, sizeof(*parms));
1508 
1509 	parms->iph.protocol = IPPROTO_GRE;
1510 
1511 	if (!data)
1512 		return;
1513 
1514 	if (data[IFLA_GRE_LINK])
1515 		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1516 
1517 	if (data[IFLA_GRE_IFLAGS])
1518 		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1519 
1520 	if (data[IFLA_GRE_OFLAGS])
1521 		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1522 
1523 	if (data[IFLA_GRE_IKEY])
1524 		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1525 
1526 	if (data[IFLA_GRE_OKEY])
1527 		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1528 
1529 	if (data[IFLA_GRE_LOCAL])
1530 		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1531 
1532 	if (data[IFLA_GRE_REMOTE])
1533 		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1534 
1535 	if (data[IFLA_GRE_TTL])
1536 		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1537 
1538 	if (data[IFLA_GRE_TOS])
1539 		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1540 
1541 	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1542 		parms->iph.frag_off = htons(IP_DF);
1543 }
1544 
1545 static int ipgre_tap_init(struct net_device *dev)
1546 {
1547 	struct ip_tunnel *tunnel;
1548 
1549 	tunnel = netdev_priv(dev);
1550 
1551 	tunnel->dev = dev;
1552 	strcpy(tunnel->parms.name, dev->name);
1553 
1554 	ipgre_tunnel_bind_dev(dev);
1555 
1556 	dev->tstats = alloc_percpu(struct pcpu_tstats);
1557 	if (!dev->tstats)
1558 		return -ENOMEM;
1559 
1560 	return 0;
1561 }
1562 
1563 static const struct net_device_ops ipgre_tap_netdev_ops = {
1564 	.ndo_init		= ipgre_tap_init,
1565 	.ndo_uninit		= ipgre_tunnel_uninit,
1566 	.ndo_start_xmit		= ipgre_tunnel_xmit,
1567 	.ndo_set_mac_address 	= eth_mac_addr,
1568 	.ndo_validate_addr	= eth_validate_addr,
1569 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
1570 	.ndo_get_stats64	= ipgre_get_stats64,
1571 };
1572 
1573 static void ipgre_tap_setup(struct net_device *dev)
1574 {
1575 
1576 	ether_setup(dev);
1577 
1578 	dev->netdev_ops		= &ipgre_tap_netdev_ops;
1579 	dev->destructor 	= ipgre_dev_free;
1580 
1581 	dev->iflink		= 0;
1582 	dev->features		|= NETIF_F_NETNS_LOCAL;
1583 }
1584 
1585 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1586 			 struct nlattr *data[])
1587 {
1588 	struct ip_tunnel *nt;
1589 	struct net *net = dev_net(dev);
1590 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1591 	int mtu;
1592 	int err;
1593 
1594 	nt = netdev_priv(dev);
1595 	ipgre_netlink_parms(data, &nt->parms);
1596 
1597 	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1598 		return -EEXIST;
1599 
1600 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1601 		eth_hw_addr_random(dev);
1602 
1603 	mtu = ipgre_tunnel_bind_dev(dev);
1604 	if (!tb[IFLA_MTU])
1605 		dev->mtu = mtu;
1606 
1607 	/* Can use a lockless transmit, unless we generate output sequences */
1608 	if (!(nt->parms.o_flags & GRE_SEQ))
1609 		dev->features |= NETIF_F_LLTX;
1610 
1611 	err = register_netdevice(dev);
1612 	if (err)
1613 		goto out;
1614 
1615 	dev_hold(dev);
1616 	ipgre_tunnel_link(ign, nt);
1617 
1618 out:
1619 	return err;
1620 }
1621 
1622 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1623 			    struct nlattr *data[])
1624 {
1625 	struct ip_tunnel *t, *nt;
1626 	struct net *net = dev_net(dev);
1627 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1628 	struct ip_tunnel_parm p;
1629 	int mtu;
1630 
1631 	if (dev == ign->fb_tunnel_dev)
1632 		return -EINVAL;
1633 
1634 	nt = netdev_priv(dev);
1635 	ipgre_netlink_parms(data, &p);
1636 
1637 	t = ipgre_tunnel_locate(net, &p, 0);
1638 
1639 	if (t) {
1640 		if (t->dev != dev)
1641 			return -EEXIST;
1642 	} else {
1643 		t = nt;
1644 
1645 		if (dev->type != ARPHRD_ETHER) {
1646 			unsigned int nflags = 0;
1647 
1648 			if (ipv4_is_multicast(p.iph.daddr))
1649 				nflags = IFF_BROADCAST;
1650 			else if (p.iph.daddr)
1651 				nflags = IFF_POINTOPOINT;
1652 
1653 			if ((dev->flags ^ nflags) &
1654 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1655 				return -EINVAL;
1656 		}
1657 
1658 		ipgre_tunnel_unlink(ign, t);
1659 		t->parms.iph.saddr = p.iph.saddr;
1660 		t->parms.iph.daddr = p.iph.daddr;
1661 		t->parms.i_key = p.i_key;
1662 		if (dev->type != ARPHRD_ETHER) {
1663 			memcpy(dev->dev_addr, &p.iph.saddr, 4);
1664 			memcpy(dev->broadcast, &p.iph.daddr, 4);
1665 		}
1666 		ipgre_tunnel_link(ign, t);
1667 		netdev_state_change(dev);
1668 	}
1669 
1670 	t->parms.o_key = p.o_key;
1671 	t->parms.iph.ttl = p.iph.ttl;
1672 	t->parms.iph.tos = p.iph.tos;
1673 	t->parms.iph.frag_off = p.iph.frag_off;
1674 
1675 	if (t->parms.link != p.link) {
1676 		t->parms.link = p.link;
1677 		mtu = ipgre_tunnel_bind_dev(dev);
1678 		if (!tb[IFLA_MTU])
1679 			dev->mtu = mtu;
1680 		netdev_state_change(dev);
1681 	}
1682 
1683 	return 0;
1684 }
1685 
1686 static size_t ipgre_get_size(const struct net_device *dev)
1687 {
1688 	return
1689 		/* IFLA_GRE_LINK */
1690 		nla_total_size(4) +
1691 		/* IFLA_GRE_IFLAGS */
1692 		nla_total_size(2) +
1693 		/* IFLA_GRE_OFLAGS */
1694 		nla_total_size(2) +
1695 		/* IFLA_GRE_IKEY */
1696 		nla_total_size(4) +
1697 		/* IFLA_GRE_OKEY */
1698 		nla_total_size(4) +
1699 		/* IFLA_GRE_LOCAL */
1700 		nla_total_size(4) +
1701 		/* IFLA_GRE_REMOTE */
1702 		nla_total_size(4) +
1703 		/* IFLA_GRE_TTL */
1704 		nla_total_size(1) +
1705 		/* IFLA_GRE_TOS */
1706 		nla_total_size(1) +
1707 		/* IFLA_GRE_PMTUDISC */
1708 		nla_total_size(1) +
1709 		0;
1710 }
1711 
1712 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1713 {
1714 	struct ip_tunnel *t = netdev_priv(dev);
1715 	struct ip_tunnel_parm *p = &t->parms;
1716 
1717 	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1718 	    nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
1719 	    nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
1720 	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1721 	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1722 	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1723 	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1724 	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1725 	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1726 	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1727 		       !!(p->iph.frag_off & htons(IP_DF))))
1728 		goto nla_put_failure;
1729 	return 0;
1730 
1731 nla_put_failure:
1732 	return -EMSGSIZE;
1733 }
1734 
1735 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1736 	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
1737 	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
1738 	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
1739 	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
1740 	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
1741 	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1742 	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1743 	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
1744 	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
1745 	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
1746 };
1747 
1748 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1749 	.kind		= "gre",
1750 	.maxtype	= IFLA_GRE_MAX,
1751 	.policy		= ipgre_policy,
1752 	.priv_size	= sizeof(struct ip_tunnel),
1753 	.setup		= ipgre_tunnel_setup,
1754 	.validate	= ipgre_tunnel_validate,
1755 	.newlink	= ipgre_newlink,
1756 	.changelink	= ipgre_changelink,
1757 	.get_size	= ipgre_get_size,
1758 	.fill_info	= ipgre_fill_info,
1759 };
1760 
1761 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1762 	.kind		= "gretap",
1763 	.maxtype	= IFLA_GRE_MAX,
1764 	.policy		= ipgre_policy,
1765 	.priv_size	= sizeof(struct ip_tunnel),
1766 	.setup		= ipgre_tap_setup,
1767 	.validate	= ipgre_tap_validate,
1768 	.newlink	= ipgre_newlink,
1769 	.changelink	= ipgre_changelink,
1770 	.get_size	= ipgre_get_size,
1771 	.fill_info	= ipgre_fill_info,
1772 };
1773 
1774 /*
1775  *	And now the module's code and the kernel interface.
1776  */
1777 
1778 static int __init ipgre_init(void)
1779 {
1780 	int err;
1781 
1782 	pr_info("GRE over IPv4 tunneling driver\n");
1783 
1784 	err = register_pernet_device(&ipgre_net_ops);
1785 	if (err < 0)
1786 		return err;
1787 
1788 	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1789 	if (err < 0) {
1790 		pr_info("%s: can't add protocol\n", __func__);
1791 		goto add_proto_failed;
1792 	}
1793 
1794 	err = rtnl_link_register(&ipgre_link_ops);
1795 	if (err < 0)
1796 		goto rtnl_link_failed;
1797 
1798 	err = rtnl_link_register(&ipgre_tap_ops);
1799 	if (err < 0)
1800 		goto tap_ops_failed;
1801 
1802 out:
1803 	return err;
1804 
1805 tap_ops_failed:
1806 	rtnl_link_unregister(&ipgre_link_ops);
1807 rtnl_link_failed:
1808 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1809 add_proto_failed:
1810 	unregister_pernet_device(&ipgre_net_ops);
1811 	goto out;
1812 }
1813 
1814 static void __exit ipgre_fini(void)
1815 {
1816 	rtnl_link_unregister(&ipgre_tap_ops);
1817 	rtnl_link_unregister(&ipgre_link_ops);
1818 	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1819 		pr_info("%s: can't remove protocol\n", __func__);
1820 	unregister_pernet_device(&ipgre_net_ops);
1821 }
1822 
1823 module_init(ipgre_init);
1824 module_exit(ipgre_fini);
1825 MODULE_LICENSE("GPL");
1826 MODULE_ALIAS_RTNL_LINK("gre");
1827 MODULE_ALIAS_RTNL_LINK("gretap");
1828 MODULE_ALIAS_NETDEV("gre0");
1829