xref: /linux/net/ipv4/ip_gre.c (revision 41de8d4cff21a2e81e3d9ff66f5f7c903f9c3ab1)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33 
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47 #include <net/gre.h>
48 
49 #if IS_ENABLED(CONFIG_IPV6)
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #endif
54 
55 /*
56    Problems & solutions
57    --------------------
58 
59    1. The most important issue is detecting local dead loops.
60    They would cause complete host lockup in transmit, which
61    would be "resolved" by stack overflow or, if queueing is enabled,
62    with infinite looping in net_bh.
63 
64    We cannot track such dead loops during route installation,
65    it is infeasible task. The most general solutions would be
66    to keep skb->encapsulation counter (sort of local ttl),
67    and silently drop packet when it expires. It is a good
   solution, but it supposes maintaining a new variable in ALL
69    skb, even if no tunneling is used.
70 
71    Current solution: xmit_recursion breaks dead loops. This is a percpu
72    counter, since when we enter the first ndo_xmit(), cpu migration is
73    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
74 
75    2. Networking dead loops would not kill routers, but would really
76    kill network. IP hop limit plays role of "t->recursion" in this case,
77    if we copy it from packet being encapsulated to upper header.
78    It is very good solution, but it introduces two problems:
79 
80    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
81      do not work over tunnels.
82    - traceroute does not work. I planned to relay ICMP from tunnel,
83      so that this problem would be solved and traceroute output
84      would even more informative. This idea appeared to be wrong:
85      only Linux complies to rfc1812 now (yes, guys, Linux is the only
86      true router now :-)), all routers (at least, in neighbourhood of mine)
87      return only 8 bytes of payload. It is the end.
88 
   Hence, if we want OSPF to work or traceroute to say something reasonable,
   we should search for another solution.
91 
92    One of them is to parse packet trying to detect inner encapsulation
93    made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, it is not a solution at all.
95 
96    Current solution: The solution was UNEXPECTEDLY SIMPLE.
97    We force DF flag on tunnels with preconfigured hop limit,
98    that is ALL. :-) Well, it does not remove the problem completely,
99    but exponential growth of network traffic is changed to linear
100    (branches, that exceed pmtu are pruned) and tunnel mtu
101    fastly degrades to value <68, where looping stops.
102    Yes, it is not good if there exists a router in the loop,
103    which does not force DF, even when encapsulating packets have DF set.
104    But it is not our problem! Nobody could accuse us, we made
105    all that we could make. Even if it is your gated who injected
106    fatal route to network, even if it were you who configured
107    fatal static route: you are innocent. :-)
108 
109 
110 
111    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112    practically identical code. It would be good to glue them
113    together, but it is not very evident, how to make them modular.
114    sit is integral part of IPv6, ipip and gre are naturally modular.
115    We could extract common parts (hash table, ioctl etc)
116    to a separate module (ip_tunnel.c).
117 
118    Alexey Kuznetsov.
119  */
120 
/* Forward declarations; the definitions live further down in this file. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

/* Number of buckets in each of the four hash tables below. */
#define HASH_SIZE  16

/* net_generic() id used to reach the per-namespace state below. */
static int ipgre_net_id __read_mostly;
/* Per-network-namespace GRE state. */
struct ipgre_net {
	/* Four hash tables of tunnels, indexed by match specificity
	 * (see the "4 hash tables" comment further down).  Chains are
	 * RCU-protected; updates require RTNL.
	 */
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	/* Catch-all device that receives otherwise-unmatched keyless GRE. */
	struct net_device *fb_tunnel_dev;
};
136 
137 /* Tunnel hash table */
138 
139 /*
140    4 hash tables:
141 
142    3: (remote,local)
143    2: (remote,*)
144    1: (*,local)
145    0: (*,*)
146 
147    We require exact key match i.e. if a key is present in packet
148    it will match only tunnel with the same key; if it is not present,
149    it will match only keyless tunnel.
150 
   All keyless packets, if not matching configured keyless tunnels,
   will match the fallback tunnel.
153  */
154 
/* Fold a 32-bit address (or GRE key) down to a 4-bit bucket index. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Aliases for the four tables in struct ipgre_net:
 * (remote,local), (remote,*), (*,local), (*,*) respectively.
 */
#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/* Walk one RCU-protected hash chain; expects "struct ip_tunnel *t"
 * to be declared in the enclosing scope.
 */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167 
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
	unsigned long	rx_packets;	/* packets decapsulated on receive */
	unsigned long	rx_bytes;	/* bytes decapsulated on receive */
	unsigned long	tx_packets;	/* packets encapsulated on transmit */
	unsigned long	tx_bytes;	/* bytes encapsulated on transmit */
} __attribute__((aligned(4*sizeof(unsigned long))));
175 
176 static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177 {
178 	struct pcpu_tstats sum = { 0 };
179 	int i;
180 
181 	for_each_possible_cpu(i) {
182 		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183 
184 		sum.rx_packets += tstats->rx_packets;
185 		sum.rx_bytes   += tstats->rx_bytes;
186 		sum.tx_packets += tstats->tx_packets;
187 		sum.tx_bytes   += tstats->tx_bytes;
188 	}
189 	dev->stats.rx_packets = sum.rx_packets;
190 	dev->stats.rx_bytes   = sum.rx_bytes;
191 	dev->stats.tx_packets = sum.tx_packets;
192 	dev->stats.tx_bytes   = sum.tx_bytes;
193 	return &dev->stats;
194 }
195 
/* Given src, dst and key, find appropriate for input tunnel.
 *
 * The four tables are scanned from most specific ((remote,local)) to
 * least specific (wildcard).  Within each table an entry matching both
 * the underlay link and the device type wins immediately (score 0);
 * otherwise the best-scoring candidate seen across all tables is kept
 * and returned at the end.  Runs under rcu_read_lock().
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
					      __be32 remote, __be32 local,
					      __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	/* ETH_P_TEB payload means an Ethernet (gretap) tunnel. */
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	/* Lower score is better; 4 means "no candidate yet". */
	int score, cand_score = 4;

	/* Table 3: both remote and local must match exactly. */
	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 2: remote must match, local is wildcarded. */
	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 1: local must match (a configured multicast "remote"
	 * also matches here), remote is wildcarded.
	 */
	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 0: wildcard addresses; only the key must agree. */
	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	/* Nothing configured matched: fall back to the catch-all device. */
	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
319 
320 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
321 		struct ip_tunnel_parm *parms)
322 {
323 	__be32 remote = parms->iph.daddr;
324 	__be32 local = parms->iph.saddr;
325 	__be32 key = parms->i_key;
326 	unsigned int h = HASH(key);
327 	int prio = 0;
328 
329 	if (local)
330 		prio |= 1;
331 	if (remote && !ipv4_is_multicast(remote)) {
332 		prio |= 2;
333 		h ^= HASH(remote);
334 	}
335 
336 	return &ign->tunnels[prio][h];
337 }
338 
/* Hash chain for an existing tunnel, derived from its own parameters. */
static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}
344 
/* Insert @t at the head of its hash chain.  Caller holds RTNL. */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	/* Set t->next before publishing t, so concurrent RCU readers
	 * never see a half-linked entry.
	 */
	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}
352 
/* Remove @t from its hash chain, if present.  Caller holds RTNL;
 * readers traversing the chain under RCU remain safe because only
 * the predecessor's next pointer is rewritten.
 */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
367 
368 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
369 					   struct ip_tunnel_parm *parms,
370 					   int type)
371 {
372 	__be32 remote = parms->iph.daddr;
373 	__be32 local = parms->iph.saddr;
374 	__be32 key = parms->i_key;
375 	int link = parms->link;
376 	struct ip_tunnel *t;
377 	struct ip_tunnel __rcu **tp;
378 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
379 
380 	for (tp = __ipgre_bucket(ign, parms);
381 	     (t = rtnl_dereference(*tp)) != NULL;
382 	     tp = &t->next)
383 		if (local == t->parms.iph.saddr &&
384 		    remote == t->parms.iph.daddr &&
385 		    key == t->parms.i_key &&
386 		    link == t->parms.link &&
387 		    type == t->dev->type)
388 			break;
389 
390 	return t;
391 }
392 
/* Look up a tunnel matching @parms; when @create is set and no match
 * exists, allocate, register and link a new device for it.
 * Returns the tunnel, or NULL on lookup miss (!create) or on
 * allocation/registration failure.  Caller holds RTNL.
 */
static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	/* "gre%d" lets the core pick the next free unit number. */
	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	/* Derive MTU/headroom from the underlay before registering. */
	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Extra reference dropped in ipgre_tunnel_uninit(). */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
433 
/* ndo_uninit: unhash the tunnel and drop the reference taken when it
 * was linked in ipgre_tunnel_locate().
 */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
442 
443 
/* ICMP error handler: an ICMP error arrived for a GRE packet we sent.
 * Parse enough of the quoted GRE header to locate the originating
 * tunnel, then record the error so the transmit path can report it
 * via dst_link_failure().
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	/* skb->data points at the quoted (inner) IP header of our
	 * original GRE packet; p is its GRE header.
	 */
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	/* Only host-unreachable-style errors and TTL expiry are acted on. */
	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	rcu_read_lock();
	/* The key, when present, is the last optional word before the
	 * payload, hence grehlen/4 - 1.
	 */
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	/* ttl==0 tunnels copy the inner TTL, so TTL expiry is expected. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}
531 
532 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
533 {
534 	if (INET_ECN_is_ce(iph->tos)) {
535 		if (skb->protocol == htons(ETH_P_IP)) {
536 			IP_ECN_set_ce(ip_hdr(skb));
537 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
538 			IP6_ECN_set_ce(ipv6_hdr(skb));
539 		}
540 	}
541 }
542 
543 static inline u8
544 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
545 {
546 	u8 inner = 0;
547 	if (skb->protocol == htons(ETH_P_IP))
548 		inner = old_iph->tos;
549 	else if (skb->protocol == htons(ETH_P_IPV6))
550 		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
551 	return INET_ECN_encapsulate(tos, inner);
552 }
553 
/* Receive path: parse the GRE header, find the owning tunnel, verify
 * checksum/sequence options, strip the encapsulation and re-inject the
 * inner packet.  Unmatched packets get an ICMP port-unreachable.
 * Always returns 0 (the skb is consumed on every path).
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* base GRE header: flags + protocol */
	__be16 gre_proto;

	/* 16 bytes covers the base header plus all optional words. */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		/* Optional words appear in order: checksum, key, sequence. */
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* 0x4x = IPv4 version nibble; anything else implies
			 * the WCCPv2 redirect header is present.
			 */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Checksum present but wrong, or required but absent. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		/* Enforce in-order delivery when sequencing is configured. */
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}
690 
/* Transmit path: route to the tunnel endpoint, handle NBMA destination
 * resolution and path-MTU, then prepend the outer IP + GRE headers and
 * send via __IPTUNNEL_XMIT.  Always returns NETDEV_TX_OK (the skb is
 * consumed on every path).
 */
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr  *old_iph = ip_hdr(skb);
	const struct iphdr  *tiph;
	struct flowi4 fl4;
	u8     tos;
	__be16 df;
	struct rtable *rt;     			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	/* With header_ops the outer header was prebuilt by hard_header;
	 * take it from the skb instead of the configured parameters.
	 */
	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (const struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		/* Derive the endpoint from the inner route/neighbour. */
		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt->rt_gateway;
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible addresses carry a usable
			 * IPv4 endpoint in the low 32 bits.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;
	}

	/* tos==1 means "inherit TOS from the inner packet". */
	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	}

	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->dst.dev;

	/* Routing back to ourselves would loop; see the dead-loop
	 * discussion at the top of this file.
	 */
	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		/* DF set and packet too big: report FRAG_NEEDED. */
		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	/* Relay recently recorded ICMP errors (see ipgre_err) to the
	 * sender as link failures, a bounded number of times.
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	/* Reallocate if there is no room for the outer headers or the
	 * skb cannot be written to in place.
	 */
	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the IPIP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	fl4.daddr;
	iph->saddr		=	fl4.saddr;

	/* ttl==0 means "inherit TTL from the inner packet". */
	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	}

	/* GRE base header immediately follows the outer IP header. */
	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	/* Fill optional words back-to-front: seq, then key, then csum. */
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}
928 
/* Bind the tunnel to its underlay: guess the output device from the
 * configured destination (or explicit link), and derive needed_headroom,
 * the GRE header length (tunnel->hlen) and a suitable MTU, which is
 * returned (never below 68, the minimum IPv4 MTU).
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;	/* pessimistic defaults if no */
	int mtu = ETH_DATA_LEN;		/* underlay device is found   */
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + base GRE */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route: fall back to the explicitly configured link. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
989 
/* ndo_do_ioctl: legacy SIOC{GET,ADD,CHG,DEL}TUNNEL configuration
 * interface.  Parameters are exchanged with userspace as a
 * struct ip_tunnel_parm via ifr->ifr_ifru.ifru_data.  Runs under RTNL.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		/* On the fallback device, look up the tunnel named by the
		 * user-supplied parameters; otherwise report this device.
		 */
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Sanity-check the outer header template and GRE flags. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		/* Fixed TTL forces DF; see the PMTU discussion up top. */
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* Parameters already used by another device. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				/* The broadcast/p2p nature of the device
				 * cannot change after creation.
				 */
				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-hash under the new addresses/keys. */
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			/* Return the effective parameters to userspace. */
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		/* On the fallback device, delete the tunnel named by the
		 * parameters -- but never the fallback device itself.
		 */
		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1120 
1121 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1122 {
1123 	struct ip_tunnel *tunnel = netdev_priv(dev);
1124 	if (new_mtu < 68 ||
1125 	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1126 		return -EINVAL;
1127 	dev->mtu = new_mtu;
1128 	return 0;
1129 }
1130 
1131 /* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
1133    over the Internet, provided multicast routing is tuned.
1134 
1135 
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco may have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).
1140 
1141    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1142    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1143 
1144    ping -t 255 224.66.66.66
1145 
1146    If nobody answers, mbone does not work.
1147 
1148    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1149    ip addr add 10.66.66.<somewhat>/24 dev Universe
1150    ifconfig Universe up
1151    ifconfig Universe add fe80::<Your_real_addr>/10
1152    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1153    ftp 10.66.66.66
1154    ...
1155    ftp fec0:6666:6666::193.233.7.65
1156    ...
1157 
1158  */
1159 
1160 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1161 			unsigned short type,
1162 			const void *daddr, const void *saddr, unsigned int len)
1163 {
1164 	struct ip_tunnel *t = netdev_priv(dev);
1165 	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1166 	__be16 *p = (__be16*)(iph+1);
1167 
1168 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1169 	p[0]		= t->parms.o_flags;
1170 	p[1]		= htons(type);
1171 
1172 	/*
1173 	 *	Set the source hardware address.
1174 	 */
1175 
1176 	if (saddr)
1177 		memcpy(&iph->saddr, saddr, 4);
1178 	if (daddr)
1179 		memcpy(&iph->daddr, daddr, 4);
1180 	if (iph->daddr)
1181 		return t->hlen;
1182 
1183 	return -t->hlen;
1184 }
1185 
1186 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1187 {
1188 	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1189 	memcpy(haddr, &iph->saddr, 4);
1190 	return 4;
1191 }
1192 
/* header_ops used when the device builds the outer IP/GRE header
 * itself (multicast/broadcast and NBMA operation). */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
1197 
1198 #ifdef CONFIG_NET_IPGRE_BROADCAST
1199 static int ipgre_open(struct net_device *dev)
1200 {
1201 	struct ip_tunnel *t = netdev_priv(dev);
1202 
1203 	if (ipv4_is_multicast(t->parms.iph.daddr)) {
1204 		struct flowi4 fl4;
1205 		struct rtable *rt;
1206 
1207 		rt = ip_route_output_gre(dev_net(dev), &fl4,
1208 					 t->parms.iph.daddr,
1209 					 t->parms.iph.saddr,
1210 					 t->parms.o_key,
1211 					 RT_TOS(t->parms.iph.tos),
1212 					 t->parms.link);
1213 		if (IS_ERR(rt))
1214 			return -EADDRNOTAVAIL;
1215 		dev = rt->dst.dev;
1216 		ip_rt_put(rt);
1217 		if (__in_dev_get_rtnl(dev) == NULL)
1218 			return -EADDRNOTAVAIL;
1219 		t->mlink = dev->ifindex;
1220 		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1221 	}
1222 	return 0;
1223 }
1224 
1225 static int ipgre_close(struct net_device *dev)
1226 {
1227 	struct ip_tunnel *t = netdev_priv(dev);
1228 
1229 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1230 		struct in_device *in_dev;
1231 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1232 		if (in_dev)
1233 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1234 	}
1235 	return 0;
1236 }
1237 
1238 #endif
1239 
/* Device operations for plain (ARPHRD_IPGRE) tunnel devices. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	/* Multicast endpoints join/leave the group on open/close. */
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};
1252 
/* Device destructor: release the per-cpu stats, then the device. */
static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
1258 
/* Common setup for ARPHRD_IPGRE devices (ioctl- and netlink-created). */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor 	= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	/* Budget for the outer IP header plus the 4-byte base GRE header;
	 * GRE options can grow the real hlen later (see bind_dev). */
	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	/* "hardware" address is the 4-byte IPv4 tunnel endpoint. */
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
1273 
/*
 * ndo_init for ARPHRD_IPGRE devices: publish the configured IPv4
 * endpoints as dev_addr/broadcast, choose header_ops based on the
 * addressing mode, and allocate per-cpu stats.
 * Returns 0, -EINVAL (multicast daddr without saddr) or -ENOMEM.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		/* Multicast destination: broadcast-style operation, which
		 * requires an explicit local address to source from. */
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		/* No fixed remote: ipgre_header() fills daddr per packet. */
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1306 
1307 static void ipgre_fb_tunnel_init(struct net_device *dev)
1308 {
1309 	struct ip_tunnel *tunnel = netdev_priv(dev);
1310 	struct iphdr *iph = &tunnel->parms.iph;
1311 
1312 	tunnel->dev = dev;
1313 	strcpy(tunnel->parms.name, dev->name);
1314 
1315 	iph->version		= 4;
1316 	iph->protocol		= IPPROTO_GRE;
1317 	iph->ihl		= 5;
1318 	tunnel->hlen		= sizeof(struct iphdr) + 4;
1319 
1320 	dev_hold(dev);
1321 }
1322 
1323 
/* Receive/error hooks registered with the shared GRE demultiplexer. */
static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};
1328 
1329 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1330 {
1331 	int prio;
1332 
1333 	for (prio = 0; prio < 4; prio++) {
1334 		int h;
1335 		for (h = 0; h < HASH_SIZE; h++) {
1336 			struct ip_tunnel *t;
1337 
1338 			t = rtnl_dereference(ign->tunnels[prio][h]);
1339 
1340 			while (t != NULL) {
1341 				unregister_netdevice_queue(t->dev, head);
1342 				t = rtnl_dereference(t->next);
1343 			}
1344 		}
1345 	}
1346 }
1347 
/*
 * Per-namespace init: create and register the fallback "gre0" device
 * and expose it via the wildcard hash chain.
 * Returns 0 or a negative errno; on failure everything allocated here
 * is released again.
 */
static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	/* Hash the fallback tunnel into the first wildcard bucket. */
	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}
1376 
1377 static void __net_exit ipgre_exit_net(struct net *net)
1378 {
1379 	struct ipgre_net *ign;
1380 	LIST_HEAD(list);
1381 
1382 	ign = net_generic(net, ipgre_net_id);
1383 	rtnl_lock();
1384 	ipgre_destroy_tunnels(ign, &list);
1385 	unregister_netdevice_many(&list);
1386 	rtnl_unlock();
1387 }
1388 
/* Per-network-namespace hooks; each netns gets its own struct
 * ipgre_net (fallback device plus tunnel hash tables). */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1395 
1396 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1397 {
1398 	__be16 flags;
1399 
1400 	if (!data)
1401 		return 0;
1402 
1403 	flags = 0;
1404 	if (data[IFLA_GRE_IFLAGS])
1405 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1406 	if (data[IFLA_GRE_OFLAGS])
1407 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1408 	if (flags & (GRE_VERSION|GRE_ROUTING))
1409 		return -EINVAL;
1410 
1411 	return 0;
1412 }
1413 
1414 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1415 {
1416 	__be32 daddr;
1417 
1418 	if (tb[IFLA_ADDRESS]) {
1419 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1420 			return -EINVAL;
1421 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1422 			return -EADDRNOTAVAIL;
1423 	}
1424 
1425 	if (!data)
1426 		goto out;
1427 
1428 	if (data[IFLA_GRE_REMOTE]) {
1429 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1430 		if (!daddr)
1431 			return -EINVAL;
1432 	}
1433 
1434 out:
1435 	return ipgre_tunnel_validate(tb, data);
1436 }
1437 
/*
 * Translate IFLA_GRE_* netlink attributes into an ip_tunnel_parm.
 * Missing attributes keep the zeroed defaults, except path-MTU
 * discovery which defaults to enabled (IP_DF set).
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	/* No attributes at all: leave defaults. */
	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* PMTU discovery is on unless explicitly disabled. */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1478 
1479 static int ipgre_tap_init(struct net_device *dev)
1480 {
1481 	struct ip_tunnel *tunnel;
1482 
1483 	tunnel = netdev_priv(dev);
1484 
1485 	tunnel->dev = dev;
1486 	strcpy(tunnel->parms.name, dev->name);
1487 
1488 	ipgre_tunnel_bind_dev(dev);
1489 
1490 	dev->tstats = alloc_percpu(struct pcpu_tstats);
1491 	if (!dev->tstats)
1492 		return -ENOMEM;
1493 
1494 	return 0;
1495 }
1496 
/* Device operations for gretap (ethernet-over-GRE) devices; MAC
 * address handling uses the standard ethernet helpers. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address 	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};
1506 
/* Setup for gretap devices: ethernet defaults first, then the
 * tunnel-specific overrides. */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor 	= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1518 
1519 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1520 			 struct nlattr *data[])
1521 {
1522 	struct ip_tunnel *nt;
1523 	struct net *net = dev_net(dev);
1524 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1525 	int mtu;
1526 	int err;
1527 
1528 	nt = netdev_priv(dev);
1529 	ipgre_netlink_parms(data, &nt->parms);
1530 
1531 	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1532 		return -EEXIST;
1533 
1534 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1535 		random_ether_addr(dev->dev_addr);
1536 
1537 	mtu = ipgre_tunnel_bind_dev(dev);
1538 	if (!tb[IFLA_MTU])
1539 		dev->mtu = mtu;
1540 
1541 	/* Can use a lockless transmit, unless we generate output sequences */
1542 	if (!(nt->parms.o_flags & GRE_SEQ))
1543 		dev->features |= NETIF_F_LLTX;
1544 
1545 	err = register_netdevice(dev);
1546 	if (err)
1547 		goto out;
1548 
1549 	dev_hold(dev);
1550 	ipgre_tunnel_link(ign, nt);
1551 
1552 out:
1553 	return err;
1554 }
1555 
/*
 * rtnl_link_ops->changelink: reconfigure an existing tunnel from
 * netlink attributes.  The fallback device cannot be changed.
 * Endpoint/input-key changes require unhashing and rehashing the
 * tunnel; o_key/ttl/tos/frag_off are updated in place.
 * Returns 0, -EINVAL or -EEXIST.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* New parameters collide with a different device. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* Cannot flip between broadcast and p-t-p modes. */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* Rehash under the new endpoints / input key. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	/* Rebinding to a new underlying link can change the MTU. */
	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1619 
/* Upper bound on the netlink payload ipgre_fill_info() emits for
 * one device. */
static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}
1645 
/*
 * Dump the tunnel configuration as IFLA_GRE_* attributes.  The
 * NLA_PUT_* macros jump to nla_put_failure when the skb runs out of
 * room.  Returns 0 or -EMSGSIZE.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1667 
/* Netlink attribute policy for the IFLA_GRE_* configuration set. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1680 
/* rtnl_link_ops for plain GRE ("gre") devices. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1693 
/* rtnl_link_ops for ethernet-over-GRE ("gretap") devices. */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1706 
1707 /*
1708  *	And now the modules code and kernel interface.
1709  */
1710 
1711 static int __init ipgre_init(void)
1712 {
1713 	int err;
1714 
1715 	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1716 
1717 	err = register_pernet_device(&ipgre_net_ops);
1718 	if (err < 0)
1719 		return err;
1720 
1721 	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1722 	if (err < 0) {
1723 		printk(KERN_INFO "ipgre init: can't add protocol\n");
1724 		goto add_proto_failed;
1725 	}
1726 
1727 	err = rtnl_link_register(&ipgre_link_ops);
1728 	if (err < 0)
1729 		goto rtnl_link_failed;
1730 
1731 	err = rtnl_link_register(&ipgre_tap_ops);
1732 	if (err < 0)
1733 		goto tap_ops_failed;
1734 
1735 out:
1736 	return err;
1737 
1738 tap_ops_failed:
1739 	rtnl_link_unregister(&ipgre_link_ops);
1740 rtnl_link_failed:
1741 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1742 add_proto_failed:
1743 	unregister_pernet_device(&ipgre_net_ops);
1744 	goto out;
1745 }
1746 
/* Module unload: tear down in the reverse order of ipgre_init(). */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
	unregister_pernet_device(&ipgre_net_ops);
}
1755 
/* Module registration and aliases for on-demand loading. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");
1762