xref: /linux/net/ipv4/ip_gre.c (revision 4dc7ccf7e9d9bca1989b840be9e8e84911387cf2)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33 
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47 
48 #ifdef CONFIG_IPV6
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #endif
53 
54 /*
55    Problems & solutions
56    --------------------
57 
58    1. The most important issue is detecting local dead loops.
59    They would cause complete host lockup in transmit, which
60    would be "resolved" by stack overflow or, if queueing is enabled,
61    with infinite looping in net_bh.
62 
63    We cannot track such dead loops during route installation,
64    it is infeasible task. The most general solutions would be
65    to keep skb->encapsulation counter (sort of local ttl),
66    and silently drop packet when it expires. It is the best
67    solution, but it supposes maintaining a new variable in ALL
68    skb, even if no tunneling is used.
69 
70    Current solution: HARD_TX_LOCK lock breaks dead loops.
71 
72 
73 
74    2. Networking dead loops would not kill routers, but would really
75    kill network. IP hop limit plays role of "t->recursion" in this case,
76    if we copy it from packet being encapsulated to upper header.
77    It is very good solution, but it introduces two problems:
78 
79    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
80      do not work over tunnels.
81    - traceroute does not work. I planned to relay ICMP from tunnel,
82      so that this problem would be solved and traceroute output
83      would even more informative. This idea appeared to be wrong:
84      only Linux complies to rfc1812 now (yes, guys, Linux is the only
85      true router now :-)), all routers (at least, in neighbourhood of mine)
86      return only 8 bytes of payload. It is the end.
87 
88    Hence, if we want that OSPF worked or traceroute said something reasonable,
89    we should search for another solution.
90 
91    One of them is to parse packet trying to detect inner encapsulation
92    made by our node. It is difficult or even impossible, especially,
93    taking into account fragmentation. To be short, it is not a solution at all.
94 
95    Current solution: The solution was UNEXPECTEDLY SIMPLE.
96    We force DF flag on tunnels with preconfigured hop limit,
97    that is ALL. :-) Well, it does not remove the problem completely,
98    but exponential growth of network traffic is changed to linear
99    (branches, that exceed pmtu are pruned) and tunnel mtu
100    quickly degrades to a value <68, where looping stops.
101    Yes, it is not good if there exists a router in the loop,
102    which does not force DF, even when encapsulating packets have DF set.
103    But it is not our problem! Nobody could accuse us, we made
104    all that we could make. Even if it is your gated who injected
105    fatal route to network, even if it were you who configured
106    fatal static route: you are innocent. :-)
107 
108 
109 
110    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
111    practically identical code. It would be good to glue them
112    together, but it is not very evident, how to make them modular.
113    sit is integral part of IPv6, ipip and gre are naturally modular.
114    We could extract common parts (hash table, ioctl etc)
115    to a separate module (ip_tunnel.c).
116 
117    Alexey Kuznetsov.
118  */
119 
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

/* Per-network-namespace state: four tunnel hash tables (see the table
 * layout comment below) plus the always-present fallback device. */
static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require exact key match i.e. if a key is present in packet
   it will match only tunnel with the same key; if it is not present,
   it will match only keyless tunnel.

   All keyless packets, if not matching configured keyless tunnels
   will match fallback tunnel.
 */

/* Fold an IPv4 address (or GRE key) down to a 4-bit bucket index. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
/*
 * Locking : hash tables are protected by RCU and a spinlock
 * (readers use rcu_read_lock(); writers serialize on ipgre_lock).
 */
static DEFINE_SPINLOCK(ipgre_lock);

/* Walk one RCU-protected hash chain; relies on a variable named 't'
 * being in scope and the caller holding rcu_read_lock(). */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167 
168 /* Given src, dst and key, find appropriate for input tunnel. */
169 
/* The four tables are probed from most to least specific.  Within a
 * table, a tunnel whose link and device type both match is returned
 * immediately; otherwise the best-scoring near-miss seen anywhere is
 * remembered and returned at the end.  Caller holds rcu_read_lock().
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
					      __be32 remote, __be32 local,
					      __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	/* ETH_P_TEB payloads carry an Ethernet frame (gretap). */
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	/* Lower score is a better match: bit 0 = link mismatch,
	 * bit 1 = device type mismatch; 4 means "no candidate yet". */
	int score, cand_score = 4;

	/* (remote,local) table */
	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		/* A plain ARPHRD_IPGRE device accepts either payload type. */
		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* (remote,*) table */
	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* (*,local) table; a multicast tunnel destination also counts
	 * as "local" for packets received on that group. */
	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* (*,*) table */
	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	/* Last resort: the fallback device, if it is up. */
	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
291 
292 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
293 		struct ip_tunnel_parm *parms)
294 {
295 	__be32 remote = parms->iph.daddr;
296 	__be32 local = parms->iph.saddr;
297 	__be32 key = parms->i_key;
298 	unsigned h = HASH(key);
299 	int prio = 0;
300 
301 	if (local)
302 		prio |= 1;
303 	if (remote && !ipv4_is_multicast(remote)) {
304 		prio |= 2;
305 		h ^= HASH(remote);
306 	}
307 
308 	return &ign->tunnels[prio][h];
309 }
310 
311 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
312 		struct ip_tunnel *t)
313 {
314 	return __ipgre_bucket(ign, &t->parms);
315 }
316 
/* Insert @t at the head of its hash chain.  t->next is filled in
 * before the RCU-visible head pointer is updated, so concurrent
 * readers always traverse a well-formed list. */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(ign, t);

	spin_lock_bh(&ipgre_lock);
	t->next = *tp;
	rcu_assign_pointer(*tp, t);
	spin_unlock_bh(&ipgre_lock);
}
326 
/* Remove @t from its hash chain.  Only the single pointer update is
 * done under the lock; t->next is left intact so RCU readers already
 * past this point can finish their traversal safely. */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
		if (t == *tp) {
			spin_lock_bh(&ipgre_lock);
			*tp = t->next;
			spin_unlock_bh(&ipgre_lock);
			break;
		}
	}
}
340 
341 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
342 					   struct ip_tunnel_parm *parms,
343 					   int type)
344 {
345 	__be32 remote = parms->iph.daddr;
346 	__be32 local = parms->iph.saddr;
347 	__be32 key = parms->i_key;
348 	int link = parms->link;
349 	struct ip_tunnel *t, **tp;
350 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
351 
352 	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
353 		if (local == t->parms.iph.saddr &&
354 		    remote == t->parms.iph.daddr &&
355 		    key == t->parms.i_key &&
356 		    link == t->parms.link &&
357 		    type == t->dev->type)
358 			break;
359 
360 	return t;
361 }
362 
363 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
364 		struct ip_tunnel_parm *parms, int create)
365 {
366 	struct ip_tunnel *t, *nt;
367 	struct net_device *dev;
368 	char name[IFNAMSIZ];
369 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
370 
371 	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
372 	if (t || !create)
373 		return t;
374 
375 	if (parms->name[0])
376 		strlcpy(name, parms->name, IFNAMSIZ);
377 	else
378 		sprintf(name, "gre%%d");
379 
380 	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
381 	if (!dev)
382 	  return NULL;
383 
384 	dev_net_set(dev, net);
385 
386 	if (strchr(name, '%')) {
387 		if (dev_alloc_name(dev, name) < 0)
388 			goto failed_free;
389 	}
390 
391 	nt = netdev_priv(dev);
392 	nt->parms = *parms;
393 	dev->rtnl_link_ops = &ipgre_link_ops;
394 
395 	dev->mtu = ipgre_tunnel_bind_dev(dev);
396 
397 	if (register_netdevice(dev) < 0)
398 		goto failed_free;
399 
400 	dev_hold(dev);
401 	ipgre_tunnel_link(ign, nt);
402 	return nt;
403 
404 failed_free:
405 	free_netdev(dev);
406 	return NULL;
407 }
408 
409 static void ipgre_tunnel_uninit(struct net_device *dev)
410 {
411 	struct net *net = dev_net(dev);
412 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
413 
414 	ipgre_tunnel_unlink(ign, netdev_priv(dev));
415 	dev_put(dev);
416 }
417 
418 
/* ICMP error handler: account transmit-path errors against the tunnel
 * that generated the offending encapsulated packet. */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	/* skb->data points at the outer IP header of our own packet as
	 * echoed back inside the ICMP payload. */
	struct iphdr *iph = (struct iphdr *)skb->data;
	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;	/* IP header + base GRE header */
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		/* Extend grehlen past the optional checksum word so the
		 * key ends up as the last word of the header area. */
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	rcu_read_lock();
	/* When present, the key is the last 32-bit word of the GRE
	 * header area computed above. */
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	/* Tunnels with inherited TTL caused the TTL expiry themselves. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	/* Error bookkeeping consumed by ipgre_tunnel_xmit(). */
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
	return;
}
507 
508 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
509 {
510 	if (INET_ECN_is_ce(iph->tos)) {
511 		if (skb->protocol == htons(ETH_P_IP)) {
512 			IP_ECN_set_ce(ip_hdr(skb));
513 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
514 			IP6_ECN_set_ce(ipv6_hdr(skb));
515 		}
516 	}
517 }
518 
519 static inline u8
520 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
521 {
522 	u8 inner = 0;
523 	if (skb->protocol == htons(ETH_P_IP))
524 		inner = old_iph->tos;
525 	else if (skb->protocol == htons(ETH_P_IPV6))
526 		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
527 	return INET_ECN_encapsulate(tos, inner);
528 }
529 
/* GRE receive handler: parse the GRE header, locate the matching
 * tunnel, verify checksum/sequence requirements and hand the inner
 * packet back to the stack on the tunnel device. */
static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* base GRE header: flags + protocol */
	__be16 gre_proto;
	unsigned int len;

	/* 16 = largest GRE header we parse: 4 base + csum + key + seq. */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct net_device_stats *stats = &tunnel->dev->stats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* First nibble != 4 => WCCPv2 redirect header present. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		/* Strip the GRE header; skb->data now points at the payload. */
		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (skb_rtable(skb)->fl.iif == 0)
				goto drop;
			stats->multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Drop on checksum failure, or when the tunnel requires a
		 * checksum but the packet carried none. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			stats->rx_crc_errors++;
			stats->rx_errors++;
			goto drop;
		}
		/* Enforce in-order delivery when sequencing is configured. */
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				stats->rx_fifo_errors++;
				stats->rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		len = skb->len;

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				stats->rx_length_errors++;
				stats->rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		stats->rx_packets++;
		stats->rx_bytes += len;
		skb->dev = tunnel->dev;
		skb_dst_drop(skb);
		nf_reset(skb);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);
		rcu_read_unlock();
		return(0);
	}
	/* No tunnel matched: report an unreachable GRE "port". */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return(0);
}
668 
/* Transmit path: resolve the tunnel destination, route the outer
 * packet, handle path-MTU, then prepend the outer IP + GRE headers
 * and hand the result to the IP layer via IPTUNNEL_XMIT(). */
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &dev->stats;
	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
	struct iphdr  *old_iph = ip_hdr(skb);
	struct iphdr  *tiph;
	u8     tos;
	__be16 df;
	struct rtable *rt;     			/* Route to the other host */
	struct net_device *tdev;			/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	/* With header_ops (plain ARPHRD_IPGRE), ipgre_header() already
	 * built the outer header at skb->data, so nothing extra is
	 * pushed; otherwise the configured template is used. */
	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel: derive the endpoint from the inner route
		 * (IPv4 gateway) or the IPv6 neighbour entry. */

		if (skb_dst(skb) == NULL) {
			stats->tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb_dst(skb)->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible addresses carry a usable v4
			 * endpoint in their low 32 bits. */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	/* tos value 1 acts as "inherit from the inner IPv4 packet". */
	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
	}

	{
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = dst,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
			stats->tx_carrier_errors++;
			goto tx_error;
		}
	}
	tdev = rt->u.dst.dev;

	/* Routing back to ourselves would recurse; see the "dead loops"
	 * discussion at the top of this file. */
	if (tdev == dev) {
		ip_rt_put(rt);
		stats->collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		/* Inner packet has DF set and does not fit: tell the
		 * sender the usable MTU instead of fragmenting. */
		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	/* Replay recent ICMP errors (recorded by ipgre_err()) back to
	 * local senders for a short window. */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len;

	/* Reallocate when there is no room for the outer headers or the
	 * skb cannot be safely written to in place. */
	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			txq->tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->u.dst);

	/*
	 *	Push down and install the IPIP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	rt->rt_dst;
	iph->saddr		=	rt->rt_src;

	/* Outer TTL 0 means "inherit from the inner packet". */
	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
	}

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	/* Optional GRE fields are written back-to-front from the end of
	 * the precomputed header area (seq, then key, then checksum). */
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	return NETDEV_TX_OK;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}
902 
/* Bind the tunnel to its underlying output device (found by routing
 * to the configured destination, or by parms.link) and precompute
 * needed_headroom and the GRE header length.  Returns the suggested
 * device MTU (never below 68, the IPv4 minimum). */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + base GRE */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* Fall back to the explicitly configured underlying link. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
962 
/* Tunnel configuration ioctl handler (SIOC{GET,ADD,CHG,DEL}TUNNEL).
 * Parameters are exchanged via a struct ip_tunnel_parm in userspace
 * memory at ifr->ifr_ifru.ifru_data. */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		/* On the fallback device the request names the tunnel to
		 * query; on a real tunnel device the device is implicit. */
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Sanity-check the requested parameters: plain IPv4/GRE
		 * header, no options, no unsupported GRE flags. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		/* Fixed TTL forces DF; see "Problems & solutions" above. */
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags = 0;

				t = netdev_priv(dev);

				/* Changing the endpoints must not flip the
				 * device between p-t-p and broadcast mode. */
				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-hash under the new addresses/keys. */
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			/* The fallback device itself cannot be deleted. */
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1092 
1093 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1094 {
1095 	struct ip_tunnel *tunnel = netdev_priv(dev);
1096 	if (new_mtu < 68 ||
1097 	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1098 		return -EINVAL;
1099 	dev->mtu = new_mtu;
1100 	return 0;
1101 }
1102 
1103 /* Nice toy. Unfortunately, useless in real life :-)
1104    It allows to construct virtual multiprotocol broadcast "LAN"
1105    over the Internet, provided multicast routing is tuned.
1106 
1107 
1108    I have no idea whether this bicycle was invented before me,
1109    so that I had to set ARPHRD_IPGRE to a random value.
1110    I have an impression, that Cisco could make something similar,
1111    but this feature is apparently missing in IOS<=11.2(8).
1112 
1113    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1114    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1115 
1116    ping -t 255 224.66.66.66
1117 
1118    If nobody answers, mbone does not work.
1119 
1120    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1121    ip addr add 10.66.66.<somewhat>/24 dev Universe
1122    ifconfig Universe up
1123    ifconfig Universe add fe80::<Your_real_addr>/10
1124    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1125    ftp 10.66.66.66
1126    ...
1127    ftp fec0:6666:6666::193.233.7.65
1128    ...
1129 
1130  */
1131 
1132 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1133 			unsigned short type,
1134 			const void *daddr, const void *saddr, unsigned len)
1135 {
1136 	struct ip_tunnel *t = netdev_priv(dev);
1137 	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1138 	__be16 *p = (__be16*)(iph+1);
1139 
1140 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1141 	p[0]		= t->parms.o_flags;
1142 	p[1]		= htons(type);
1143 
1144 	/*
1145 	 *	Set the source hardware address.
1146 	 */
1147 
1148 	if (saddr)
1149 		memcpy(&iph->saddr, saddr, 4);
1150 	if (daddr)
1151 		memcpy(&iph->daddr, daddr, 4);
1152 	if (iph->daddr)
1153 		return t->hlen;
1154 
1155 	return -t->hlen;
1156 }
1157 
1158 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1159 {
1160 	struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1161 	memcpy(haddr, &iph->saddr, 4);
1162 	return 4;
1163 }
1164 
/* Link-layer header operations for GRE devices whose 4-byte "hardware
 * address" is the peer's outer IPv4 address. */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
1169 
1170 #ifdef CONFIG_NET_IPGRE_BROADCAST
/* ndo_open for broadcast GRE tunnels (multicast destination): join the
 * tunnel's multicast group on the physical egress device. */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		/* Route towards the multicast destination to find the
		 * underlying device the membership must be taken on. */
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->u.dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		/* Remember the ifindex so ipgre_close() can leave the group
		 * even if routing changes in the meantime. */
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1194 
1195 static int ipgre_close(struct net_device *dev)
1196 {
1197 	struct ip_tunnel *t = netdev_priv(dev);
1198 
1199 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1200 		struct in_device *in_dev;
1201 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1202 		if (in_dev) {
1203 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1204 			in_dev_put(in_dev);
1205 		}
1206 	}
1207 	return 0;
1208 }
1209 
1210 #endif
1211 
/* Device operations for plain (ARPHRD_IPGRE) tunnels; open/stop are
 * only needed for the multicast-broadcast mode. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1223 
/* rtnl_link setup callback for "gre" devices (also used for the
 * per-netns fallback device gre0). */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor 	= free_netdev;

	dev->type		= ARPHRD_IPGRE;
	/* Worst-case outer headers: link layer + IPv4 + 4-byte base GRE. */
	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* "hardware" address is an IPv4 address */
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
1238 
/* ndo_init for ARPHRD_IPGRE tunnels: mirror the configured endpoints
 * into the device addresses and select header_ops for the broadcast
 * (multicast peer) and NBMA (no fixed peer) cases. */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* dev_addr/broadcast hold the outer IPv4 endpoints (addr_len == 4). */
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Multicast tunnels require an explicit source. */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		/* NBMA mode: destination is supplied per packet. */
		dev->header_ops = &ipgre_header_ops;

	return 0;
}
1267 
/* Initialize the per-netns fallback device "gre0", linked as the
 * wildcard entry so it receives GRE packets matching no other tunnel. */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;	/* outer IP + base GRE */

	/* Extra reference: the fallback device is only released when the
	 * namespace is torn down, never by user request. */
	dev_hold(dev);
	ign->tunnels_wc[0]	= tunnel;
}
1285 
1286 
/* IPPROTO_GRE receive/error hooks registered with the inet layer;
 * netns_ok: all tunnel state is kept per network namespace. */
static const struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,
	.err_handler	=	ipgre_err,
	.netns_ok	=	1,
};
1292 
1293 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1294 {
1295 	int prio;
1296 
1297 	for (prio = 0; prio < 4; prio++) {
1298 		int h;
1299 		for (h = 0; h < HASH_SIZE; h++) {
1300 			struct ip_tunnel *t = ign->tunnels[prio][h];
1301 
1302 			while (t != NULL) {
1303 				unregister_netdevice_queue(t->dev, head);
1304 				t = t->next;
1305 			}
1306 		}
1307 	}
1308 }
1309 
/* Per-netns init: allocate and register the fallback "gre0" device. */
static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	/* Must precede ipgre_fb_tunnel_init(), which calls dev_net(). */
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	return 0;

err_reg_dev:
	free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}
1336 
1337 static void __net_exit ipgre_exit_net(struct net *net)
1338 {
1339 	struct ipgre_net *ign;
1340 	LIST_HEAD(list);
1341 
1342 	ign = net_generic(net, ipgre_net_id);
1343 	rtnl_lock();
1344 	ipgre_destroy_tunnels(ign, &list);
1345 	unregister_netdevice_many(&list);
1346 	rtnl_unlock();
1347 }
1348 
/* Pernet operations; .size makes the core allocate struct ipgre_net
 * for each namespace, reachable via net_generic(net, ipgre_net_id). */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1355 
1356 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1357 {
1358 	__be16 flags;
1359 
1360 	if (!data)
1361 		return 0;
1362 
1363 	flags = 0;
1364 	if (data[IFLA_GRE_IFLAGS])
1365 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1366 	if (data[IFLA_GRE_OFLAGS])
1367 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1368 	if (flags & (GRE_VERSION|GRE_ROUTING))
1369 		return -EINVAL;
1370 
1371 	return 0;
1372 }
1373 
1374 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1375 {
1376 	__be32 daddr;
1377 
1378 	if (tb[IFLA_ADDRESS]) {
1379 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1380 			return -EINVAL;
1381 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1382 			return -EADDRNOTAVAIL;
1383 	}
1384 
1385 	if (!data)
1386 		goto out;
1387 
1388 	if (data[IFLA_GRE_REMOTE]) {
1389 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1390 		if (!daddr)
1391 			return -EINVAL;
1392 	}
1393 
1394 out:
1395 	return ipgre_tunnel_validate(tb, data);
1396 }
1397 
/* Translate IFLA_GRE_* netlink attributes into ip_tunnel_parm,
 * starting from all-zero defaults.  Path-MTU discovery (DF bit) is on
 * by default and only cleared when IFLA_GRE_PMTUDISC is 0. */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1438 
1439 static int ipgre_tap_init(struct net_device *dev)
1440 {
1441 	struct ip_tunnel *tunnel;
1442 
1443 	tunnel = netdev_priv(dev);
1444 
1445 	tunnel->dev = dev;
1446 	strcpy(tunnel->parms.name, dev->name);
1447 
1448 	ipgre_tunnel_bind_dev(dev);
1449 
1450 	return 0;
1451 }
1452 
/* Device operations for gretap: Ethernet address handling comes from
 * the generic eth helpers, encapsulation from the shared xmit path. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address 	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1461 
/* rtnl_link setup for "gretap": a regular Ethernet device whose frames
 * are carried inside GRE. */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor 	= free_netdev;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1473 
/* rtnl_link newlink handler shared by "gre" and "gretap". */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	/* Refuse a duplicate of an existing tunnel of the same type. */
	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	/* Use the MTU derived from the route unless the user gave one. */
	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	err = register_netdevice(dev);
	if (err)
		goto out;

	/* Reference held while the tunnel sits in the hash tables. */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1506 
/* rtnl_link changelink handler: update tunnel parameters in place.
 * Endpoint/key changes require re-hashing the tunnel, so the device is
 * unlinked and relinked; TTL/TOS/DF and the output key can be changed
 * without touching the hash tables. */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	/* The fallback device's parameters are fixed. */
	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* New parameters collide with a different tunnel. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* Refuse changes that would flip the device between
			 * broadcast and point-to-point modes. */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* Re-hash under the new endpoints/input key. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1570 
/* Worst-case netlink attribute payload emitted by ipgre_fill_info(). */
static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}
1596 
/* Dump the tunnel configuration as IFLA_GRE_* attributes; the NLA_PUT*
 * macros jump to nla_put_failure when the skb runs out of room. */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* Report PMTU discovery as a boolean derived from the DF bit. */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1618 
/* Netlink attribute policy shared by the "gre" and "gretap" link ops. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1631 
/* rtnl_link registration for plain "gre" tunnels. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1644 
/* rtnl_link registration for "gretap" (Ethernet-over-GRE) tunnels. */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1657 
1658 /*
1659  *	And now the modules code and kernel interface.
1660  */
1661 
/* Module load: register pernet state, then the IPPROTO_GRE handler,
 * then both rtnl_link ops; unwind in reverse order on any failure. */
static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
	if (err < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}
1697 
/* Module unload: tear everything down in reverse order of ipgre_init(). */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
	unregister_pernet_device(&ipgre_net_ops);
}
1706 
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Allow module autoloading for "ip link add ... type gre|gretap". */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
1712