xref: /linux/net/ipv4/ip_gre.c (revision e5c5d22e8dcf7c2d430336cbf8e180bd38e8daf1)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/mroute.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
35 
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ip_tunnels.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
50 
51 #if IS_ENABLED(CONFIG_IPV6)
52 #include <net/ipv6.h>
53 #include <net/ip6_fib.h>
54 #include <net/ip6_route.h>
55 #endif
56 
57 /*
58    Problems & solutions
59    --------------------
60 
61    1. The most important issue is detecting local dead loops.
62    They would cause complete host lockup in transmit, which
63    would be "resolved" by stack overflow or, if queueing is enabled,
64    with infinite looping in net_bh.
65 
66    We cannot track such dead loops during route installation,
67    it is infeasible task. The most general solutions would be
68    to keep skb->encapsulation counter (sort of local ttl),
69    and silently drop packet when it expires. It is a good
70    solution, but it supposes maintaining new variable in ALL
71    skb, even if no tunneling is used.
72 
73    Current solution: xmit_recursion breaks dead loops. This is a percpu
74    counter, since when we enter the first ndo_xmit(), cpu migration is
75    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
76 
77    2. Networking dead loops would not kill routers, but would really
78    kill network. IP hop limit plays role of "t->recursion" in this case,
79    if we copy it from packet being encapsulated to upper header.
80    It is very good solution, but it introduces two problems:
81 
82    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
83      do not work over tunnels.
84    - traceroute does not work. I planned to relay ICMP from tunnel,
85      so that this problem would be solved and traceroute output
86      would even more informative. This idea appeared to be wrong:
87      only Linux complies to rfc1812 now (yes, guys, Linux is the only
88      true router now :-)), all routers (at least, in neighbourhood of mine)
89      return only 8 bytes of payload. It is the end.
90 
91    Hence, if we want that OSPF worked or traceroute said something reasonable,
92    we should search for another solution.
93 
   One of them is to parse packet trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, ttl is not a solution at all.
97 
98    Current solution: The solution was UNEXPECTEDLY SIMPLE.
99    We force DF flag on tunnels with preconfigured hop limit,
100    that is ALL. :-) Well, it does not remove the problem completely,
101    but exponential growth of network traffic is changed to linear
102    (branches, that exceed pmtu are pruned) and tunnel mtu
103    rapidly degrades to value <68, where looping stops.
104    Yes, it is not good if there exists a router in the loop,
105    which does not force DF, even when encapsulating packets have DF set.
106    But it is not our problem! Nobody could accuse us, we made
107    all that we could make. Even if it is your gated who injected
108    fatal route to network, even if it were you who configured
109    fatal static route: you are innocent. :-)
110 
111    Alexey Kuznetsov.
112  */
113 
/* Module parameter (mode 0644); passed to ip_tunnel_rcv() on receive. */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

/* Forward declarations: both are defined further down in this file. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);

/*
 * Ids used with net_generic() to find the per-namespace tunnel state of
 * the "gre" and "gretap" devices respectively.
 */
static int ipgre_net_id __read_mostly;
static int gre_tap_net_id __read_mostly;
123 
124 static __sum16 check_checksum(struct sk_buff *skb)
125 {
126 	__sum16 csum = 0;
127 
128 	switch (skb->ip_summed) {
129 	case CHECKSUM_COMPLETE:
130 		csum = csum_fold(skb->csum);
131 
132 		if (!csum)
133 			break;
134 		/* Fall through. */
135 
136 	case CHECKSUM_NONE:
137 		skb->csum = 0;
138 		csum = __skb_checksum_complete(skb);
139 		skb->ip_summed = CHECKSUM_COMPLETE;
140 		break;
141 	}
142 
143 	return csum;
144 }
145 
146 static int ip_gre_calc_hlen(__be16 o_flags)
147 {
148 	int addend = 4;
149 
150 	if (o_flags&TUNNEL_CSUM)
151 		addend += 4;
152 	if (o_flags&TUNNEL_KEY)
153 		addend += 4;
154 	if (o_flags&TUNNEL_SEQ)
155 		addend += 4;
156 	return addend;
157 }
158 
159 static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
160 			    bool *csum_err, int *hdr_len)
161 {
162 	struct iphdr *iph = ip_hdr(skb);
163 	struct gre_base_hdr *greh;
164 	__be32 *options;
165 
166 	if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
167 		return -EINVAL;
168 
169 	greh = (struct gre_base_hdr *)((u8 *)iph + (iph->ihl << 2));
170 	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
171 		return -EINVAL;
172 
173 	tpi->flags = gre_flags_to_tnl_flags(greh->flags);
174 	*hdr_len = ip_gre_calc_hlen(tpi->flags);
175 
176 	if (!pskb_may_pull(skb, *hdr_len))
177 		return -EINVAL;
178 
179 	tpi->proto = greh->protocol;
180 
181 	options = (__be32 *)(greh + 1);
182 	if (greh->flags & GRE_CSUM) {
183 		if (check_checksum(skb)) {
184 			*csum_err = true;
185 			return -EINVAL;
186 		}
187 		options++;
188 	}
189 
190 	if (greh->flags & GRE_KEY) {
191 		tpi->key = *options;
192 		options++;
193 	} else
194 		tpi->key = 0;
195 
196 	if (unlikely(greh->flags & GRE_SEQ)) {
197 		tpi->seq = *options;
198 		options++;
199 	} else
200 		tpi->seq = 0;
201 
202 	/* WCCP version 1 and 2 protocol decoding.
203 	 * - Change protocol to IP
204 	 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
205 	 */
206 	if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
207 		tpi->proto = htons(ETH_P_IP);
208 		if ((*(u8 *)options & 0xF0) != 0x40) {
209 			*hdr_len += 4;
210 			if (!pskb_may_pull(skb, *hdr_len))
211 				return -EINVAL;
212 		}
213 	}
214 
215 	return 0;
216 }
217 
218 static void ipgre_err(struct sk_buff *skb, u32 info)
219 {
220 
221 	/* All the routers (except for Linux) return only
222 	   8 bytes of packet payload. It means, that precise relaying of
223 	   ICMP in the real Internet is absolutely infeasible.
224 
225 	   Moreover, Cisco "wise men" put GRE key to the third word
226 	   in GRE header. It makes impossible maintaining even soft
227 	   state for keyed GRE tunnels with enabled checksum. Tell
228 	   them "thank you".
229 
230 	   Well, I wonder, rfc1812 was written by Cisco employee,
231 	   what the hell these idiots break standards established
232 	   by themselves???
233 	   */
234 	struct net *net = dev_net(skb->dev);
235 	struct ip_tunnel_net *itn;
236 	const struct iphdr *iph = (const struct iphdr *)skb->data;
237 	const int type = icmp_hdr(skb)->type;
238 	const int code = icmp_hdr(skb)->code;
239 	struct ip_tunnel *t;
240 	struct tnl_ptk_info tpi;
241 	int hdr_len;
242 	bool csum_err = false;
243 
244 	if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len)) {
245 		if (!csum_err)          /* ignore csum errors. */
246 			return;
247 	}
248 
249 	switch (type) {
250 	default:
251 	case ICMP_PARAMETERPROB:
252 		return;
253 
254 	case ICMP_DEST_UNREACH:
255 		switch (code) {
256 		case ICMP_SR_FAILED:
257 		case ICMP_PORT_UNREACH:
258 			/* Impossible event. */
259 			return;
260 		default:
261 			/* All others are translated to HOST_UNREACH.
262 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
263 			   I believe they are just ether pollution. --ANK
264 			 */
265 			break;
266 		}
267 		break;
268 	case ICMP_TIME_EXCEEDED:
269 		if (code != ICMP_EXC_TTL)
270 			return;
271 		break;
272 
273 	case ICMP_REDIRECT:
274 		break;
275 	}
276 
277 	if (tpi.proto == htons(ETH_P_TEB))
278 		itn = net_generic(net, gre_tap_net_id);
279 	else
280 		itn = net_generic(net, ipgre_net_id);
281 
282 	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
283 			     iph->daddr, iph->saddr, tpi.key);
284 
285 	if (t == NULL)
286 		return;
287 
288 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
289 		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
290 				 t->parms.link, 0, IPPROTO_GRE, 0);
291 		return;
292 	}
293 	if (type == ICMP_REDIRECT) {
294 		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
295 			      IPPROTO_GRE, 0);
296 		return;
297 	}
298 	if (t->parms.iph.daddr == 0 ||
299 	    ipv4_is_multicast(t->parms.iph.daddr))
300 		return;
301 
302 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
303 		return;
304 
305 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
306 		t->err_count++;
307 	else
308 		t->err_count = 1;
309 	t->err_time = jiffies;
310 }
311 
312 static int ipgre_rcv(struct sk_buff *skb)
313 {
314 	struct net *net = dev_net(skb->dev);
315 	struct ip_tunnel_net *itn;
316 	const struct iphdr *iph;
317 	struct ip_tunnel *tunnel;
318 	struct tnl_ptk_info tpi;
319 	int hdr_len;
320 	bool csum_err = false;
321 
322 	if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len) < 0)
323 		goto drop;
324 
325 	if (tpi.proto == htons(ETH_P_TEB))
326 		itn = net_generic(net, gre_tap_net_id);
327 	else
328 		itn = net_generic(net, ipgre_net_id);
329 
330 	iph = ip_hdr(skb);
331 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
332 				  iph->saddr, iph->daddr, tpi.key);
333 
334 	if (tunnel) {
335 		ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
336 		return 0;
337 	}
338 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
339 drop:
340 	kfree_skb(skb);
341 	return 0;
342 }
343 
344 static struct sk_buff *handle_offloads(struct ip_tunnel *tunnel, struct sk_buff *skb)
345 {
346 	int err;
347 
348 	if (skb_is_gso(skb)) {
349 		err = skb_unclone(skb, GFP_ATOMIC);
350 		if (unlikely(err))
351 			goto error;
352 		skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
353 		return skb;
354 	} else if (skb->ip_summed == CHECKSUM_PARTIAL &&
355 		   tunnel->parms.o_flags&TUNNEL_CSUM) {
356 		err = skb_checksum_help(skb);
357 		if (unlikely(err))
358 			goto error;
359 	} else if (skb->ip_summed != CHECKSUM_PARTIAL)
360 		skb->ip_summed = CHECKSUM_NONE;
361 
362 	return skb;
363 
364 error:
365 	kfree_skb(skb);
366 	return ERR_PTR(err);
367 }
368 
369 static struct sk_buff *gre_build_header(struct sk_buff *skb,
370 					const struct tnl_ptk_info *tpi,
371 					int hdr_len)
372 {
373 	struct gre_base_hdr *greh;
374 
375 	skb_push(skb, hdr_len);
376 
377 	greh = (struct gre_base_hdr *)skb->data;
378 	greh->flags = tnl_flags_to_gre_flags(tpi->flags);
379 	greh->protocol = tpi->proto;
380 
381 	if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
382 		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
383 
384 		if (tpi->flags&TUNNEL_SEQ) {
385 			*ptr = tpi->seq;
386 			ptr--;
387 		}
388 		if (tpi->flags&TUNNEL_KEY) {
389 			*ptr = tpi->key;
390 			ptr--;
391 		}
392 		if (tpi->flags&TUNNEL_CSUM &&
393 		    !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) {
394 			*(__sum16 *)ptr = 0;
395 			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
396 								 skb->len, 0));
397 		}
398 	}
399 
400 	return skb;
401 }
402 
403 static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
404 		       const struct iphdr *tnl_params,
405 		       __be16 proto)
406 {
407 	struct ip_tunnel *tunnel = netdev_priv(dev);
408 	struct tnl_ptk_info tpi;
409 
410 	if (likely(!skb->encapsulation)) {
411 		skb_reset_inner_headers(skb);
412 		skb->encapsulation = 1;
413 	}
414 
415 	tpi.flags = tunnel->parms.o_flags;
416 	tpi.proto = proto;
417 	tpi.key = tunnel->parms.o_key;
418 	if (tunnel->parms.o_flags & TUNNEL_SEQ)
419 		tunnel->o_seqno++;
420 	tpi.seq = htonl(tunnel->o_seqno);
421 
422 	/* Push GRE header. */
423 	skb = gre_build_header(skb, &tpi, tunnel->hlen);
424 	if (unlikely(!skb)) {
425 		dev->stats.tx_dropped++;
426 		return;
427 	}
428 
429 	ip_tunnel_xmit(skb, dev, tnl_params);
430 }
431 
432 static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
433 			      struct net_device *dev)
434 {
435 	struct ip_tunnel *tunnel = netdev_priv(dev);
436 	const struct iphdr *tnl_params;
437 
438 	skb = handle_offloads(tunnel, skb);
439 	if (IS_ERR(skb))
440 		goto out;
441 
442 	if (dev->header_ops) {
443 		/* Need space for new headers */
444 		if (skb_cow_head(skb, dev->needed_headroom -
445 				      (tunnel->hlen + sizeof(struct iphdr))));
446 			goto free_skb;
447 
448 		tnl_params = (const struct iphdr *)skb->data;
449 
450 		/* Pull skb since ip_tunnel_xmit() needs skb->data pointing
451 		 * to gre header.
452 		 */
453 		skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
454 	} else {
455 		if (skb_cow_head(skb, dev->needed_headroom))
456 			goto free_skb;
457 
458 		tnl_params = &tunnel->parms.iph;
459 	}
460 
461 	__gre_xmit(skb, dev, tnl_params, skb->protocol);
462 
463 	return NETDEV_TX_OK;
464 
465 free_skb:
466 	dev_kfree_skb(skb);
467 out:
468 	dev->stats.tx_dropped++;
469 	return NETDEV_TX_OK;
470 }
471 
472 static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
473 				struct net_device *dev)
474 {
475 	struct ip_tunnel *tunnel = netdev_priv(dev);
476 
477 	skb = handle_offloads(tunnel, skb);
478 	if (IS_ERR(skb))
479 		goto out;
480 
481 	if (skb_cow_head(skb, dev->needed_headroom))
482 		goto free_skb;
483 
484 	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
485 
486 	return NETDEV_TX_OK;
487 
488 free_skb:
489 	dev_kfree_skb(skb);
490 out:
491 	dev->stats.tx_dropped++;
492 	return NETDEV_TX_OK;
493 }
494 
495 static int ipgre_tunnel_ioctl(struct net_device *dev,
496 			      struct ifreq *ifr, int cmd)
497 {
498 	int err = 0;
499 	struct ip_tunnel_parm p;
500 
501 	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
502 		return -EFAULT;
503 	if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
504 	    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
505 	    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) {
506 		return -EINVAL;
507 	}
508 	p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
509 	p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
510 
511 	err = ip_tunnel_ioctl(dev, &p, cmd);
512 	if (err)
513 		return err;
514 
515 	p.i_flags = tnl_flags_to_gre_flags(p.i_flags);
516 	p.o_flags = tnl_flags_to_gre_flags(p.o_flags);
517 
518 	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
519 		return -EFAULT;
520 	return 0;
521 }
522 
523 /* Nice toy. Unfortunately, useless in real life :-)
524    It allows to construct virtual multiprotocol broadcast "LAN"
525    over the Internet, provided multicast routing is tuned.
526 
527 
528    I have no idea was this bicycle invented before me,
529    so that I had to set ARPHRD_IPGRE to a random value.
530    I have an impression, that Cisco could make something similar,
531    but this feature is apparently missing in IOS<=11.2(8).
532 
533    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
534    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
535 
536    ping -t 255 224.66.66.66
537 
538    If nobody answers, mbone does not work.
539 
540    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
541    ip addr add 10.66.66.<somewhat>/24 dev Universe
542    ifconfig Universe up
543    ifconfig Universe add fe80::<Your_real_addr>/10
544    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
545    ftp 10.66.66.66
546    ...
547    ftp fec0:6666:6666::193.233.7.65
548    ...
549  */
550 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
551 			unsigned short type,
552 			const void *daddr, const void *saddr, unsigned int len)
553 {
554 	struct ip_tunnel *t = netdev_priv(dev);
555 	struct iphdr *iph;
556 	struct gre_base_hdr *greh;
557 
558 	iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
559 	greh = (struct gre_base_hdr *)(iph+1);
560 	greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags);
561 	greh->protocol = htons(type);
562 
563 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
564 
565 	/* Set the source hardware address. */
566 	if (saddr)
567 		memcpy(&iph->saddr, saddr, 4);
568 	if (daddr)
569 		memcpy(&iph->daddr, daddr, 4);
570 	if (iph->daddr)
571 		return t->hlen;
572 
573 	return -(t->hlen + sizeof(*iph));
574 }
575 
576 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
577 {
578 	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
579 	memcpy(haddr, &iph->saddr, 4);
580 	return 4;
581 }
582 
/*
 * Header operations installed by ipgre_tunnel_init() on "gre" devices
 * that have no fixed destination or (with CONFIG_NET_IPGRE_BROADCAST) a
 * multicast destination.
 */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
587 
588 #ifdef CONFIG_NET_IPGRE_BROADCAST
589 static int ipgre_open(struct net_device *dev)
590 {
591 	struct ip_tunnel *t = netdev_priv(dev);
592 
593 	if (ipv4_is_multicast(t->parms.iph.daddr)) {
594 		struct flowi4 fl4;
595 		struct rtable *rt;
596 
597 		rt = ip_route_output_gre(dev_net(dev), &fl4,
598 					 t->parms.iph.daddr,
599 					 t->parms.iph.saddr,
600 					 t->parms.o_key,
601 					 RT_TOS(t->parms.iph.tos),
602 					 t->parms.link);
603 		if (IS_ERR(rt))
604 			return -EADDRNOTAVAIL;
605 		dev = rt->dst.dev;
606 		ip_rt_put(rt);
607 		if (__in_dev_get_rtnl(dev) == NULL)
608 			return -EADDRNOTAVAIL;
609 		t->mlink = dev->ifindex;
610 		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
611 	}
612 	return 0;
613 }
614 
615 static int ipgre_close(struct net_device *dev)
616 {
617 	struct ip_tunnel *t = netdev_priv(dev);
618 
619 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
620 		struct in_device *in_dev;
621 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
622 		if (in_dev)
623 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
624 	}
625 	return 0;
626 }
627 #endif
628 
/*
 * net_device operations of "gre" devices (installed by
 * ipgre_tunnel_setup()).  Open/stop handlers only exist when
 * CONFIG_NET_IPGRE_BROADCAST is enabled.
 */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
};
641 
/*
 * Feature bits that __gre_tunnel_init() ORs into both dev->features and
 * dev->hw_features.
 */
#define GRE_FEATURES (NETIF_F_SG |		\
		      NETIF_F_FRAGLIST |	\
		      NETIF_F_HIGHDMA |		\
		      NETIF_F_HW_CSUM)
646 
647 static void ipgre_tunnel_setup(struct net_device *dev)
648 {
649 	dev->netdev_ops		= &ipgre_netdev_ops;
650 	ip_tunnel_setup(dev, ipgre_net_id);
651 }
652 
653 static void __gre_tunnel_init(struct net_device *dev)
654 {
655 	struct ip_tunnel *tunnel;
656 
657 	tunnel = netdev_priv(dev);
658 	tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
659 	tunnel->parms.iph.protocol = IPPROTO_GRE;
660 
661 	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
662 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
663 	dev->iflink		= 0;
664 
665 	dev->features		|= NETIF_F_NETNS_LOCAL | GRE_FEATURES;
666 	dev->hw_features	|= GRE_FEATURES;
667 
668 	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
669 		/* TCP offload with GRE SEQ is not supported. */
670 		dev->features    |= NETIF_F_GSO_SOFTWARE;
671 		dev->hw_features |= NETIF_F_GSO_SOFTWARE;
672 		/* Can use a lockless transmit, unless we generate
673 		 * output sequences
674 		 */
675 		dev->features |= NETIF_F_LLTX;
676 	}
677 }
678 
679 static int ipgre_tunnel_init(struct net_device *dev)
680 {
681 	struct ip_tunnel *tunnel = netdev_priv(dev);
682 	struct iphdr *iph = &tunnel->parms.iph;
683 
684 	__gre_tunnel_init(dev);
685 
686 	memcpy(dev->dev_addr, &iph->saddr, 4);
687 	memcpy(dev->broadcast, &iph->daddr, 4);
688 
689 	dev->type		= ARPHRD_IPGRE;
690 	dev->flags		= IFF_NOARP;
691 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
692 	dev->addr_len		= 4;
693 
694 	if (iph->daddr) {
695 #ifdef CONFIG_NET_IPGRE_BROADCAST
696 		if (ipv4_is_multicast(iph->daddr)) {
697 			if (!iph->saddr)
698 				return -EINVAL;
699 			dev->flags = IFF_BROADCAST;
700 			dev->header_ops = &ipgre_header_ops;
701 		}
702 #endif
703 	} else
704 		dev->header_ops = &ipgre_header_ops;
705 
706 	return ip_tunnel_init(dev);
707 }
708 
/*
 * Receive and ICMP-error handlers; ipgre_init() registers them with
 * gre_add_protocol() under GREPROTO_CISCO.
 */
static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};
713 
714 static int __net_init ipgre_init_net(struct net *net)
715 {
716 	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
717 }
718 
719 static void __net_exit ipgre_exit_net(struct net *net)
720 {
721 	struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
722 	ip_tunnel_delete_net(itn);
723 }
724 
/*
 * Per-namespace operations for "gre"; registered by ipgre_init() with
 * register_pernet_device().
 */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ip_tunnel_net),
};
731 
732 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
733 {
734 	__be16 flags;
735 
736 	if (!data)
737 		return 0;
738 
739 	flags = 0;
740 	if (data[IFLA_GRE_IFLAGS])
741 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
742 	if (data[IFLA_GRE_OFLAGS])
743 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
744 	if (flags & (GRE_VERSION|GRE_ROUTING))
745 		return -EINVAL;
746 
747 	return 0;
748 }
749 
750 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
751 {
752 	__be32 daddr;
753 
754 	if (tb[IFLA_ADDRESS]) {
755 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
756 			return -EINVAL;
757 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
758 			return -EADDRNOTAVAIL;
759 	}
760 
761 	if (!data)
762 		goto out;
763 
764 	if (data[IFLA_GRE_REMOTE]) {
765 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
766 		if (!daddr)
767 			return -EINVAL;
768 	}
769 
770 out:
771 	return ipgre_tunnel_validate(tb, data);
772 }
773 
774 static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
775 			       struct ip_tunnel_parm *parms)
776 {
777 	memset(parms, 0, sizeof(*parms));
778 
779 	parms->iph.protocol = IPPROTO_GRE;
780 
781 	if (!data)
782 		return;
783 
784 	if (data[IFLA_GRE_LINK])
785 		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
786 
787 	if (data[IFLA_GRE_IFLAGS])
788 		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
789 
790 	if (data[IFLA_GRE_OFLAGS])
791 		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
792 
793 	if (data[IFLA_GRE_IKEY])
794 		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
795 
796 	if (data[IFLA_GRE_OKEY])
797 		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
798 
799 	if (data[IFLA_GRE_LOCAL])
800 		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
801 
802 	if (data[IFLA_GRE_REMOTE])
803 		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
804 
805 	if (data[IFLA_GRE_TTL])
806 		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
807 
808 	if (data[IFLA_GRE_TOS])
809 		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
810 
811 	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
812 		parms->iph.frag_off = htons(IP_DF);
813 }
814 
/*
 * ndo_init for "gretap" devices: common GRE setup followed by the
 * generic tunnel init, whose result is returned.
 */
static int gre_tap_init(struct net_device *dev)
{
	int rc;

	__gre_tunnel_init(dev);
	rc = ip_tunnel_init(dev);

	return rc;
}
821 
/*
 * net_device operations of "gretap" devices (installed by
 * ipgre_tap_setup()); MAC address handling uses the generic Ethernet
 * helpers.
 */
static const struct net_device_ops gre_tap_netdev_ops = {
	.ndo_init		= gre_tap_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= gre_tap_xmit,
	.ndo_set_mac_address 	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
};
831 
832 static void ipgre_tap_setup(struct net_device *dev)
833 {
834 	ether_setup(dev);
835 	dev->netdev_ops		= &gre_tap_netdev_ops;
836 	ip_tunnel_setup(dev, gre_tap_net_id);
837 }
838 
839 static int ipgre_newlink(struct net *src_net, struct net_device *dev,
840 			 struct nlattr *tb[], struct nlattr *data[])
841 {
842 	struct ip_tunnel_parm p;
843 
844 	ipgre_netlink_parms(data, tb, &p);
845 	return ip_tunnel_newlink(dev, tb, &p);
846 }
847 
848 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
849 			    struct nlattr *data[])
850 {
851 	struct ip_tunnel_parm p;
852 
853 	ipgre_netlink_parms(data, tb, &p);
854 	return ip_tunnel_changelink(dev, tb, &p);
855 }
856 
857 static size_t ipgre_get_size(const struct net_device *dev)
858 {
859 	return
860 		/* IFLA_GRE_LINK */
861 		nla_total_size(4) +
862 		/* IFLA_GRE_IFLAGS */
863 		nla_total_size(2) +
864 		/* IFLA_GRE_OFLAGS */
865 		nla_total_size(2) +
866 		/* IFLA_GRE_IKEY */
867 		nla_total_size(4) +
868 		/* IFLA_GRE_OKEY */
869 		nla_total_size(4) +
870 		/* IFLA_GRE_LOCAL */
871 		nla_total_size(4) +
872 		/* IFLA_GRE_REMOTE */
873 		nla_total_size(4) +
874 		/* IFLA_GRE_TTL */
875 		nla_total_size(1) +
876 		/* IFLA_GRE_TOS */
877 		nla_total_size(1) +
878 		/* IFLA_GRE_PMTUDISC */
879 		nla_total_size(1) +
880 		0;
881 }
882 
883 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
884 {
885 	struct ip_tunnel *t = netdev_priv(dev);
886 	struct ip_tunnel_parm *p = &t->parms;
887 
888 	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
889 	    nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) ||
890 	    nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
891 	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
892 	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
893 	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
894 	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
895 	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
896 	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
897 	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
898 		       !!(p->iph.frag_off & htons(IP_DF))))
899 		goto nla_put_failure;
900 	return 0;
901 
902 nla_put_failure:
903 	return -EMSGSIZE;
904 }
905 
/*
 * Netlink attribute policy shared by the "gre" and "gretap" link ops.
 * LOCAL and REMOTE are specified by length (size of an IPv4 address
 * field) rather than by type.
 */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
918 
/* rtnetlink link operations for the "gre" link kind. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
932 
/*
 * rtnetlink link operations for the "gretap" link kind.  Differs from
 * ipgre_link_ops only in .kind, .setup and .validate.
 */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
946 
947 static int __net_init ipgre_tap_init_net(struct net *net)
948 {
949 	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL);
950 }
951 
952 static void __net_exit ipgre_tap_exit_net(struct net *net)
953 {
954 	struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
955 	ip_tunnel_delete_net(itn);
956 }
957 
/*
 * Per-namespace operations for "gretap"; registered by ipgre_init() with
 * register_pernet_device().
 */
static struct pernet_operations ipgre_tap_net_ops = {
	.init = ipgre_tap_init_net,
	.exit = ipgre_tap_exit_net,
	.id   = &gre_tap_net_id,
	.size = sizeof(struct ip_tunnel_net),
};
964 
965 static int __init ipgre_init(void)
966 {
967 	int err;
968 
969 	pr_info("GRE over IPv4 tunneling driver\n");
970 
971 	err = register_pernet_device(&ipgre_net_ops);
972 	if (err < 0)
973 		return err;
974 
975 	err = register_pernet_device(&ipgre_tap_net_ops);
976 	if (err < 0)
977 		goto pnet_tap_faied;
978 
979 	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
980 	if (err < 0) {
981 		pr_info("%s: can't add protocol\n", __func__);
982 		goto add_proto_failed;
983 	}
984 
985 	err = rtnl_link_register(&ipgre_link_ops);
986 	if (err < 0)
987 		goto rtnl_link_failed;
988 
989 	err = rtnl_link_register(&ipgre_tap_ops);
990 	if (err < 0)
991 		goto tap_ops_failed;
992 
993 	return 0;
994 
995 tap_ops_failed:
996 	rtnl_link_unregister(&ipgre_link_ops);
997 rtnl_link_failed:
998 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
999 add_proto_failed:
1000 	unregister_pernet_device(&ipgre_tap_net_ops);
1001 pnet_tap_faied:
1002 	unregister_pernet_device(&ipgre_net_ops);
1003 	return err;
1004 }
1005 
1006 static void __exit ipgre_fini(void)
1007 {
1008 	rtnl_link_unregister(&ipgre_tap_ops);
1009 	rtnl_link_unregister(&ipgre_link_ops);
1010 	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1011 		pr_info("%s: can't remove protocol\n", __func__);
1012 	unregister_pernet_device(&ipgre_tap_net_ops);
1013 	unregister_pernet_device(&ipgre_net_ops);
1014 }
1015 
/* Module entry/exit points, license, and the aliases declared for this module. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
1023