xref: /linux/net/ipv4/ip_gre.c (revision b2d0f5d5dc53532e6f07bc546a476a55ebdfe0f3)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <linux/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/if_vlan.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
35 
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ip_tunnels.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
50 #include <net/dst_metadata.h>
51 #include <net/erspan.h>
52 
53 /*
54    Problems & solutions
55    --------------------
56 
57    1. The most important issue is detecting local dead loops.
58    They would cause a complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    by infinite looping in net_bh.
61 
62    We cannot track such dead loops during route installation;
63    it is an infeasible task. The most general solution would be
64    to keep an skb->encapsulation counter (a sort of local ttl)
65    and silently drop the packet when it expires. It is a good
66    solution, but it supposes maintaining a new variable in ALL
67    skbs, even if no tunneling is used.
68 
69    Current solution: xmit_recursion breaks dead loops. This is a percpu
70    counter, since when we enter the first ndo_xmit(), cpu migration is
71    forbidden. We force an exit if this counter reaches RECURSION_LIMIT.
72 
73    2. Networking dead loops would not kill routers, but would really
74    kill the network. The IP hop limit plays the role of "t->recursion" in
75    this case, if we copy it from the packet being encapsulated to the
76    upper header. It is a very good solution, but it introduces two problems:
77 
78    - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
79      do not work over tunnels.
80    - traceroute does not work. I planned to relay ICMP from the tunnel,
81      so that this problem would be solved and traceroute output
82      would be even more informative. This idea turned out to be wrong:
83      only Linux complies with rfc1812 now (yes, guys, Linux is the only
84      true router now :-)); all routers (at least in my neighbourhood)
85      return only 8 bytes of payload. It is the end.
86 
87    Hence, if we want OSPF to work or traceroute to say something reasonable,
88    we should search for another solution.
89 
90    One of them is to parse the packet, trying to detect inner encapsulation
91    made by our node. It is difficult or even impossible, especially
92    taking fragmentation into account. To be short, ttl is not a solution at all.
93 
94    Current solution: it was UNEXPECTEDLY SIMPLE.
95    We force the DF flag on tunnels with a preconfigured hop limit,
96    that is ALL. :-) Well, it does not remove the problem completely,
97    but the exponential growth of network traffic is changed to linear
98    (branches that exceed the pmtu are pruned) and the tunnel mtu
99    rapidly degrades to a value < 68, where looping stops.
100    Yes, it is not good if there exists a router in the loop
101    which does not force DF, even when encapsulating packets have DF set.
102    But it is not our problem! Nobody could accuse us; we did
103    all that we could. Even if it was your gated that injected the
104    fatal route into the network, even if it was you who configured the
105    fatal static route: you are innocent. :-)
106 
107    Alexey Kuznetsov.
108  */
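
/* Editor's note, not part of the original text above: a minimal sketch of
   why forcing DF makes looped traffic die out. Every trip around a
   GRE-over-IP loop costs at least one outer IPv4 header (20 bytes) plus
   the GRE base header (4 bytes), so the usable mtu shrinks linearly with
   the nesting level:

	static int nested_mtu(int mtu, int levels)
	{
		while (levels-- > 0)
			mtu -= sizeof(struct iphdr) +
			       sizeof(struct gre_base_hdr);
		return mtu;
	}

   Starting from 1500, (1500 - 68) / 24 is about 59, so a loop collapses
   after a few dozen iterations instead of amplifying without bound.
 */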
109 
110 static bool log_ecn_error = true;
111 module_param(log_ecn_error, bool, 0644);
112 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
113 
114 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
115 static int ipgre_tunnel_init(struct net_device *dev);
116 static void erspan_build_header(struct sk_buff *skb,
117 				__be32 id, u32 index, bool truncate);
118 
119 static unsigned int ipgre_net_id __read_mostly;
120 static unsigned int gre_tap_net_id __read_mostly;
121 static unsigned int erspan_net_id __read_mostly;
122 
123 static void ipgre_err(struct sk_buff *skb, u32 info,
124 		      const struct tnl_ptk_info *tpi)
125 {
126 
127 	/* All the routers (except for Linux) return only
128 	   8 bytes of packet payload. It means that precise relaying of
129 	   ICMP in the real Internet is absolutely infeasible.
130 
131 	   Moreover, Cisco "wise men" put the GRE key in the third word
132 	   of the GRE header. This makes it impossible to maintain even
133 	   soft state for keyed GRE tunnels with checksum enabled. Tell
134 	   them "thank you".
135 
136 	   Well, I wonder: rfc1812 was written by a Cisco employee.
137 	   Why the hell do these idiots break standards established
138 	   by themselves???
139 	   */
140 	struct net *net = dev_net(skb->dev);
141 	struct ip_tunnel_net *itn;
142 	const struct iphdr *iph;
143 	const int type = icmp_hdr(skb)->type;
144 	const int code = icmp_hdr(skb)->code;
145 	unsigned int data_len = 0;
146 	struct ip_tunnel *t;
147 
148 	switch (type) {
149 	default:
150 	case ICMP_PARAMETERPROB:
151 		return;
152 
153 	case ICMP_DEST_UNREACH:
154 		switch (code) {
155 		case ICMP_SR_FAILED:
156 		case ICMP_PORT_UNREACH:
157 			/* Impossible event. */
158 			return;
159 		default:
160 			/* All others are translated to HOST_UNREACH.
161 			   rfc2003 contains "deep thoughts" about NET_UNREACH;
162 			   I believe they are just ether pollution. --ANK
163 			 */
164 			break;
165 		}
166 		break;
167 
168 	case ICMP_TIME_EXCEEDED:
169 		if (code != ICMP_EXC_TTL)
170 			return;
171 		data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
172 		break;
173 
174 	case ICMP_REDIRECT:
175 		break;
176 	}
177 
178 	if (tpi->proto == htons(ETH_P_TEB))
179 		itn = net_generic(net, gre_tap_net_id);
180 	else
181 		itn = net_generic(net, ipgre_net_id);
182 
183 	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
184 	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
185 			     iph->daddr, iph->saddr, tpi->key);
186 
187 	if (!t)
188 		return;
189 
190 #if IS_ENABLED(CONFIG_IPV6)
191 	if (tpi->proto == htons(ETH_P_IPV6) &&
192 	    !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
193 					type, data_len))
194 		return;
195 #endif
196 
197 	if (t->parms.iph.daddr == 0 ||
198 	    ipv4_is_multicast(t->parms.iph.daddr))
199 		return;
200 
201 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
202 		return;
203 
204 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
205 		t->err_count++;
206 	else
207 		t->err_count = 1;
208 	t->err_time = jiffies;
209 }
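
/* Editor's note: the err_count/err_time pair recorded above is consumed on
 * the transmit path (ip_tunnel_xmit() in ip_tunnel.c), which reports
 * dst_link_failure() for errors seen within the IPTUNNEL_ERR_TIMEO window.
 */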
210 
211 static void gre_err(struct sk_buff *skb, u32 info)
212 {
213 	/* See the comment at the top of ipgre_err(): most routers return
214 	 * only 8 bytes of ICMP payload and Cisco devices put the GRE key
215 	 * in the third word of the header, so precise ICMP relaying for
216 	 * keyed, checksummed GRE tunnels is infeasible.
217 	 */
226 
227 	const struct iphdr *iph = (struct iphdr *)skb->data;
228 	const int type = icmp_hdr(skb)->type;
229 	const int code = icmp_hdr(skb)->code;
230 	struct tnl_ptk_info tpi;
231 	bool csum_err = false;
232 
233 	if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
234 			     iph->ihl * 4) < 0) {
235 		if (!csum_err)		/* bail out on anything but a csum error */
236 			return;
237 	}
238 
239 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
240 		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
241 				 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
242 		return;
243 	}
244 	if (type == ICMP_REDIRECT) {
245 		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
246 			      IPPROTO_GRE, 0);
247 		return;
248 	}
249 
250 	ipgre_err(skb, info, &tpi);
251 }
252 
253 static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
254 		      int gre_hdr_len)
255 {
256 	struct net *net = dev_net(skb->dev);
257 	struct metadata_dst *tun_dst = NULL;
258 	struct ip_tunnel_net *itn;
259 	struct ip_tunnel *tunnel;
260 	struct erspanhdr *ershdr;
261 	const struct iphdr *iph;
262 	__be32 index;
263 	int len;
264 
265 	itn = net_generic(net, erspan_net_id);
266 	len = gre_hdr_len + sizeof(*ershdr);
267 
268 	if (unlikely(!pskb_may_pull(skb, len)))
269 		return -ENOMEM;
270 
271 	iph = ip_hdr(skb);
272 	ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);
273 
274 	/* The original GRE header does not have a key field,
275 	 * so use the ERSPAN 10-bit session ID as the key.
276 	 */
277 	tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
278 	index = ershdr->md.index;
279 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
280 				  tpi->flags | TUNNEL_KEY,
281 				  iph->saddr, iph->daddr, tpi->key);
282 
283 	if (tunnel) {
284 		if (__iptunnel_pull_header(skb,
285 					   gre_hdr_len + sizeof(*ershdr),
286 					   htons(ETH_P_TEB),
287 					   false, false) < 0)
288 			goto drop;
289 
290 		if (tunnel->collect_md) {
291 			struct ip_tunnel_info *info;
292 			struct erspan_metadata *md;
293 			__be64 tun_id;
294 			__be16 flags;
295 
296 			tpi->flags |= TUNNEL_KEY;
297 			flags = tpi->flags;
298 			tun_id = key32_to_tunnel_id(tpi->key);
299 
300 			tun_dst = ip_tun_rx_dst(skb, flags,
301 						tun_id, sizeof(*md));
302 			if (!tun_dst)
303 				return PACKET_REJECT;
304 
305 			md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
306 			if (!md)
307 				return PACKET_REJECT;
308 
309 			md->index = index;
310 			info = &tun_dst->u.tun_info;
311 			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
312 			info->options_len = sizeof(*md);
313 		} else {
314 			tunnel->index = ntohl(index);
315 		}
316 
317 		skb_reset_mac_header(skb);
318 		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
319 		return PACKET_RCVD;
320 	}
321 drop:
322 	kfree_skb(skb);
323 	return PACKET_RCVD;
324 }
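
/* Editor's note: the drop path above also returns PACKET_RCVD (the skb has
 * already been freed), so gre_rcv() neither falls through to ipgre_rcv()
 * nor sends an ICMP port-unreachable for malformed ERSPAN frames.
 */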
325 
326 static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
327 		       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
328 {
329 	struct metadata_dst *tun_dst = NULL;
330 	const struct iphdr *iph;
331 	struct ip_tunnel *tunnel;
332 
333 	iph = ip_hdr(skb);
334 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
335 				  iph->saddr, iph->daddr, tpi->key);
336 
337 	if (tunnel) {
338 		if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
339 					   raw_proto, false) < 0)
340 			goto drop;
341 
342 		if (tunnel->dev->type != ARPHRD_NONE)
343 			skb_pop_mac_header(skb);
344 		else
345 			skb_reset_mac_header(skb);
346 		if (tunnel->collect_md) {
347 			__be16 flags;
348 			__be64 tun_id;
349 
350 			flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
351 			tun_id = key32_to_tunnel_id(tpi->key);
352 			tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
353 			if (!tun_dst)
354 				return PACKET_REJECT;
355 		}
356 
357 		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
358 		return PACKET_RCVD;
359 	}
360 	return PACKET_NEXT;
361 
362 drop:
363 	kfree_skb(skb);
364 	return PACKET_RCVD;
365 }
366 
367 static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
368 		     int hdr_len)
369 {
370 	struct net *net = dev_net(skb->dev);
371 	struct ip_tunnel_net *itn;
372 	int res;
373 
374 	if (tpi->proto == htons(ETH_P_TEB))
375 		itn = net_generic(net, gre_tap_net_id);
376 	else
377 		itn = net_generic(net, ipgre_net_id);
378 
379 	res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
380 	if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
381 		/* ipgre tunnels in collect-metadata mode should also
382 		 * receive ETH_P_TEB traffic.
383 		 */
384 		itn = net_generic(net, ipgre_net_id);
385 		res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
386 	}
387 	return res;
388 }
389 
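/* gre_rcv() is the receive handler registered for GREPROTO_CISCO below: it
 * parses the GRE header, hands ERSPAN frames to erspan_rcv(), tries
 * ipgre_rcv() for everything else, and answers unclaimed packets with an
 * ICMP port-unreachable.
 */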
390 static int gre_rcv(struct sk_buff *skb)
391 {
392 	struct tnl_ptk_info tpi;
393 	bool csum_err = false;
394 	int hdr_len;
395 
396 #ifdef CONFIG_NET_IPGRE_BROADCAST
397 	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
398 		/* Looped back packet, drop it! */
399 		if (rt_is_output_route(skb_rtable(skb)))
400 			goto drop;
401 	}
402 #endif
403 
404 	hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
405 	if (hdr_len < 0)
406 		goto drop;
407 
408 	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
409 		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
410 			return 0;
411 	}
412 
413 	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
414 		return 0;
415 
416 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
417 drop:
418 	kfree_skb(skb);
419 	return 0;
420 }
421 
422 static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
423 		       const struct iphdr *tnl_params,
424 		       __be16 proto)
425 {
426 	struct ip_tunnel *tunnel = netdev_priv(dev);
427 
428 	if (tunnel->parms.o_flags & TUNNEL_SEQ)
429 		tunnel->o_seqno++;
430 
431 	/* Push GRE header. */
432 	gre_build_header(skb, tunnel->tun_hlen,
433 			 tunnel->parms.o_flags, proto, tunnel->parms.o_key,
434 			 htonl(tunnel->o_seqno));
435 
436 	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
437 }
438 
439 static int gre_handle_offloads(struct sk_buff *skb, bool csum)
440 {
441 	return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
442 }
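
/* Editor's note: the csum argument selects SKB_GSO_GRE_CSUM so that GSO
 * knows an outer GRE checksum must be recomputed for every segment; plain
 * SKB_GSO_GRE is used when the tunnel does not checksum.
 */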
443 
444 static struct rtable *gre_get_rt(struct sk_buff *skb,
445 				 struct net_device *dev,
446 				 struct flowi4 *fl,
447 				 const struct ip_tunnel_key *key)
448 {
449 	struct net *net = dev_net(dev);
450 
451 	memset(fl, 0, sizeof(*fl));
452 	fl->daddr = key->u.ipv4.dst;
453 	fl->saddr = key->u.ipv4.src;
454 	fl->flowi4_tos = RT_TOS(key->tos);
455 	fl->flowi4_mark = skb->mark;
456 	fl->flowi4_proto = IPPROTO_GRE;
457 
458 	return ip_route_output_key(net, fl);
459 }
460 
461 static struct rtable *prepare_fb_xmit(struct sk_buff *skb,
462 				      struct net_device *dev,
463 				      struct flowi4 *fl,
464 				      int tunnel_hlen)
465 {
466 	struct ip_tunnel_info *tun_info;
467 	const struct ip_tunnel_key *key;
468 	struct rtable *rt = NULL;
469 	int min_headroom;
470 	bool use_cache;
471 	int err;
472 
473 	tun_info = skb_tunnel_info(skb);
474 	key = &tun_info->key;
475 	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
476 
477 	if (use_cache)
478 		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr);
479 	if (!rt) {
480 		rt = gre_get_rt(skb, dev, fl, key);
481 		if (IS_ERR(rt))
482 			goto err_free_skb;
483 		if (use_cache)
484 			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
485 					  fl->saddr);
486 	}
487 
488 	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
489 			+ tunnel_hlen + sizeof(struct iphdr);
490 	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
491 		int head_delta = SKB_DATA_ALIGN(min_headroom -
492 						skb_headroom(skb) +
493 						16);
494 		err = pskb_expand_head(skb, max_t(int, head_delta, 0),
495 				       0, GFP_ATOMIC);
496 		if (unlikely(err))
497 			goto err_free_rt;
498 	}
499 	return rt;
500 
501 err_free_rt:
502 	ip_rt_put(rt);
503 err_free_skb:
504 	kfree_skb(skb);
505 	dev->stats.tx_dropped++;
506 	return NULL;
507 }
508 
509 static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
510 			__be16 proto)
511 {
512 	struct ip_tunnel_info *tun_info;
513 	const struct ip_tunnel_key *key;
514 	struct rtable *rt = NULL;
515 	struct flowi4 fl;
516 	int tunnel_hlen;
517 	__be16 df, flags;
518 
519 	tun_info = skb_tunnel_info(skb);
520 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
521 		     ip_tunnel_info_af(tun_info) != AF_INET))
522 		goto err_free_skb;
523 
524 	key = &tun_info->key;
525 	tunnel_hlen = gre_calc_hlen(key->tun_flags);
526 
527 	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
528 	if (!rt)
529 		return;
530 
531 	/* Push Tunnel header. */
532 	if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
533 		goto err_free_rt;
534 
535 	flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
536 	gre_build_header(skb, tunnel_hlen, flags, proto,
537 			 tunnel_id_to_key32(tun_info->key.tun_id), 0);
538 
539 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
540 
541 	iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
542 		      key->tos, key->ttl, df, false);
543 	return;
544 
545 err_free_rt:
546 	ip_rt_put(rt);
547 err_free_skb:
548 	kfree_skb(skb);
549 	dev->stats.tx_dropped++;
550 }
551 
552 static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
553 			   __be16 proto)
554 {
555 	struct ip_tunnel *tunnel = netdev_priv(dev);
556 	struct ip_tunnel_info *tun_info;
557 	const struct ip_tunnel_key *key;
558 	struct erspan_metadata *md;
559 	struct rtable *rt = NULL;
560 	bool truncate = false;
561 	struct flowi4 fl;
562 	int tunnel_hlen;
563 	__be16 df;
564 
565 	tun_info = skb_tunnel_info(skb);
566 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
567 		     ip_tunnel_info_af(tun_info) != AF_INET))
568 		goto err_free_skb;
569 
570 	key = &tun_info->key;
571 
572 	/* ERSPAN has a fixed 8-byte GRE header */
573 	tunnel_hlen = 8 + sizeof(struct erspanhdr);
574 
575 	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
576 	if (!rt)
577 		return;
578 
579 	if (gre_handle_offloads(skb, false))
580 		goto err_free_rt;
581 
582 	if (skb->len > dev->mtu + dev->hard_header_len) {
583 		pskb_trim(skb, dev->mtu + dev->hard_header_len);
584 		truncate = true;
585 	}
586 
587 	md = ip_tunnel_info_opts(tun_info);
588 	if (!md)
589 		goto err_free_rt;
590 
591 	erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
592 			    ntohl(md->index), truncate);
593 
594 	gre_build_header(skb, 8, TUNNEL_SEQ,
595 			 htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));
596 
597 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
598 
599 	iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
600 		      key->tos, key->ttl, df, false);
601 	return;
602 
603 err_free_rt:
604 	ip_rt_put(rt);
605 err_free_skb:
606 	kfree_skb(skb);
607 	dev->stats.tx_dropped++;
608 }
609 
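/* ndo_fill_metadata_dst callback: resolves the route a collect_md skb
 * would take so the (possibly wildcarded) local source address can be
 * reported back to callers such as openvswitch.
 */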
610 static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
611 {
612 	struct ip_tunnel_info *info = skb_tunnel_info(skb);
613 	struct rtable *rt;
614 	struct flowi4 fl4;
615 
616 	if (ip_tunnel_info_af(info) != AF_INET)
617 		return -EINVAL;
618 
619 	rt = gre_get_rt(skb, dev, &fl4, &info->key);
620 	if (IS_ERR(rt))
621 		return PTR_ERR(rt);
622 
623 	ip_rt_put(rt);
624 	info->key.u.ipv4.src = fl4.saddr;
625 	return 0;
626 }
627 
628 static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
629 			      struct net_device *dev)
630 {
631 	struct ip_tunnel *tunnel = netdev_priv(dev);
632 	const struct iphdr *tnl_params;
633 
634 	if (tunnel->collect_md) {
635 		gre_fb_xmit(skb, dev, skb->protocol);
636 		return NETDEV_TX_OK;
637 	}
638 
639 	if (dev->header_ops) {
640 		/* Need space for new headers */
641 		if (skb_cow_head(skb, dev->needed_headroom -
642 				      (tunnel->hlen + sizeof(struct iphdr))))
643 			goto free_skb;
644 
645 		tnl_params = (const struct iphdr *)skb->data;
646 
647 		/* Pull skb since ip_tunnel_xmit() needs skb->data pointing
648 		 * to the GRE header.
649 		 */
650 		skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
651 		skb_reset_mac_header(skb);
652 	} else {
653 		if (skb_cow_head(skb, dev->needed_headroom))
654 			goto free_skb;
655 
656 		tnl_params = &tunnel->parms.iph;
657 	}
658 
659 	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
660 		goto free_skb;
661 
662 	__gre_xmit(skb, dev, tnl_params, skb->protocol);
663 	return NETDEV_TX_OK;
664 
665 free_skb:
666 	kfree_skb(skb);
667 	dev->stats.tx_dropped++;
668 	return NETDEV_TX_OK;
669 }
670 
671 static inline u8 tos_to_cos(u8 tos)
672 {
673 	u8 dscp, cos;
674 
675 	dscp = tos >> 2;
676 	cos = dscp >> 3;
677 	return cos;
678 }
679 
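/* Editor's sketch of the ERSPAN type II header packed below; field widths
 * follow the masks and offsets in <net/erspan.h>:
 *
 *	ver_vlan:   Ver (4 bits) | VLAN (12 bits)
 *	session_id: COS (3) | En (2) | T (1) | Session ID (10)
 *	md.index:   Reserved (12) | Index (20)
 */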
680 static void erspan_build_header(struct sk_buff *skb,
681 				__be32 id, u32 index, bool truncate)
682 {
683 	struct iphdr *iphdr = ip_hdr(skb);
684 	struct ethhdr *eth = eth_hdr(skb);
685 	enum erspan_encap_type enc_type;
686 	struct erspanhdr *ershdr;
687 	struct qtag_prefix {
688 		__be16 eth_type;
689 		__be16 tci;
690 	} *qp;
691 	u16 vlan_tci = 0;
692 
693 	enc_type = ERSPAN_ENCAP_NOVLAN;
694 
695 	/* If the mirrored packet has a vlan tag, extract the tci and
696 	 * preserve the vlan header in the mirrored frame.
697 	 */
698 	if (eth->h_proto == htons(ETH_P_8021Q)) {
699 		qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
700 		vlan_tci = ntohs(qp->tci);
701 		enc_type = ERSPAN_ENCAP_INFRAME;
702 	}
703 
704 	skb_push(skb, sizeof(*ershdr));
705 	ershdr = (struct erspanhdr *)skb->data;
706 	memset(ershdr, 0, sizeof(*ershdr));
707 
708 	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
709 				 (ERSPAN_VERSION << VER_OFFSET));
710 	ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
711 			   ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
712 			   (enc_type << EN_OFFSET & EN_MASK) |
713 			   ((truncate << T_OFFSET) & T_MASK));
714 	ershdr->md.index = htonl(index & INDEX_MASK);
715 }
716 
717 static netdev_tx_t erspan_xmit(struct sk_buff *skb,
718 			       struct net_device *dev)
719 {
720 	struct ip_tunnel *tunnel = netdev_priv(dev);
721 	bool truncate = false;
722 
723 	if (tunnel->collect_md) {
724 		erspan_fb_xmit(skb, dev, skb->protocol);
725 		return NETDEV_TX_OK;
726 	}
727 
728 	if (gre_handle_offloads(skb, false))
729 		goto free_skb;
730 
731 	if (skb_cow_head(skb, dev->needed_headroom))
732 		goto free_skb;
733 
734 	if (skb->len > dev->mtu + dev->hard_header_len) {
735 		pskb_trim(skb, dev->mtu + dev->hard_header_len);
736 		truncate = true;
737 	}
738 
739 	/* Push ERSPAN header */
740 	erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
741 	tunnel->parms.o_flags &= ~TUNNEL_KEY;
742 	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
743 	return NETDEV_TX_OK;
744 
745 free_skb:
746 	kfree_skb(skb);
747 	dev->stats.tx_dropped++;
748 	return NETDEV_TX_OK;
749 }
750 
751 static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
752 				struct net_device *dev)
753 {
754 	struct ip_tunnel *tunnel = netdev_priv(dev);
755 
756 	if (tunnel->collect_md) {
757 		gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
758 		return NETDEV_TX_OK;
759 	}
760 
761 	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
762 		goto free_skb;
763 
764 	if (skb_cow_head(skb, dev->needed_headroom))
765 		goto free_skb;
766 
767 	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
768 	return NETDEV_TX_OK;
769 
770 free_skb:
771 	kfree_skb(skb);
772 	dev->stats.tx_dropped++;
773 	return NETDEV_TX_OK;
774 }
775 
776 static int ipgre_tunnel_ioctl(struct net_device *dev,
777 			      struct ifreq *ifr, int cmd)
778 {
779 	int err;
780 	struct ip_tunnel_parm p;
781 
782 	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
783 		return -EFAULT;
784 	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
785 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
786 		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
787 		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
788 			return -EINVAL;
789 	}
790 	p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
791 	p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
792 
793 	err = ip_tunnel_ioctl(dev, &p, cmd);
794 	if (err)
795 		return err;
796 
797 	p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
798 	p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);
799 
800 	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
801 		return -EFAULT;
802 	return 0;
803 }
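
/* Editor's note: this is the legacy ioctl configuration path. A
 * hypothetical minimal userspace caller (roughly what the classic
 * "ip tunnel add" command does) might look like:
 *
 *	struct ip_tunnel_parm p = { .iph = { .version = 4, .ihl = 5,
 *					     .protocol = IPPROTO_GRE } };
 *	struct ifreq ifr;
 *
 *	strcpy(p.name, "gre1");
 *	strcpy(ifr.ifr_name, "gre0");
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	ioctl(fd, SIOCADDTUNNEL, &ifr);
 *
 * New code should prefer the rtnl_link_ops ("ip link add ... type gre")
 * interface registered further down.
 */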
804 
805 /* Nice toy. Unfortunately, useless in real life :-)
806    It allows constructing a virtual multiprotocol broadcast "LAN"
807    over the Internet, provided multicast routing is tuned.
808 
810    I have no idea whether this bicycle was invented before me,
811    so I had to set ARPHRD_IPGRE to a random value.
812    I have the impression that Cisco could make something similar,
813    but this feature is apparently missing in IOS<=11.2(8).
814 
815    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
816    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
817 
818    ping -t 255 224.66.66.66
819 
820    If nobody answers, mbone does not work.
821 
822    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
823    ip addr add 10.66.66.<somewhat>/24 dev Universe
824    ifconfig Universe up
825    ifconfig Universe add fe80::<Your_real_addr>/10
826    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
827    ftp 10.66.66.66
828    ...
829    ftp fec0:6666:6666::193.233.7.65
830    ...
831  */
832 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
833 			unsigned short type,
834 			const void *daddr, const void *saddr, unsigned int len)
835 {
836 	struct ip_tunnel *t = netdev_priv(dev);
837 	struct iphdr *iph;
838 	struct gre_base_hdr *greh;
839 
840 	iph = skb_push(skb, t->hlen + sizeof(*iph));
841 	greh = (struct gre_base_hdr *)(iph+1);
842 	greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
843 	greh->protocol = htons(type);
844 
845 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
846 
847 	/* Set the source hardware address. */
848 	if (saddr)
849 		memcpy(&iph->saddr, saddr, 4);
850 	if (daddr)
851 		memcpy(&iph->daddr, daddr, 4);
852 	if (iph->daddr)
853 		return t->hlen + sizeof(*iph);
854 
855 	return -(t->hlen + sizeof(*iph));
856 }
857 
858 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
859 {
860 	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
861 	memcpy(haddr, &iph->saddr, 4);
862 	return 4;
863 }
864 
865 static const struct header_ops ipgre_header_ops = {
866 	.create	= ipgre_header,
867 	.parse	= ipgre_header_parse,
868 };
869 
870 #ifdef CONFIG_NET_IPGRE_BROADCAST
871 static int ipgre_open(struct net_device *dev)
872 {
873 	struct ip_tunnel *t = netdev_priv(dev);
874 
875 	if (ipv4_is_multicast(t->parms.iph.daddr)) {
876 		struct flowi4 fl4;
877 		struct rtable *rt;
878 
879 		rt = ip_route_output_gre(t->net, &fl4,
880 					 t->parms.iph.daddr,
881 					 t->parms.iph.saddr,
882 					 t->parms.o_key,
883 					 RT_TOS(t->parms.iph.tos),
884 					 t->parms.link);
885 		if (IS_ERR(rt))
886 			return -EADDRNOTAVAIL;
887 		dev = rt->dst.dev;
888 		ip_rt_put(rt);
889 		if (!__in_dev_get_rtnl(dev))
890 			return -EADDRNOTAVAIL;
891 		t->mlink = dev->ifindex;
892 		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
893 	}
894 	return 0;
895 }
896 
897 static int ipgre_close(struct net_device *dev)
898 {
899 	struct ip_tunnel *t = netdev_priv(dev);
900 
901 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
902 		struct in_device *in_dev;
903 		in_dev = inetdev_by_index(t->net, t->mlink);
904 		if (in_dev)
905 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
906 	}
907 	return 0;
908 }
909 #endif
910 
911 static const struct net_device_ops ipgre_netdev_ops = {
912 	.ndo_init		= ipgre_tunnel_init,
913 	.ndo_uninit		= ip_tunnel_uninit,
914 #ifdef CONFIG_NET_IPGRE_BROADCAST
915 	.ndo_open		= ipgre_open,
916 	.ndo_stop		= ipgre_close,
917 #endif
918 	.ndo_start_xmit		= ipgre_xmit,
919 	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
920 	.ndo_change_mtu		= ip_tunnel_change_mtu,
921 	.ndo_get_stats64	= ip_tunnel_get_stats64,
922 	.ndo_get_iflink		= ip_tunnel_get_iflink,
923 };
924 
925 #define GRE_FEATURES (NETIF_F_SG |		\
926 		      NETIF_F_FRAGLIST |	\
927 		      NETIF_F_HIGHDMA |		\
928 		      NETIF_F_HW_CSUM)
929 
930 static void ipgre_tunnel_setup(struct net_device *dev)
931 {
932 	dev->netdev_ops		= &ipgre_netdev_ops;
933 	dev->type		= ARPHRD_IPGRE;
934 	ip_tunnel_setup(dev, ipgre_net_id);
935 }
936 
937 static void __gre_tunnel_init(struct net_device *dev)
938 {
939 	struct ip_tunnel *tunnel;
940 	int t_hlen;
941 
942 	tunnel = netdev_priv(dev);
943 	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
944 	tunnel->parms.iph.protocol = IPPROTO_GRE;
945 
946 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
947 
948 	t_hlen = tunnel->hlen + sizeof(struct iphdr);
949 
950 	dev->needed_headroom	= LL_MAX_HEADER + t_hlen + 4;
951 	dev->mtu		= ETH_DATA_LEN - t_hlen - 4;
952 
953 	dev->features		|= GRE_FEATURES;
954 	dev->hw_features	|= GRE_FEATURES;
955 
956 	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
957 		/* TCP offload with GRE SEQ is not supported, nor
958 		 * can we support 2 levels of outer headers requiring
959 		 * an update.
960 		 */
961 		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
962 		    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
963 			dev->features    |= NETIF_F_GSO_SOFTWARE;
964 			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
965 		}
966 
967 		/* We can use lockless transmit, unless we generate
968 		 * output sequences.
969 		 */
970 		dev->features |= NETIF_F_LLTX;
971 	}
972 }
973 
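
/* Editor's note, a worked example of the header arithmetic above: for a
 * plain GRE tunnel with no key/csum/seq, gre_calc_hlen() yields 4 bytes,
 * so t_hlen = 4 + 0 + 20 = 24 and dev->mtu = 1500 - 24 - 4 = 1472. Each of
 * TUNNEL_CSUM, TUNNEL_KEY and TUNNEL_SEQ adds another 4 bytes, so a keyed
 * tunnel ends up with an mtu of 1468.
 */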
974 static int ipgre_tunnel_init(struct net_device *dev)
975 {
976 	struct ip_tunnel *tunnel = netdev_priv(dev);
977 	struct iphdr *iph = &tunnel->parms.iph;
978 
979 	__gre_tunnel_init(dev);
980 
981 	memcpy(dev->dev_addr, &iph->saddr, 4);
982 	memcpy(dev->broadcast, &iph->daddr, 4);
983 
984 	dev->flags		= IFF_NOARP;
985 	netif_keep_dst(dev);
986 	dev->addr_len		= 4;
987 
988 	if (iph->daddr && !tunnel->collect_md) {
989 #ifdef CONFIG_NET_IPGRE_BROADCAST
990 		if (ipv4_is_multicast(iph->daddr)) {
991 			if (!iph->saddr)
992 				return -EINVAL;
993 			dev->flags = IFF_BROADCAST;
994 			dev->header_ops = &ipgre_header_ops;
995 		}
996 #endif
997 	} else if (!tunnel->collect_md) {
998 		dev->header_ops = &ipgre_header_ops;
999 	}
1000 
1001 	return ip_tunnel_init(dev);
1002 }
1003 
1004 static const struct gre_protocol ipgre_protocol = {
1005 	.handler     = gre_rcv,
1006 	.err_handler = gre_err,
1007 };
1008 
1009 static int __net_init ipgre_init_net(struct net *net)
1010 {
1011 	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
1012 }
1013 
1014 static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
1015 {
1016 	ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
1017 }
1018 
1019 static struct pernet_operations ipgre_net_ops = {
1020 	.init = ipgre_init_net,
1021 	.exit_batch = ipgre_exit_batch_net,
1022 	.id   = &ipgre_net_id,
1023 	.size = sizeof(struct ip_tunnel_net),
1024 };
1025 
1026 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
1027 				 struct netlink_ext_ack *extack)
1028 {
1029 	__be16 flags;
1030 
1031 	if (!data)
1032 		return 0;
1033 
1034 	flags = 0;
1035 	if (data[IFLA_GRE_IFLAGS])
1036 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1037 	if (data[IFLA_GRE_OFLAGS])
1038 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1039 	if (flags & (GRE_VERSION|GRE_ROUTING))
1040 		return -EINVAL;
1041 
1042 	if (data[IFLA_GRE_COLLECT_METADATA] &&
1043 	    data[IFLA_GRE_ENCAP_TYPE] &&
1044 	    nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
1045 		return -EINVAL;
1046 
1047 	return 0;
1048 }
1049 
1050 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
1051 			      struct netlink_ext_ack *extack)
1052 {
1053 	__be32 daddr;
1054 
1055 	if (tb[IFLA_ADDRESS]) {
1056 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1057 			return -EINVAL;
1058 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1059 			return -EADDRNOTAVAIL;
1060 	}
1061 
1062 	if (!data)
1063 		goto out;
1064 
1065 	if (data[IFLA_GRE_REMOTE]) {
1066 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1067 		if (!daddr)
1068 			return -EINVAL;
1069 	}
1070 
1071 out:
1072 	return ipgre_tunnel_validate(tb, data, extack);
1073 }
1074 
1075 static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
1076 			   struct netlink_ext_ack *extack)
1077 {
1078 	__be16 flags = 0;
1079 	int ret;
1080 
1081 	if (!data)
1082 		return 0;
1083 
1084 	ret = ipgre_tap_validate(tb, data, extack);
1085 	if (ret)
1086 		return ret;
1087 
1088 	/* ERSPAN should have only the GRE sequence and key flags */
1089 	if (data[IFLA_GRE_OFLAGS])
1090 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1091 	if (data[IFLA_GRE_IFLAGS])
1092 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1093 	if (!data[IFLA_GRE_COLLECT_METADATA] &&
1094 	    flags != (GRE_SEQ | GRE_KEY))
1095 		return -EINVAL;
1096 
1097 	/* The ERSPAN session ID is only 10 bits wide. Since we reuse
1098 	 * the 32-bit key field as the ID, check its range.
1099 	 */
1100 	if (data[IFLA_GRE_IKEY] &&
1101 	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
1102 		return -EINVAL;
1103 
1104 	if (data[IFLA_GRE_OKEY] &&
1105 	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
1106 		return -EINVAL;
1107 
1108 	return 0;
1109 }
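
/* Editor's note: with a 10-bit session ID, ID_MASK covers 0..1023, so for
 * example "key 1023" passes the checks above while "key 1024" is rejected.
 */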
1110 
1111 static int ipgre_netlink_parms(struct net_device *dev,
1112 				struct nlattr *data[],
1113 				struct nlattr *tb[],
1114 				struct ip_tunnel_parm *parms,
1115 				__u32 *fwmark)
1116 {
1117 	struct ip_tunnel *t = netdev_priv(dev);
1118 
1119 	memset(parms, 0, sizeof(*parms));
1120 
1121 	parms->iph.protocol = IPPROTO_GRE;
1122 
1123 	if (!data)
1124 		return 0;
1125 
1126 	if (data[IFLA_GRE_LINK])
1127 		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1128 
1129 	if (data[IFLA_GRE_IFLAGS])
1130 		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
1131 
1132 	if (data[IFLA_GRE_OFLAGS])
1133 		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
1134 
1135 	if (data[IFLA_GRE_IKEY])
1136 		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1137 
1138 	if (data[IFLA_GRE_OKEY])
1139 		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1140 
1141 	if (data[IFLA_GRE_LOCAL])
1142 		parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
1143 
1144 	if (data[IFLA_GRE_REMOTE])
1145 		parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
1146 
1147 	if (data[IFLA_GRE_TTL])
1148 		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1149 
1150 	if (data[IFLA_GRE_TOS])
1151 		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1152 
1153 	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
1154 		if (t->ignore_df)
1155 			return -EINVAL;
1156 		parms->iph.frag_off = htons(IP_DF);
1157 	}
1158 
1159 	if (data[IFLA_GRE_COLLECT_METADATA]) {
1160 		t->collect_md = true;
1161 		if (dev->type == ARPHRD_IPGRE)
1162 			dev->type = ARPHRD_NONE;
1163 	}
1164 
1165 	if (data[IFLA_GRE_IGNORE_DF]) {
1166 		if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
1167 		  && (parms->iph.frag_off & htons(IP_DF)))
1168 			return -EINVAL;
1169 		t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
1170 	}
1171 
1172 	if (data[IFLA_GRE_FWMARK])
1173 		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
1174 
1175 	if (data[IFLA_GRE_ERSPAN_INDEX]) {
1176 		t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
1177 
1178 		if (t->index & ~INDEX_MASK)
1179 			return -EINVAL;
1180 	}
1181 
1182 	return 0;
1183 }
1184 
1185 /* This function returns true when ENCAP attributes are present in the nl msg */
1186 static bool ipgre_netlink_encap_parms(struct nlattr *data[],
1187 				      struct ip_tunnel_encap *ipencap)
1188 {
1189 	bool ret = false;
1190 
1191 	memset(ipencap, 0, sizeof(*ipencap));
1192 
1193 	if (!data)
1194 		return ret;
1195 
1196 	if (data[IFLA_GRE_ENCAP_TYPE]) {
1197 		ret = true;
1198 		ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
1199 	}
1200 
1201 	if (data[IFLA_GRE_ENCAP_FLAGS]) {
1202 		ret = true;
1203 		ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
1204 	}
1205 
1206 	if (data[IFLA_GRE_ENCAP_SPORT]) {
1207 		ret = true;
1208 		ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
1209 	}
1210 
1211 	if (data[IFLA_GRE_ENCAP_DPORT]) {
1212 		ret = true;
1213 		ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
1214 	}
1215 
1216 	return ret;
1217 }
1218 
1219 static int gre_tap_init(struct net_device *dev)
1220 {
1221 	__gre_tunnel_init(dev);
1222 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1223 	netif_keep_dst(dev);
1224 
1225 	return ip_tunnel_init(dev);
1226 }
1227 
1228 static const struct net_device_ops gre_tap_netdev_ops = {
1229 	.ndo_init		= gre_tap_init,
1230 	.ndo_uninit		= ip_tunnel_uninit,
1231 	.ndo_start_xmit		= gre_tap_xmit,
1232 	.ndo_set_mac_address	= eth_mac_addr,
1233 	.ndo_validate_addr	= eth_validate_addr,
1234 	.ndo_change_mtu		= ip_tunnel_change_mtu,
1235 	.ndo_get_stats64	= ip_tunnel_get_stats64,
1236 	.ndo_get_iflink		= ip_tunnel_get_iflink,
1237 	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
1238 };
1239 
1240 static int erspan_tunnel_init(struct net_device *dev)
1241 {
1242 	struct ip_tunnel *tunnel = netdev_priv(dev);
1243 	int t_hlen;
1244 
1245 	tunnel->tun_hlen = 8;
1246 	tunnel->parms.iph.protocol = IPPROTO_GRE;
1247 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
1248 		       sizeof(struct erspanhdr);
1249 	t_hlen = tunnel->hlen + sizeof(struct iphdr);
1250 
1251 	dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
1252 	dev->mtu = ETH_DATA_LEN - t_hlen - 4;
1253 	dev->features		|= GRE_FEATURES;
1254 	dev->hw_features	|= GRE_FEATURES;
1255 	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE;
1256 	netif_keep_dst(dev);
1257 
1258 	return ip_tunnel_init(dev);
1259 }
1260 
1261 static const struct net_device_ops erspan_netdev_ops = {
1262 	.ndo_init		= erspan_tunnel_init,
1263 	.ndo_uninit		= ip_tunnel_uninit,
1264 	.ndo_start_xmit		= erspan_xmit,
1265 	.ndo_set_mac_address	= eth_mac_addr,
1266 	.ndo_validate_addr	= eth_validate_addr,
1267 	.ndo_change_mtu		= ip_tunnel_change_mtu,
1268 	.ndo_get_stats64	= ip_tunnel_get_stats64,
1269 	.ndo_get_iflink		= ip_tunnel_get_iflink,
1270 	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
1271 };
1272 
1273 static void ipgre_tap_setup(struct net_device *dev)
1274 {
1275 	ether_setup(dev);
1276 	dev->netdev_ops	= &gre_tap_netdev_ops;
1277 	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1278 	dev->priv_flags	|= IFF_LIVE_ADDR_CHANGE;
1279 	ip_tunnel_setup(dev, gre_tap_net_id);
1280 }
1281 
1282 static int ipgre_newlink(struct net *src_net, struct net_device *dev,
1283 			 struct nlattr *tb[], struct nlattr *data[],
1284 			 struct netlink_ext_ack *extack)
1285 {
1286 	struct ip_tunnel_parm p;
1287 	struct ip_tunnel_encap ipencap;
1288 	__u32 fwmark = 0;
1289 	int err;
1290 
1291 	if (ipgre_netlink_encap_parms(data, &ipencap)) {
1292 		struct ip_tunnel *t = netdev_priv(dev);
1293 		err = ip_tunnel_encap_setup(t, &ipencap);
1294 
1295 		if (err < 0)
1296 			return err;
1297 	}
1298 
1299 	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1300 	if (err < 0)
1301 		return err;
1302 	return ip_tunnel_newlink(dev, tb, &p, fwmark);
1303 }
1304 
1305 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1306 			    struct nlattr *data[],
1307 			    struct netlink_ext_ack *extack)
1308 {
1309 	struct ip_tunnel *t = netdev_priv(dev);
1310 	struct ip_tunnel_parm p;
1311 	struct ip_tunnel_encap ipencap;
1312 	__u32 fwmark = t->fwmark;
1313 	int err;
1314 
1315 	if (ipgre_netlink_encap_parms(data, &ipencap)) {
1316 		err = ip_tunnel_encap_setup(t, &ipencap);
1317 
1318 		if (err < 0)
1319 			return err;
1320 	}
1321 
1322 	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1323 	if (err < 0)
1324 		return err;
1325 	return ip_tunnel_changelink(dev, tb, &p, fwmark);
1326 }
1327 
1328 static size_t ipgre_get_size(const struct net_device *dev)
1329 {
1330 	return
1331 		/* IFLA_GRE_LINK */
1332 		nla_total_size(4) +
1333 		/* IFLA_GRE_IFLAGS */
1334 		nla_total_size(2) +
1335 		/* IFLA_GRE_OFLAGS */
1336 		nla_total_size(2) +
1337 		/* IFLA_GRE_IKEY */
1338 		nla_total_size(4) +
1339 		/* IFLA_GRE_OKEY */
1340 		nla_total_size(4) +
1341 		/* IFLA_GRE_LOCAL */
1342 		nla_total_size(4) +
1343 		/* IFLA_GRE_REMOTE */
1344 		nla_total_size(4) +
1345 		/* IFLA_GRE_TTL */
1346 		nla_total_size(1) +
1347 		/* IFLA_GRE_TOS */
1348 		nla_total_size(1) +
1349 		/* IFLA_GRE_PMTUDISC */
1350 		nla_total_size(1) +
1351 		/* IFLA_GRE_ENCAP_TYPE */
1352 		nla_total_size(2) +
1353 		/* IFLA_GRE_ENCAP_FLAGS */
1354 		nla_total_size(2) +
1355 		/* IFLA_GRE_ENCAP_SPORT */
1356 		nla_total_size(2) +
1357 		/* IFLA_GRE_ENCAP_DPORT */
1358 		nla_total_size(2) +
1359 		/* IFLA_GRE_COLLECT_METADATA */
1360 		nla_total_size(0) +
1361 		/* IFLA_GRE_IGNORE_DF */
1362 		nla_total_size(1) +
1363 		/* IFLA_GRE_FWMARK */
1364 		nla_total_size(4) +
1365 		/* IFLA_GRE_ERSPAN_INDEX */
1366 		nla_total_size(4) +
1367 		0;
1368 }
1369 
1370 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1371 {
1372 	struct ip_tunnel *t = netdev_priv(dev);
1373 	struct ip_tunnel_parm *p = &t->parms;
1374 
1375 	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1376 	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
1377 			 gre_tnl_flags_to_gre_flags(p->i_flags)) ||
1378 	    nla_put_be16(skb, IFLA_GRE_OFLAGS,
1379 			 gre_tnl_flags_to_gre_flags(p->o_flags)) ||
1380 	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1381 	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1382 	    nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1383 	    nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1384 	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1385 	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1386 	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1387 		       !!(p->iph.frag_off & htons(IP_DF))) ||
1388 	    nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
1389 		goto nla_put_failure;
1390 
1391 	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
1392 			t->encap.type) ||
1393 	    nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
1394 			 t->encap.sport) ||
1395 	    nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
1396 			 t->encap.dport) ||
1397 	    nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
1398 			t->encap.flags))
1399 		goto nla_put_failure;
1400 
1401 	if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
1402 		goto nla_put_failure;
1403 
1404 	if (t->collect_md) {
1405 		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
1406 			goto nla_put_failure;
1407 	}
1408 
1409 	if (t->index)
1410 		if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
1411 			goto nla_put_failure;
1412 
1413 	return 0;
1414 
1415 nla_put_failure:
1416 	return -EMSGSIZE;
1417 }
1418 
1419 static void erspan_setup(struct net_device *dev)
1420 {
1421 	ether_setup(dev);
1422 	dev->netdev_ops = &erspan_netdev_ops;
1423 	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1424 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1425 	ip_tunnel_setup(dev, erspan_net_id);
1426 }
1427 
1428 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1429 	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
1430 	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
1431 	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
1432 	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
1433 	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
1434 	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1435 	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1436 	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
1437 	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
1438 	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
1439 	[IFLA_GRE_ENCAP_TYPE]	= { .type = NLA_U16 },
1440 	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
1441 	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
1442 	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
1443 	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
1444 	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
1445 	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
1446 	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
1447 };
1448 
1449 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1450 	.kind		= "gre",
1451 	.maxtype	= IFLA_GRE_MAX,
1452 	.policy		= ipgre_policy,
1453 	.priv_size	= sizeof(struct ip_tunnel),
1454 	.setup		= ipgre_tunnel_setup,
1455 	.validate	= ipgre_tunnel_validate,
1456 	.newlink	= ipgre_newlink,
1457 	.changelink	= ipgre_changelink,
1458 	.dellink	= ip_tunnel_dellink,
1459 	.get_size	= ipgre_get_size,
1460 	.fill_info	= ipgre_fill_info,
1461 	.get_link_net	= ip_tunnel_get_link_net,
1462 };
1463 
1464 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1465 	.kind		= "gretap",
1466 	.maxtype	= IFLA_GRE_MAX,
1467 	.policy		= ipgre_policy,
1468 	.priv_size	= sizeof(struct ip_tunnel),
1469 	.setup		= ipgre_tap_setup,
1470 	.validate	= ipgre_tap_validate,
1471 	.newlink	= ipgre_newlink,
1472 	.changelink	= ipgre_changelink,
1473 	.dellink	= ip_tunnel_dellink,
1474 	.get_size	= ipgre_get_size,
1475 	.fill_info	= ipgre_fill_info,
1476 	.get_link_net	= ip_tunnel_get_link_net,
1477 };
1478 
1479 static struct rtnl_link_ops erspan_link_ops __read_mostly = {
1480 	.kind		= "erspan",
1481 	.maxtype	= IFLA_GRE_MAX,
1482 	.policy		= ipgre_policy,
1483 	.priv_size	= sizeof(struct ip_tunnel),
1484 	.setup		= erspan_setup,
1485 	.validate	= erspan_validate,
1486 	.newlink	= ipgre_newlink,
1487 	.changelink	= ipgre_changelink,
1488 	.dellink	= ip_tunnel_dellink,
1489 	.get_size	= ipgre_get_size,
1490 	.fill_info	= ipgre_fill_info,
1491 	.get_link_net	= ip_tunnel_get_link_net,
1492 };
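
/* Editor's note, illustrative iproute2 usage of the three link types
 * registered above (names and addresses are made up):
 *
 *	ip link add gre1 type gre local 10.0.0.1 remote 10.0.0.2 ttl 64
 *	ip link add gretap1 type gretap local 10.0.0.1 remote 10.0.0.2
 *	ip link add erspan1 type erspan local 10.0.0.1 remote 10.0.0.2 \
 *		seq key 100 erspan 123
 */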
1493 
1494 struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
1495 					u8 name_assign_type)
1496 {
1497 	struct nlattr *tb[IFLA_MAX + 1];
1498 	struct net_device *dev;
1499 	LIST_HEAD(list_kill);
1500 	struct ip_tunnel *t;
1501 	int err;
1502 
1503 	memset(&tb, 0, sizeof(tb));
1504 
1505 	dev = rtnl_create_link(net, name, name_assign_type,
1506 			       &ipgre_tap_ops, tb);
1507 	if (IS_ERR(dev))
1508 		return dev;
1509 
1510 	/* Configure flow based GRE device. */
1511 	t = netdev_priv(dev);
1512 	t->collect_md = true;
1513 
1514 	err = ipgre_newlink(net, dev, tb, NULL, NULL);
1515 	if (err < 0) {
1516 		free_netdev(dev);
1517 		return ERR_PTR(err);
1518 	}
1519 
1520 	/* openvswitch users expect packet sizes to be unrestricted,
1521 	 * so set the largest MTU we can.
1522 	 */
1523 	err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
1524 	if (err)
1525 		goto out;
1526 
1527 	err = rtnl_configure_link(dev, NULL);
1528 	if (err < 0)
1529 		goto out;
1530 
1531 	return dev;
1532 out:
1533 	ip_tunnel_dellink(dev, &list_kill);
1534 	unregister_netdevice_many(&list_kill);
1535 	return ERR_PTR(err);
1536 }
1537 EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
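
/* Editor's note: gretap_fb_dev_create() is exported for callers such as
 * openvswitch that need a flow-based (collect_md) gretap device, e.g.
 * (device name purely illustrative):
 *
 *	dev = gretap_fb_dev_create(net, "gretap_demo", NET_NAME_USER);
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);
 */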
1538 
1539 static int __net_init ipgre_tap_init_net(struct net *net)
1540 {
1541 	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
1542 }
1543 
1544 static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
1545 {
1546 	ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
1547 }
1548 
1549 static struct pernet_operations ipgre_tap_net_ops = {
1550 	.init = ipgre_tap_init_net,
1551 	.exit_batch = ipgre_tap_exit_batch_net,
1552 	.id   = &gre_tap_net_id,
1553 	.size = sizeof(struct ip_tunnel_net),
1554 };
1555 
1556 static int __net_init erspan_init_net(struct net *net)
1557 {
1558 	return ip_tunnel_init_net(net, erspan_net_id,
1559 				  &erspan_link_ops, "erspan0");
1560 }
1561 
1562 static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
1563 {
1564 	ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
1565 }
1566 
1567 static struct pernet_operations erspan_net_ops = {
1568 	.init = erspan_init_net,
1569 	.exit_batch = erspan_exit_batch_net,
1570 	.id   = &erspan_net_id,
1571 	.size = sizeof(struct ip_tunnel_net),
1572 };
1573 
1574 static int __init ipgre_init(void)
1575 {
1576 	int err;
1577 
1578 	pr_info("GRE over IPv4 tunneling driver\n");
1579 
1580 	err = register_pernet_device(&ipgre_net_ops);
1581 	if (err < 0)
1582 		return err;
1583 
1584 	err = register_pernet_device(&ipgre_tap_net_ops);
1585 	if (err < 0)
1586 		goto pnet_tap_failed;
1587 
1588 	err = register_pernet_device(&erspan_net_ops);
1589 	if (err < 0)
1590 		goto pnet_erspan_failed;
1591 
1592 	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1593 	if (err < 0) {
1594 		pr_info("%s: can't add protocol\n", __func__);
1595 		goto add_proto_failed;
1596 	}
1597 
1598 	err = rtnl_link_register(&ipgre_link_ops);
1599 	if (err < 0)
1600 		goto rtnl_link_failed;
1601 
1602 	err = rtnl_link_register(&ipgre_tap_ops);
1603 	if (err < 0)
1604 		goto tap_ops_failed;
1605 
1606 	err = rtnl_link_register(&erspan_link_ops);
1607 	if (err < 0)
1608 		goto erspan_link_failed;
1609 
1610 	return 0;
1611 
1612 erspan_link_failed:
1613 	rtnl_link_unregister(&ipgre_tap_ops);
1614 tap_ops_failed:
1615 	rtnl_link_unregister(&ipgre_link_ops);
1616 rtnl_link_failed:
1617 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1618 add_proto_failed:
1619 	unregister_pernet_device(&erspan_net_ops);
1620 pnet_erspan_failed:
1621 	unregister_pernet_device(&ipgre_tap_net_ops);
1622 pnet_tap_failed:
1623 	unregister_pernet_device(&ipgre_net_ops);
1624 	return err;
1625 }
1626 
1627 static void __exit ipgre_fini(void)
1628 {
1629 	rtnl_link_unregister(&ipgre_tap_ops);
1630 	rtnl_link_unregister(&ipgre_link_ops);
1631 	rtnl_link_unregister(&erspan_link_ops);
1632 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1633 	unregister_pernet_device(&ipgre_tap_net_ops);
1634 	unregister_pernet_device(&ipgre_net_ops);
1635 	unregister_pernet_device(&erspan_net_ops);
1636 }
1637 
1638 module_init(ipgre_init);
1639 module_exit(ipgre_fini);
1640 MODULE_LICENSE("GPL");
1641 MODULE_ALIAS_RTNL_LINK("gre");
1642 MODULE_ALIAS_RTNL_LINK("gretap");
1643 MODULE_ALIAS_RTNL_LINK("erspan");
1644 MODULE_ALIAS_NETDEV("gre0");
1645 MODULE_ALIAS_NETDEV("gretap0");
1646 MODULE_ALIAS_NETDEV("erspan0");
1647