/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

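/* Per-cpu cached route handling: each tunnel keeps one cached dst_entry
 * per CPU. __tunnel_dst_set() publishes a new entry with xchg() and
 * releases the old one, refusing to cache DST_NOCACHE routes.
 */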
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	if (dst) {
		if (dst->flags & DST_NOCACHE)
			dst = NULL;
		else
			dst_clone(dst);
	}
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
}

static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
{
	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL);
}

void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);

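/* Fetch this CPU's cached route, validating a possibly obsolete entry
 * via dst->ops->check(). A stale entry resets the cache and returns
 * NULL so the caller falls back to a fresh route lookup.
 */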
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
{
	struct dst_entry *dst;

	rcu_read_lock();
	dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
	if (dst) {
		if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
			rcu_read_unlock();
			tunnel_dst_reset(t);
			return NULL;
		}
		dst_hold(dst);
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
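
/* Illustrative sketch only: a protocol receive handler built on
 * ip_tunnel_lookup() and ip_tunnel_rcv() (defined below) would look
 * roughly like this. "itn", "tpi" and "log_ecn_error" come from the
 * caller (e.g. a GRE demux) and are assumptions here, not definitions
 * from this file:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	struct ip_tunnel *t;
 *
 *	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 *			     iph->saddr, iph->daddr, tpi->key);
 *	if (t)
 *		return ip_tunnel_rcv(t, skb, tpi, log_ecn_error);
 */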

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	h = ip_tunnel_hash(parms->i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;
	}
	return t;
}

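/* Allocate and register a tunnel netdevice for the given parameters.
 * Must run under RTNL; an empty parms->name falls back to "<kind>%d"
 * so the core picks the next free index.
 */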
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

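/* Guess the underlay device and derive the tunnel MTU from it:
 * mtu = underlay MTU - dev->hard_header_len - t_hlen, where t_hlen is
 * the tunnel header plus the outer IPv4 header. Worked example with
 * illustrative numbers: plain GRE (tunnel->hlen == 4, hard_header_len
 * == 0) over 1500-byte Ethernet gives t_hlen = 4 + 20 = 24 and an MTU
 * of 1500 - 24 = 1476.
 */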
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt, *fbt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	fbt = netdev_priv(itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return NULL;

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

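/* Common receive path for IP tunnels. The caller has already matched
 * the packet to "tunnel"; this checks the csum flag and sequence number
 * against the tunnel's expectations, decapsulates ECN, updates the
 * per-cpu stats and hands the skb to the GRO cell.
 */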
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags & TUNNEL_CSUM) && (tunnel->parms.i_flags & TUNNEL_CSUM)) ||
	    ((tpi->flags & TUNNEL_CSUM) && !(tunnel->parms.i_flags & TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags & TUNNEL_SEQ) {
		if (!(tpi->flags & TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

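/* Path-MTU handling for the inner packet: propagate the tunnel MTU to
 * the inner dst and, when a too-big packet may not be fragmented,
 * answer with ICMP_FRAG_NEEDED (IPv4 with DF set) or ICMPV6_PKT_TOOBIG
 * and return -E2BIG.
 */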
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

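/* Common transmit path: resolve the outer destination (including the
 * NBMA case, where it is derived from the inner headers), pick tos/ttl/df
 * from tnl_params or the inner packet, route via the per-cpu dst cache
 * when the tunnel is "connected", and hand off to iptunnel_xmit().
 */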
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, const u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected = true;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off & htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len;
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}

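/* Backend for the legacy SIOC{GET,ADD,CHG,DEL}TUNNEL ioctls shared by
 * tunnel drivers. Illustrative only: userspace typically reaches this
 * via ioctl(fd, SIOCADDTUNNEL, &ifr) with ifr.ifr_ifru.ifru_data
 * pointing at a struct ip_tunnel_parm; the copy to and from userspace
 * is done by the driver wrapper, not here.
 */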
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == itn->fb_tunnel_dev)
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & TUNNEL_KEY))
			p->i_key = 0;
		if (!(p->o_flags & TUNNEL_KEY))
			p->o_key = 0;

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (!t && (cmd == SIOCADDTUNNEL))
			t = ip_tunnel_create(net, itn, p);

		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags ^ nflags) &
				    (IFF_POINTOPOINT | IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

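/* MTU bounds: 68 is the minimum IPv4 MTU (RFC 791), and 0xFFF8 is the
 * largest 16-bit total length that is still a multiple of 8; the outer
 * headers (dev->hard_header_len + t_hlen) must still fit on top.
 */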
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

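/* Per-netns setup shared by tunnel modules: initialise the hash table
 * and, when "ops" is given, create the netns-local fallback device.
 * Illustrative sketch only (names are assumptions, not from this file):
 *
 *	static int __net_init ipgre_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, gre_net_id,
 *					  &ipgre_link_ops, "gre0");
 *	}
 */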
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

885 
886 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
887 			      struct rtnl_link_ops *ops)
888 {
889 	struct net *net = dev_net(itn->fb_tunnel_dev);
890 	struct net_device *dev, *aux;
891 	int h;
892 
893 	for_each_netdev_safe(net, dev, aux)
894 		if (dev->rtnl_link_ops == ops)
895 			unregister_netdevice_queue(dev, head);
896 
897 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
898 		struct ip_tunnel *t;
899 		struct hlist_node *n;
900 		struct hlist_head *thead = &itn->tunnels[h];
901 
902 		hlist_for_each_entry_safe(t, n, thead, hash_node)
903 			/* If dev is in the same netns, it has already
904 			 * been added to the list by the previous loop.
905 			 */
906 			if (!net_eq(dev_net(t->dev), net))
907 				unregister_netdevice_queue(t->dev, head);
908 	}
909 }
910 
911 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
912 {
913 	LIST_HEAD(list);
914 
915 	rtnl_lock();
916 	ip_tunnel_destroy(itn, &list, ops);
917 	unregister_netdevice_many(&list);
918 	rtnl_unlock();
919 }
920 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
921 
922 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
923 		      struct ip_tunnel_parm *p)
924 {
925 	struct ip_tunnel *nt;
926 	struct net *net = dev_net(dev);
927 	struct ip_tunnel_net *itn;
928 	int mtu;
929 	int err;
930 
931 	nt = netdev_priv(dev);
932 	itn = net_generic(net, nt->ip_tnl_net_id);
933 
934 	if (ip_tunnel_find(itn, p, dev->type))
935 		return -EEXIST;
936 
937 	nt->net = net;
938 	nt->parms = *p;
939 	err = register_netdevice(dev);
940 	if (err)
941 		goto out;
942 
943 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
944 		eth_hw_addr_random(dev);
945 
946 	mtu = ip_tunnel_bind_dev(dev);
947 	if (!tb[IFLA_MTU])
948 		dev->mtu = mtu;
949 
950 	ip_tunnel_add(itn, nt);
951 
952 out:
953 	return err;
954 }
955 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
956 
957 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
958 			 struct ip_tunnel_parm *p)
959 {
960 	struct ip_tunnel *t;
961 	struct ip_tunnel *tunnel = netdev_priv(dev);
962 	struct net *net = tunnel->net;
963 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
964 
965 	if (dev == itn->fb_tunnel_dev)
966 		return -EINVAL;
967 
968 	t = ip_tunnel_find(itn, p, dev->type);
969 
970 	if (t) {
971 		if (t->dev != dev)
972 			return -EEXIST;
973 	} else {
974 		t = tunnel;
975 
976 		if (dev->type != ARPHRD_ETHER) {
977 			unsigned int nflags = 0;
978 
979 			if (ipv4_is_multicast(p->iph.daddr))
980 				nflags = IFF_BROADCAST;
981 			else if (p->iph.daddr)
982 				nflags = IFF_POINTOPOINT;
983 
984 			if ((dev->flags ^ nflags) &
985 			    (IFF_POINTOPOINT | IFF_BROADCAST))
986 				return -EINVAL;
987 		}
988 	}
989 
990 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
991 	return 0;
992 }
993 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
994 
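/* Device initialiser, typically wired up as the driver's ndo_init:
 * allocates the per-cpu stats and dst cache, sets up GRO cells and
 * pre-fills the outer IPv4 header template (version 4, ihl 5).
 */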
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int i, err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		struct pcpu_sw_netstats *ipt_stats;
		ipt_stats = per_cpu_ptr(dev->tstats, i);
		u64_stats_init(&ipt_stats->syncp);
	}

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(netdev_priv(dev));

	ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization here; the rest is done in the
 * ip_tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");