xref: /linux/net/ipv4/ip_tunnel.c (revision 988b0c541ed8b1c633c4d4df7169010635942e18)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44 
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58 
59 #if IS_ENABLED(CONFIG_IPV6)
60 #include <net/ipv6.h>
61 #include <net/ip6_fib.h>
62 #include <net/ip6_route.h>
63 #endif
64 
65 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
66 {
67 	return hash_32((__force u32)key ^ (__force u32)remote,
68 			 IP_TNL_HASH_BITS);
69 }
70 
71 static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
72 			     struct dst_entry *dst)
73 {
74 	struct dst_entry *old_dst;
75 
76 	dst_clone(dst);
77 	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
78 	dst_release(old_dst);
79 }
80 
81 static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
82 {
83 	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
84 }
85 
86 static void tunnel_dst_reset(struct ip_tunnel *t)
87 {
88 	tunnel_dst_set(t, NULL);
89 }
90 
91 void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
92 {
93 	int i;
94 
95 	for_each_possible_cpu(i)
96 		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
97 }
98 EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
99 
100 static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
101 {
102 	struct dst_entry *dst;
103 
104 	rcu_read_lock();
105 	dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
106 	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
107 		dst = NULL;
108 	if (dst) {
109 		if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
110 			tunnel_dst_reset(t);
111 			dst_release(dst);
112 			dst = NULL;
113 		}
114 	}
115 	rcu_read_unlock();
116 	return (struct rtable *)dst;
117 }
118 
119 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
120 				__be16 flags, __be32 key)
121 {
122 	if (p->i_flags & TUNNEL_KEY) {
123 		if (flags & TUNNEL_KEY)
124 			return key == p->i_key;
125 		else
126 			/* key expected, none present */
127 			return false;
128 	} else
129 		return !(flags & TUNNEL_KEY);
130 }
131 
132 /* Fallback tunnel: no source, no destination, no key, no options
133 
134    Tunnel hash table:
135    We require exact key match i.e. if a key is present in packet
136    it will match only tunnel with the same key; if it is not present,
137    it will match only keyless tunnel.
138 
139    All keysless packets, if not matched configured keyless tunnels
140    will match fallback tunnel.
141    Given src, dst and key, find appropriate for input tunnel.
142 */
143 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
144 				   int link, __be16 flags,
145 				   __be32 remote, __be32 local,
146 				   __be32 key)
147 {
148 	unsigned int hash;
149 	struct ip_tunnel *t, *cand = NULL;
150 	struct hlist_head *head;
151 
152 	hash = ip_tunnel_hash(key, remote);
153 	head = &itn->tunnels[hash];
154 
155 	hlist_for_each_entry_rcu(t, head, hash_node) {
156 		if (local != t->parms.iph.saddr ||
157 		    remote != t->parms.iph.daddr ||
158 		    !(t->dev->flags & IFF_UP))
159 			continue;
160 
161 		if (!ip_tunnel_key_match(&t->parms, flags, key))
162 			continue;
163 
164 		if (t->parms.link == link)
165 			return t;
166 		else
167 			cand = t;
168 	}
169 
170 	hlist_for_each_entry_rcu(t, head, hash_node) {
171 		if (remote != t->parms.iph.daddr ||
172 		    !(t->dev->flags & IFF_UP))
173 			continue;
174 
175 		if (!ip_tunnel_key_match(&t->parms, flags, key))
176 			continue;
177 
178 		if (t->parms.link == link)
179 			return t;
180 		else if (!cand)
181 			cand = t;
182 	}
183 
184 	hash = ip_tunnel_hash(key, 0);
185 	head = &itn->tunnels[hash];
186 
187 	hlist_for_each_entry_rcu(t, head, hash_node) {
188 		if ((local != t->parms.iph.saddr &&
189 		     (local != t->parms.iph.daddr ||
190 		      !ipv4_is_multicast(local))) ||
191 		    !(t->dev->flags & IFF_UP))
192 			continue;
193 
194 		if (!ip_tunnel_key_match(&t->parms, flags, key))
195 			continue;
196 
197 		if (t->parms.link == link)
198 			return t;
199 		else if (!cand)
200 			cand = t;
201 	}
202 
203 	if (flags & TUNNEL_NO_KEY)
204 		goto skip_key_lookup;
205 
206 	hlist_for_each_entry_rcu(t, head, hash_node) {
207 		if (t->parms.i_key != key ||
208 		    !(t->dev->flags & IFF_UP))
209 			continue;
210 
211 		if (t->parms.link == link)
212 			return t;
213 		else if (!cand)
214 			cand = t;
215 	}
216 
217 skip_key_lookup:
218 	if (cand)
219 		return cand;
220 
221 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
222 		return netdev_priv(itn->fb_tunnel_dev);
223 
224 
225 	return NULL;
226 }
227 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
228 
229 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
230 				    struct ip_tunnel_parm *parms)
231 {
232 	unsigned int h;
233 	__be32 remote;
234 	__be32 i_key = parms->i_key;
235 
236 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
237 		remote = parms->iph.daddr;
238 	else
239 		remote = 0;
240 
241 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
242 		i_key = 0;
243 
244 	h = ip_tunnel_hash(i_key, remote);
245 	return &itn->tunnels[h];
246 }
247 
248 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
249 {
250 	struct hlist_head *head = ip_bucket(itn, &t->parms);
251 
252 	hlist_add_head_rcu(&t->hash_node, head);
253 }
254 
255 static void ip_tunnel_del(struct ip_tunnel *t)
256 {
257 	hlist_del_init_rcu(&t->hash_node);
258 }
259 
260 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
261 					struct ip_tunnel_parm *parms,
262 					int type)
263 {
264 	__be32 remote = parms->iph.daddr;
265 	__be32 local = parms->iph.saddr;
266 	__be32 key = parms->i_key;
267 	__be16 flags = parms->i_flags;
268 	int link = parms->link;
269 	struct ip_tunnel *t = NULL;
270 	struct hlist_head *head = ip_bucket(itn, parms);
271 
272 	hlist_for_each_entry_rcu(t, head, hash_node) {
273 		if (local == t->parms.iph.saddr &&
274 		    remote == t->parms.iph.daddr &&
275 		    link == t->parms.link &&
276 		    type == t->dev->type &&
277 		    ip_tunnel_key_match(&t->parms, flags, key))
278 			break;
279 	}
280 	return t;
281 }
282 
283 static struct net_device *__ip_tunnel_create(struct net *net,
284 					     const struct rtnl_link_ops *ops,
285 					     struct ip_tunnel_parm *parms)
286 {
287 	int err;
288 	struct ip_tunnel *tunnel;
289 	struct net_device *dev;
290 	char name[IFNAMSIZ];
291 
292 	if (parms->name[0])
293 		strlcpy(name, parms->name, IFNAMSIZ);
294 	else {
295 		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
296 			err = -E2BIG;
297 			goto failed;
298 		}
299 		strlcpy(name, ops->kind, IFNAMSIZ);
300 		strncat(name, "%d", 2);
301 	}
302 
303 	ASSERT_RTNL();
304 	dev = alloc_netdev(ops->priv_size, name, ops->setup);
305 	if (!dev) {
306 		err = -ENOMEM;
307 		goto failed;
308 	}
309 	dev_net_set(dev, net);
310 
311 	dev->rtnl_link_ops = ops;
312 
313 	tunnel = netdev_priv(dev);
314 	tunnel->parms = *parms;
315 	tunnel->net = net;
316 
317 	err = register_netdevice(dev);
318 	if (err)
319 		goto failed_free;
320 
321 	return dev;
322 
323 failed_free:
324 	free_netdev(dev);
325 failed:
326 	return ERR_PTR(err);
327 }
328 
329 static inline void init_tunnel_flow(struct flowi4 *fl4,
330 				    int proto,
331 				    __be32 daddr, __be32 saddr,
332 				    __be32 key, __u8 tos, int oif)
333 {
334 	memset(fl4, 0, sizeof(*fl4));
335 	fl4->flowi4_oif = oif;
336 	fl4->daddr = daddr;
337 	fl4->saddr = saddr;
338 	fl4->flowi4_tos = tos;
339 	fl4->flowi4_proto = proto;
340 	fl4->fl4_gre_key = key;
341 }
342 
343 static int ip_tunnel_bind_dev(struct net_device *dev)
344 {
345 	struct net_device *tdev = NULL;
346 	struct ip_tunnel *tunnel = netdev_priv(dev);
347 	const struct iphdr *iph;
348 	int hlen = LL_MAX_HEADER;
349 	int mtu = ETH_DATA_LEN;
350 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
351 
352 	iph = &tunnel->parms.iph;
353 
354 	/* Guess output device to choose reasonable mtu and needed_headroom */
355 	if (iph->daddr) {
356 		struct flowi4 fl4;
357 		struct rtable *rt;
358 
359 		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
360 				 iph->saddr, tunnel->parms.o_key,
361 				 RT_TOS(iph->tos), tunnel->parms.link);
362 		rt = ip_route_output_key(tunnel->net, &fl4);
363 
364 		if (!IS_ERR(rt)) {
365 			tdev = rt->dst.dev;
366 			tunnel_dst_set(tunnel, &rt->dst);
367 			ip_rt_put(rt);
368 		}
369 		if (dev->type != ARPHRD_ETHER)
370 			dev->flags |= IFF_POINTOPOINT;
371 	}
372 
373 	if (!tdev && tunnel->parms.link)
374 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
375 
376 	if (tdev) {
377 		hlen = tdev->hard_header_len + tdev->needed_headroom;
378 		mtu = tdev->mtu;
379 	}
380 	dev->iflink = tunnel->parms.link;
381 
382 	dev->needed_headroom = t_hlen + hlen;
383 	mtu -= (dev->hard_header_len + t_hlen);
384 
385 	if (mtu < 68)
386 		mtu = 68;
387 
388 	return mtu;
389 }
390 
391 static struct ip_tunnel *ip_tunnel_create(struct net *net,
392 					  struct ip_tunnel_net *itn,
393 					  struct ip_tunnel_parm *parms)
394 {
395 	struct ip_tunnel *nt;
396 	struct net_device *dev;
397 
398 	BUG_ON(!itn->fb_tunnel_dev);
399 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
400 	if (IS_ERR(dev))
401 		return ERR_CAST(dev);
402 
403 	dev->mtu = ip_tunnel_bind_dev(dev);
404 
405 	nt = netdev_priv(dev);
406 	ip_tunnel_add(itn, nt);
407 	return nt;
408 }
409 
410 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
411 		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
412 {
413 	struct pcpu_sw_netstats *tstats;
414 	const struct iphdr *iph = ip_hdr(skb);
415 	int err;
416 
417 #ifdef CONFIG_NET_IPGRE_BROADCAST
418 	if (ipv4_is_multicast(iph->daddr)) {
419 		tunnel->dev->stats.multicast++;
420 		skb->pkt_type = PACKET_BROADCAST;
421 	}
422 #endif
423 
424 	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
425 	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
426 		tunnel->dev->stats.rx_crc_errors++;
427 		tunnel->dev->stats.rx_errors++;
428 		goto drop;
429 	}
430 
431 	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
432 		if (!(tpi->flags&TUNNEL_SEQ) ||
433 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
434 			tunnel->dev->stats.rx_fifo_errors++;
435 			tunnel->dev->stats.rx_errors++;
436 			goto drop;
437 		}
438 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
439 	}
440 
441 	skb_reset_network_header(skb);
442 
443 	err = IP_ECN_decapsulate(iph, skb);
444 	if (unlikely(err)) {
445 		if (log_ecn_error)
446 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
447 					&iph->saddr, iph->tos);
448 		if (err > 1) {
449 			++tunnel->dev->stats.rx_frame_errors;
450 			++tunnel->dev->stats.rx_errors;
451 			goto drop;
452 		}
453 	}
454 
455 	tstats = this_cpu_ptr(tunnel->dev->tstats);
456 	u64_stats_update_begin(&tstats->syncp);
457 	tstats->rx_packets++;
458 	tstats->rx_bytes += skb->len;
459 	u64_stats_update_end(&tstats->syncp);
460 
461 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
462 
463 	if (tunnel->dev->type == ARPHRD_ETHER) {
464 		skb->protocol = eth_type_trans(skb, tunnel->dev);
465 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
466 	} else {
467 		skb->dev = tunnel->dev;
468 	}
469 
470 	gro_cells_receive(&tunnel->gro_cells, skb);
471 	return 0;
472 
473 drop:
474 	kfree_skb(skb);
475 	return 0;
476 }
477 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
478 
479 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
480 			    struct rtable *rt, __be16 df)
481 {
482 	struct ip_tunnel *tunnel = netdev_priv(dev);
483 	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
484 	int mtu;
485 
486 	if (df)
487 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
488 					- sizeof(struct iphdr) - tunnel->hlen;
489 	else
490 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
491 
492 	if (skb_dst(skb))
493 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
494 
495 	if (skb->protocol == htons(ETH_P_IP)) {
496 		if (!skb_is_gso(skb) &&
497 		    (df & htons(IP_DF)) && mtu < pkt_size) {
498 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
499 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
500 			return -E2BIG;
501 		}
502 	}
503 #if IS_ENABLED(CONFIG_IPV6)
504 	else if (skb->protocol == htons(ETH_P_IPV6)) {
505 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
506 
507 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
508 			   mtu >= IPV6_MIN_MTU) {
509 			if ((tunnel->parms.iph.daddr &&
510 			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
511 			    rt6->rt6i_dst.plen == 128) {
512 				rt6->rt6i_flags |= RTF_MODIFIED;
513 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
514 			}
515 		}
516 
517 		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
518 					mtu < pkt_size) {
519 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
520 			return -E2BIG;
521 		}
522 	}
523 #endif
524 	return 0;
525 }
526 
527 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
528 		    const struct iphdr *tnl_params, const u8 protocol)
529 {
530 	struct ip_tunnel *tunnel = netdev_priv(dev);
531 	const struct iphdr *inner_iph;
532 	struct flowi4 fl4;
533 	u8     tos, ttl;
534 	__be16 df;
535 	struct rtable *rt;		/* Route to the other host */
536 	unsigned int max_headroom;	/* The extra header space needed */
537 	__be32 dst;
538 	int err;
539 	bool connected;
540 
541 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
542 	connected = (tunnel->parms.iph.daddr != 0);
543 
544 	dst = tnl_params->daddr;
545 	if (dst == 0) {
546 		/* NBMA tunnel */
547 
548 		if (skb_dst(skb) == NULL) {
549 			dev->stats.tx_fifo_errors++;
550 			goto tx_error;
551 		}
552 
553 		if (skb->protocol == htons(ETH_P_IP)) {
554 			rt = skb_rtable(skb);
555 			dst = rt_nexthop(rt, inner_iph->daddr);
556 		}
557 #if IS_ENABLED(CONFIG_IPV6)
558 		else if (skb->protocol == htons(ETH_P_IPV6)) {
559 			const struct in6_addr *addr6;
560 			struct neighbour *neigh;
561 			bool do_tx_error_icmp;
562 			int addr_type;
563 
564 			neigh = dst_neigh_lookup(skb_dst(skb),
565 						 &ipv6_hdr(skb)->daddr);
566 			if (neigh == NULL)
567 				goto tx_error;
568 
569 			addr6 = (const struct in6_addr *)&neigh->primary_key;
570 			addr_type = ipv6_addr_type(addr6);
571 
572 			if (addr_type == IPV6_ADDR_ANY) {
573 				addr6 = &ipv6_hdr(skb)->daddr;
574 				addr_type = ipv6_addr_type(addr6);
575 			}
576 
577 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
578 				do_tx_error_icmp = true;
579 			else {
580 				do_tx_error_icmp = false;
581 				dst = addr6->s6_addr32[3];
582 			}
583 			neigh_release(neigh);
584 			if (do_tx_error_icmp)
585 				goto tx_error_icmp;
586 		}
587 #endif
588 		else
589 			goto tx_error;
590 
591 		connected = false;
592 	}
593 
594 	tos = tnl_params->tos;
595 	if (tos & 0x1) {
596 		tos &= ~0x1;
597 		if (skb->protocol == htons(ETH_P_IP)) {
598 			tos = inner_iph->tos;
599 			connected = false;
600 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
601 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
602 			connected = false;
603 		}
604 	}
605 
606 	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
607 			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
608 
609 	rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;
610 
611 	if (!rt) {
612 		rt = ip_route_output_key(tunnel->net, &fl4);
613 
614 		if (IS_ERR(rt)) {
615 			dev->stats.tx_carrier_errors++;
616 			goto tx_error;
617 		}
618 		if (connected)
619 			tunnel_dst_set(tunnel, &rt->dst);
620 	}
621 
622 	if (rt->dst.dev == dev) {
623 		ip_rt_put(rt);
624 		dev->stats.collisions++;
625 		goto tx_error;
626 	}
627 
628 	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
629 		ip_rt_put(rt);
630 		goto tx_error;
631 	}
632 
633 	if (tunnel->err_count > 0) {
634 		if (time_before(jiffies,
635 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
636 			tunnel->err_count--;
637 
638 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
639 			dst_link_failure(skb);
640 		} else
641 			tunnel->err_count = 0;
642 	}
643 
644 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
645 	ttl = tnl_params->ttl;
646 	if (ttl == 0) {
647 		if (skb->protocol == htons(ETH_P_IP))
648 			ttl = inner_iph->ttl;
649 #if IS_ENABLED(CONFIG_IPV6)
650 		else if (skb->protocol == htons(ETH_P_IPV6))
651 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
652 #endif
653 		else
654 			ttl = ip4_dst_hoplimit(&rt->dst);
655 	}
656 
657 	df = tnl_params->frag_off;
658 	if (skb->protocol == htons(ETH_P_IP))
659 		df |= (inner_iph->frag_off&htons(IP_DF));
660 
661 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
662 			+ rt->dst.header_len;
663 	if (max_headroom > dev->needed_headroom)
664 		dev->needed_headroom = max_headroom;
665 
666 	if (skb_cow_head(skb, dev->needed_headroom)) {
667 		ip_rt_put(rt);
668 		dev->stats.tx_dropped++;
669 		kfree_skb(skb);
670 		return;
671 	}
672 
673 	err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
674 			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
675 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
676 
677 	return;
678 
679 #if IS_ENABLED(CONFIG_IPV6)
680 tx_error_icmp:
681 	dst_link_failure(skb);
682 #endif
683 tx_error:
684 	dev->stats.tx_errors++;
685 	kfree_skb(skb);
686 }
687 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
688 
689 static void ip_tunnel_update(struct ip_tunnel_net *itn,
690 			     struct ip_tunnel *t,
691 			     struct net_device *dev,
692 			     struct ip_tunnel_parm *p,
693 			     bool set_mtu)
694 {
695 	ip_tunnel_del(t);
696 	t->parms.iph.saddr = p->iph.saddr;
697 	t->parms.iph.daddr = p->iph.daddr;
698 	t->parms.i_key = p->i_key;
699 	t->parms.o_key = p->o_key;
700 	if (dev->type != ARPHRD_ETHER) {
701 		memcpy(dev->dev_addr, &p->iph.saddr, 4);
702 		memcpy(dev->broadcast, &p->iph.daddr, 4);
703 	}
704 	ip_tunnel_add(itn, t);
705 
706 	t->parms.iph.ttl = p->iph.ttl;
707 	t->parms.iph.tos = p->iph.tos;
708 	t->parms.iph.frag_off = p->iph.frag_off;
709 
710 	if (t->parms.link != p->link) {
711 		int mtu;
712 
713 		t->parms.link = p->link;
714 		mtu = ip_tunnel_bind_dev(dev);
715 		if (set_mtu)
716 			dev->mtu = mtu;
717 	}
718 	ip_tunnel_dst_reset_all(t);
719 	netdev_state_change(dev);
720 }
721 
722 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
723 {
724 	int err = 0;
725 	struct ip_tunnel *t = netdev_priv(dev);
726 	struct net *net = t->net;
727 	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
728 
729 	BUG_ON(!itn->fb_tunnel_dev);
730 	switch (cmd) {
731 	case SIOCGETTUNNEL:
732 		if (dev == itn->fb_tunnel_dev) {
733 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
734 			if (t == NULL)
735 				t = netdev_priv(dev);
736 		}
737 		memcpy(p, &t->parms, sizeof(*p));
738 		break;
739 
740 	case SIOCADDTUNNEL:
741 	case SIOCCHGTUNNEL:
742 		err = -EPERM;
743 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
744 			goto done;
745 		if (p->iph.ttl)
746 			p->iph.frag_off |= htons(IP_DF);
747 		if (!(p->i_flags & VTI_ISVTI)) {
748 			if (!(p->i_flags & TUNNEL_KEY))
749 				p->i_key = 0;
750 			if (!(p->o_flags & TUNNEL_KEY))
751 				p->o_key = 0;
752 		}
753 
754 		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
755 
756 		if (!t && (cmd == SIOCADDTUNNEL)) {
757 			t = ip_tunnel_create(net, itn, p);
758 			err = PTR_ERR_OR_ZERO(t);
759 			break;
760 		}
761 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
762 			if (t != NULL) {
763 				if (t->dev != dev) {
764 					err = -EEXIST;
765 					break;
766 				}
767 			} else {
768 				unsigned int nflags = 0;
769 
770 				if (ipv4_is_multicast(p->iph.daddr))
771 					nflags = IFF_BROADCAST;
772 				else if (p->iph.daddr)
773 					nflags = IFF_POINTOPOINT;
774 
775 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
776 					err = -EINVAL;
777 					break;
778 				}
779 
780 				t = netdev_priv(dev);
781 			}
782 		}
783 
784 		if (t) {
785 			err = 0;
786 			ip_tunnel_update(itn, t, dev, p, true);
787 		} else {
788 			err = -ENOENT;
789 		}
790 		break;
791 
792 	case SIOCDELTUNNEL:
793 		err = -EPERM;
794 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
795 			goto done;
796 
797 		if (dev == itn->fb_tunnel_dev) {
798 			err = -ENOENT;
799 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
800 			if (t == NULL)
801 				goto done;
802 			err = -EPERM;
803 			if (t == netdev_priv(itn->fb_tunnel_dev))
804 				goto done;
805 			dev = t->dev;
806 		}
807 		unregister_netdevice(dev);
808 		err = 0;
809 		break;
810 
811 	default:
812 		err = -EINVAL;
813 	}
814 
815 done:
816 	return err;
817 }
818 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
819 
820 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
821 {
822 	struct ip_tunnel *tunnel = netdev_priv(dev);
823 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
824 
825 	if (new_mtu < 68 ||
826 	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
827 		return -EINVAL;
828 	dev->mtu = new_mtu;
829 	return 0;
830 }
831 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
832 
833 static void ip_tunnel_dev_free(struct net_device *dev)
834 {
835 	struct ip_tunnel *tunnel = netdev_priv(dev);
836 
837 	gro_cells_destroy(&tunnel->gro_cells);
838 	free_percpu(tunnel->dst_cache);
839 	free_percpu(dev->tstats);
840 	free_netdev(dev);
841 }
842 
843 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
844 {
845 	struct ip_tunnel *tunnel = netdev_priv(dev);
846 	struct ip_tunnel_net *itn;
847 
848 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
849 
850 	if (itn->fb_tunnel_dev != dev) {
851 		ip_tunnel_del(netdev_priv(dev));
852 		unregister_netdevice_queue(dev, head);
853 	}
854 }
855 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
856 
857 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
858 				  struct rtnl_link_ops *ops, char *devname)
859 {
860 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
861 	struct ip_tunnel_parm parms;
862 	unsigned int i;
863 
864 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
865 		INIT_HLIST_HEAD(&itn->tunnels[i]);
866 
867 	if (!ops) {
868 		itn->fb_tunnel_dev = NULL;
869 		return 0;
870 	}
871 
872 	memset(&parms, 0, sizeof(parms));
873 	if (devname)
874 		strlcpy(parms.name, devname, IFNAMSIZ);
875 
876 	rtnl_lock();
877 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
878 	/* FB netdevice is special: we have one, and only one per netns.
879 	 * Allowing to move it to another netns is clearly unsafe.
880 	 */
881 	if (!IS_ERR(itn->fb_tunnel_dev)) {
882 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
883 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
884 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
885 	}
886 	rtnl_unlock();
887 
888 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
889 }
890 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
891 
892 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
893 			      struct rtnl_link_ops *ops)
894 {
895 	struct net *net = dev_net(itn->fb_tunnel_dev);
896 	struct net_device *dev, *aux;
897 	int h;
898 
899 	for_each_netdev_safe(net, dev, aux)
900 		if (dev->rtnl_link_ops == ops)
901 			unregister_netdevice_queue(dev, head);
902 
903 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
904 		struct ip_tunnel *t;
905 		struct hlist_node *n;
906 		struct hlist_head *thead = &itn->tunnels[h];
907 
908 		hlist_for_each_entry_safe(t, n, thead, hash_node)
909 			/* If dev is in the same netns, it has already
910 			 * been added to the list by the previous loop.
911 			 */
912 			if (!net_eq(dev_net(t->dev), net))
913 				unregister_netdevice_queue(t->dev, head);
914 	}
915 }
916 
917 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
918 {
919 	LIST_HEAD(list);
920 
921 	rtnl_lock();
922 	ip_tunnel_destroy(itn, &list, ops);
923 	unregister_netdevice_many(&list);
924 	rtnl_unlock();
925 }
926 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
927 
928 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
929 		      struct ip_tunnel_parm *p)
930 {
931 	struct ip_tunnel *nt;
932 	struct net *net = dev_net(dev);
933 	struct ip_tunnel_net *itn;
934 	int mtu;
935 	int err;
936 
937 	nt = netdev_priv(dev);
938 	itn = net_generic(net, nt->ip_tnl_net_id);
939 
940 	if (ip_tunnel_find(itn, p, dev->type))
941 		return -EEXIST;
942 
943 	nt->net = net;
944 	nt->parms = *p;
945 	err = register_netdevice(dev);
946 	if (err)
947 		goto out;
948 
949 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
950 		eth_hw_addr_random(dev);
951 
952 	mtu = ip_tunnel_bind_dev(dev);
953 	if (!tb[IFLA_MTU])
954 		dev->mtu = mtu;
955 
956 	ip_tunnel_add(itn, nt);
957 
958 out:
959 	return err;
960 }
961 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
962 
963 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
964 			 struct ip_tunnel_parm *p)
965 {
966 	struct ip_tunnel *t;
967 	struct ip_tunnel *tunnel = netdev_priv(dev);
968 	struct net *net = tunnel->net;
969 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
970 
971 	if (dev == itn->fb_tunnel_dev)
972 		return -EINVAL;
973 
974 	t = ip_tunnel_find(itn, p, dev->type);
975 
976 	if (t) {
977 		if (t->dev != dev)
978 			return -EEXIST;
979 	} else {
980 		t = tunnel;
981 
982 		if (dev->type != ARPHRD_ETHER) {
983 			unsigned int nflags = 0;
984 
985 			if (ipv4_is_multicast(p->iph.daddr))
986 				nflags = IFF_BROADCAST;
987 			else if (p->iph.daddr)
988 				nflags = IFF_POINTOPOINT;
989 
990 			if ((dev->flags ^ nflags) &
991 			    (IFF_POINTOPOINT | IFF_BROADCAST))
992 				return -EINVAL;
993 		}
994 	}
995 
996 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
997 	return 0;
998 }
999 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1000 
1001 int ip_tunnel_init(struct net_device *dev)
1002 {
1003 	struct ip_tunnel *tunnel = netdev_priv(dev);
1004 	struct iphdr *iph = &tunnel->parms.iph;
1005 	int err;
1006 
1007 	dev->destructor	= ip_tunnel_dev_free;
1008 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1009 	if (!dev->tstats)
1010 		return -ENOMEM;
1011 
1012 	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1013 	if (!tunnel->dst_cache) {
1014 		free_percpu(dev->tstats);
1015 		return -ENOMEM;
1016 	}
1017 
1018 	err = gro_cells_init(&tunnel->gro_cells, dev);
1019 	if (err) {
1020 		free_percpu(tunnel->dst_cache);
1021 		free_percpu(dev->tstats);
1022 		return err;
1023 	}
1024 
1025 	tunnel->dev = dev;
1026 	tunnel->net = dev_net(dev);
1027 	strcpy(tunnel->parms.name, dev->name);
1028 	iph->version		= 4;
1029 	iph->ihl		= 5;
1030 
1031 	return 0;
1032 }
1033 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1034 
1035 void ip_tunnel_uninit(struct net_device *dev)
1036 {
1037 	struct ip_tunnel *tunnel = netdev_priv(dev);
1038 	struct net *net = tunnel->net;
1039 	struct ip_tunnel_net *itn;
1040 
1041 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1042 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1043 	if (itn->fb_tunnel_dev != dev)
1044 		ip_tunnel_del(netdev_priv(dev));
1045 
1046 	ip_tunnel_dst_reset_all(tunnel);
1047 }
1048 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1049 
1050 /* Do least required initialization, rest of init is done in tunnel_init call */
1051 void ip_tunnel_setup(struct net_device *dev, int net_id)
1052 {
1053 	struct ip_tunnel *tunnel = netdev_priv(dev);
1054 	tunnel->ip_tnl_net_id = net_id;
1055 }
1056 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1057 
1058 MODULE_LICENSE("GPL");
1059