xref: /linux/net/ipv4/ip_tunnel.c (revision 3b812ecce736432e6b55e77028ea387eb1517d24)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43 
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 
59 #if IS_ENABLED(CONFIG_IPV6)
60 #include <net/ipv6.h>
61 #include <net/ip6_fib.h>
62 #include <net/ip6_route.h>
63 #endif
64 
65 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
66 {
67 	return hash_32((__force u32)key ^ (__force u32)remote,
68 			 IP_TNL_HASH_BITS);
69 }
70 
71 static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
72 			     struct dst_entry *dst, __be32 saddr)
73 {
74 	struct dst_entry *old_dst;
75 
76 	dst_clone(dst);
77 	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
78 	dst_release(old_dst);
79 	idst->saddr = saddr;
80 }
81 
82 static noinline void tunnel_dst_set(struct ip_tunnel *t,
83 			   struct dst_entry *dst, __be32 saddr)
84 {
85 	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
86 }
87 
88 static void tunnel_dst_reset(struct ip_tunnel *t)
89 {
90 	tunnel_dst_set(t, NULL, 0);
91 }
92 
93 void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
94 {
95 	int i;
96 
97 	for_each_possible_cpu(i)
98 		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
99 }
100 EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
101 
102 static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
103 					u32 cookie, __be32 *saddr)
104 {
105 	struct ip_tunnel_dst *idst;
106 	struct dst_entry *dst;
107 
108 	rcu_read_lock();
109 	idst = raw_cpu_ptr(t->dst_cache);
110 	dst = rcu_dereference(idst->dst);
111 	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
112 		dst = NULL;
113 	if (dst) {
114 		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
115 			*saddr = idst->saddr;
116 		} else {
117 			tunnel_dst_reset(t);
118 			dst_release(dst);
119 			dst = NULL;
120 		}
121 	}
122 	rcu_read_unlock();
123 	return (struct rtable *)dst;
124 }
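
/* Example (editor's sketch, grounded in this file): the per-cpu dst
 * cache above is used from the transmit path roughly like this,
 * mirroring ip_tunnel_xmit() below:
 *
 *	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
 *	if (!rt) {
 *		rt = ip_route_output_key(tunnel->net, &fl4);
 *		if (IS_ERR(rt))
 *			goto tx_error;
 *		if (connected)
 *			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
 *	}
 *
 * Only "connected" tunnels (fixed remote endpoint) may use the cache,
 * since the cached route is keyed on nothing but the CPU that stored it.
 */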
125 
126 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
127 				__be16 flags, __be32 key)
128 {
129 	if (p->i_flags & TUNNEL_KEY) {
130 		if (flags & TUNNEL_KEY)
131 			return key == p->i_key;
132 		else
133 			/* key expected, none present */
134 			return false;
135 	} else
136 		return !(flags & TUNNEL_KEY);
137 }
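
/* Editor's summary of the rules above, as a small truth table:
 *
 *	tunnel has key	packet has key	result
 *	yes		yes		match iff key == p->i_key
 *	yes		no		no match
 *	no		yes		no match
 *	no		no		match
 */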
138 
139 /* Fallback tunnel: no source, no destination, no key, no options
140 
141    Tunnel hash table:
142    We require an exact key match, i.e. if a key is present in the packet
143    it will match only a tunnel with the same key; if it is not present,
144    it will match only a keyless tunnel.
145 
146    All keyless packets, if not matched against a configured keyless
147    tunnel, will match the fallback tunnel.
148    Given src, dst and key, find the appropriate tunnel for input.
149 */
150 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
151 				   int link, __be16 flags,
152 				   __be32 remote, __be32 local,
153 				   __be32 key)
154 {
155 	unsigned int hash;
156 	struct ip_tunnel *t, *cand = NULL;
157 	struct hlist_head *head;
158 
159 	hash = ip_tunnel_hash(key, remote);
160 	head = &itn->tunnels[hash];
161 
162 	hlist_for_each_entry_rcu(t, head, hash_node) {
163 		if (local != t->parms.iph.saddr ||
164 		    remote != t->parms.iph.daddr ||
165 		    !(t->dev->flags & IFF_UP))
166 			continue;
167 
168 		if (!ip_tunnel_key_match(&t->parms, flags, key))
169 			continue;
170 
171 		if (t->parms.link == link)
172 			return t;
173 		else
174 			cand = t;
175 	}
176 
177 	hlist_for_each_entry_rcu(t, head, hash_node) {
178 		if (remote != t->parms.iph.daddr ||
179 		    t->parms.iph.saddr != 0 ||
180 		    !(t->dev->flags & IFF_UP))
181 			continue;
182 
183 		if (!ip_tunnel_key_match(&t->parms, flags, key))
184 			continue;
185 
186 		if (t->parms.link == link)
187 			return t;
188 		else if (!cand)
189 			cand = t;
190 	}
191 
192 	hash = ip_tunnel_hash(key, 0);
193 	head = &itn->tunnels[hash];
194 
195 	hlist_for_each_entry_rcu(t, head, hash_node) {
196 		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
197 		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
198 			continue;
199 
200 		if (!(t->dev->flags & IFF_UP))
201 			continue;
202 
203 		if (!ip_tunnel_key_match(&t->parms, flags, key))
204 			continue;
205 
206 		if (t->parms.link == link)
207 			return t;
208 		else if (!cand)
209 			cand = t;
210 	}
211 
212 	if (flags & TUNNEL_NO_KEY)
213 		goto skip_key_lookup;
214 
215 	hlist_for_each_entry_rcu(t, head, hash_node) {
216 		if (t->parms.i_key != key ||
217 		    t->parms.iph.saddr != 0 ||
218 		    t->parms.iph.daddr != 0 ||
219 		    !(t->dev->flags & IFF_UP))
220 			continue;
221 
222 		if (t->parms.link == link)
223 			return t;
224 		else if (!cand)
225 			cand = t;
226 	}
227 
228 skip_key_lookup:
229 	if (cand)
230 		return cand;
231 
232 	t = rcu_dereference(itn->collect_md_tun);
233 	if (t)
234 		return t;
235 
236 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
237 		return netdev_priv(itn->fb_tunnel_dev);
238 
239 	return NULL;
240 }
241 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
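
/* Example (editor's sketch, modeled on a GRE-style receive handler): a
 * protocol demultiplexer typically resolves the tunnel from the outer
 * IP header and the parsed tunnel metadata ("tpi") like this:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	struct ip_tunnel *tunnel;
 *
 *	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 *				  iph->saddr, iph->daddr, tpi->key);
 *	if (tunnel)
 *		return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
 *
 * Note the argument order: the packet's source address is the tunnel's
 * remote endpoint, so iph->saddr is passed as "remote".
 */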
242 
243 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
244 				    struct ip_tunnel_parm *parms)
245 {
246 	unsigned int h;
247 	__be32 remote;
248 	__be32 i_key = parms->i_key;
249 
250 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
251 		remote = parms->iph.daddr;
252 	else
253 		remote = 0;
254 
255 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
256 		i_key = 0;
257 
258 	h = ip_tunnel_hash(i_key, remote);
259 	return &itn->tunnels[h];
260 }
261 
262 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
263 {
264 	struct hlist_head *head = ip_bucket(itn, &t->parms);
265 
266 	if (t->collect_md)
267 		rcu_assign_pointer(itn->collect_md_tun, t);
268 	hlist_add_head_rcu(&t->hash_node, head);
269 }
270 
271 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
272 {
273 	if (t->collect_md)
274 		rcu_assign_pointer(itn->collect_md_tun, NULL);
275 	hlist_del_init_rcu(&t->hash_node);
276 }
277 
278 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
279 					struct ip_tunnel_parm *parms,
280 					int type)
281 {
282 	__be32 remote = parms->iph.daddr;
283 	__be32 local = parms->iph.saddr;
284 	__be32 key = parms->i_key;
285 	__be16 flags = parms->i_flags;
286 	int link = parms->link;
287 	struct ip_tunnel *t = NULL;
288 	struct hlist_head *head = ip_bucket(itn, parms);
289 
290 	hlist_for_each_entry_rcu(t, head, hash_node) {
291 		if (local == t->parms.iph.saddr &&
292 		    remote == t->parms.iph.daddr &&
293 		    link == t->parms.link &&
294 		    type == t->dev->type &&
295 		    ip_tunnel_key_match(&t->parms, flags, key))
296 			break;
297 	}
298 	return t;
299 }
300 
301 static struct net_device *__ip_tunnel_create(struct net *net,
302 					     const struct rtnl_link_ops *ops,
303 					     struct ip_tunnel_parm *parms)
304 {
305 	int err;
306 	struct ip_tunnel *tunnel;
307 	struct net_device *dev;
308 	char name[IFNAMSIZ];
309 
310 	if (parms->name[0])
311 		strlcpy(name, parms->name, IFNAMSIZ);
312 	else {
313 		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
314 			err = -E2BIG;
315 			goto failed;
316 		}
317 		strlcpy(name, ops->kind, IFNAMSIZ);
318 		strncat(name, "%d", 2);
319 	}
320 
321 	ASSERT_RTNL();
322 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
323 	if (!dev) {
324 		err = -ENOMEM;
325 		goto failed;
326 	}
327 	dev_net_set(dev, net);
328 
329 	dev->rtnl_link_ops = ops;
330 
331 	tunnel = netdev_priv(dev);
332 	tunnel->parms = *parms;
333 	tunnel->net = net;
334 
335 	err = register_netdevice(dev);
336 	if (err)
337 		goto failed_free;
338 
339 	return dev;
340 
341 failed_free:
342 	free_netdev(dev);
343 failed:
344 	return ERR_PTR(err);
345 }
346 
347 static inline void init_tunnel_flow(struct flowi4 *fl4,
348 				    int proto,
349 				    __be32 daddr, __be32 saddr,
350 				    __be32 key, __u8 tos, int oif)
351 {
352 	memset(fl4, 0, sizeof(*fl4));
353 	fl4->flowi4_oif = oif;
354 	fl4->daddr = daddr;
355 	fl4->saddr = saddr;
356 	fl4->flowi4_tos = tos;
357 	fl4->flowi4_proto = proto;
358 	fl4->fl4_gre_key = key;
359 }
360 
361 static int ip_tunnel_bind_dev(struct net_device *dev)
362 {
363 	struct net_device *tdev = NULL;
364 	struct ip_tunnel *tunnel = netdev_priv(dev);
365 	const struct iphdr *iph;
366 	int hlen = LL_MAX_HEADER;
367 	int mtu = ETH_DATA_LEN;
368 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
369 
370 	iph = &tunnel->parms.iph;
371 
372 	/* Guess output device to choose reasonable mtu and needed_headroom */
373 	if (iph->daddr) {
374 		struct flowi4 fl4;
375 		struct rtable *rt;
376 
377 		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
378 				 iph->saddr, tunnel->parms.o_key,
379 				 RT_TOS(iph->tos), tunnel->parms.link);
380 		rt = ip_route_output_key(tunnel->net, &fl4);
381 
382 		if (!IS_ERR(rt)) {
383 			tdev = rt->dst.dev;
384 			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
385 			ip_rt_put(rt);
386 		}
387 		if (dev->type != ARPHRD_ETHER)
388 			dev->flags |= IFF_POINTOPOINT;
389 	}
390 
391 	if (!tdev && tunnel->parms.link)
392 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
393 
394 	if (tdev) {
395 		hlen = tdev->hard_header_len + tdev->needed_headroom;
396 		mtu = tdev->mtu;
397 	}
398 
399 	dev->needed_headroom = t_hlen + hlen;
400 	mtu -= (dev->hard_header_len + t_hlen);
401 
402 	if (mtu < 68)
403 		mtu = 68;
404 
405 	return mtu;
406 }
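
/* Worked example (editor's note): for a classic GRE tunnel without key
 * or checksum, tunnel->hlen is 4, so t_hlen = 4 + 20 = 24.  Routed over
 * an Ethernet underlay with mtu 1500, and with hard_header_len 0 on the
 * tunnel device itself, this yields dev->mtu = 1500 - (0 + 24) = 1476,
 * the familiar GRE MTU.
 */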
407 
408 static struct ip_tunnel *ip_tunnel_create(struct net *net,
409 					  struct ip_tunnel_net *itn,
410 					  struct ip_tunnel_parm *parms)
411 {
412 	struct ip_tunnel *nt;
413 	struct net_device *dev;
414 
415 	BUG_ON(!itn->fb_tunnel_dev);
416 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
417 	if (IS_ERR(dev))
418 		return ERR_CAST(dev);
419 
420 	dev->mtu = ip_tunnel_bind_dev(dev);
421 
422 	nt = netdev_priv(dev);
423 	ip_tunnel_add(itn, nt);
424 	return nt;
425 }
426 
427 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
428 		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
429 		  bool log_ecn_error)
430 {
431 	struct pcpu_sw_netstats *tstats;
432 	const struct iphdr *iph = ip_hdr(skb);
433 	int err;
434 
435 #ifdef CONFIG_NET_IPGRE_BROADCAST
436 	if (ipv4_is_multicast(iph->daddr)) {
437 		tunnel->dev->stats.multicast++;
438 		skb->pkt_type = PACKET_BROADCAST;
439 	}
440 #endif
441 
442 	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
443 	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
444 		tunnel->dev->stats.rx_crc_errors++;
445 		tunnel->dev->stats.rx_errors++;
446 		goto drop;
447 	}
448 
449 	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
450 		if (!(tpi->flags&TUNNEL_SEQ) ||
451 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
452 			tunnel->dev->stats.rx_fifo_errors++;
453 			tunnel->dev->stats.rx_errors++;
454 			goto drop;
455 		}
456 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
457 	}
458 
459 	skb_reset_network_header(skb);
460 
461 	err = IP_ECN_decapsulate(iph, skb);
462 	if (unlikely(err)) {
463 		if (log_ecn_error)
464 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
465 					&iph->saddr, iph->tos);
466 		if (err > 1) {
467 			++tunnel->dev->stats.rx_frame_errors;
468 			++tunnel->dev->stats.rx_errors;
469 			goto drop;
470 		}
471 	}
472 
473 	tstats = this_cpu_ptr(tunnel->dev->tstats);
474 	u64_stats_update_begin(&tstats->syncp);
475 	tstats->rx_packets++;
476 	tstats->rx_bytes += skb->len;
477 	u64_stats_update_end(&tstats->syncp);
478 
479 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
480 
481 	if (tunnel->dev->type == ARPHRD_ETHER) {
482 		skb->protocol = eth_type_trans(skb, tunnel->dev);
483 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
484 	} else {
485 		skb->dev = tunnel->dev;
486 	}
487 
488 	if (tun_dst)
489 		skb_dst_set(skb, (struct dst_entry *)tun_dst);
490 
491 	gro_cells_receive(&tunnel->gro_cells, skb);
492 	return 0;
493 
494 drop:
495 	kfree_skb(skb);
496 	return 0;
497 }
498 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
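
/* Example (editor's sketch): callers are expected to have parsed the
 * encapsulation header into "tpi" and pulled it off the skb before
 * calling in, e.g.:
 *
 *	if (iptunnel_pull_header(skb, hdr_len, tpi->proto) < 0)
 *		goto drop;
 *	return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
 *
 * (iptunnel_pull_header() arguments abbreviated; the exact signature
 * varies between kernel versions.)  Note that ip_tunnel_rcv() takes
 * ownership of the skb on every path, including the drop path.
 */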
499 
500 static int ip_encap_hlen(struct ip_tunnel_encap *e)
501 {
502 	const struct ip_tunnel_encap_ops *ops;
503 	int hlen = -EINVAL;
504 
505 	if (e->type == TUNNEL_ENCAP_NONE)
506 		return 0;
507 
508 	if (e->type >= MAX_IPTUN_ENCAP_OPS)
509 		return -EINVAL;
510 
511 	rcu_read_lock();
512 	ops = rcu_dereference(iptun_encaps[e->type]);
513 	if (likely(ops && ops->encap_hlen))
514 		hlen = ops->encap_hlen(e);
515 	rcu_read_unlock();
516 
517 	return hlen;
518 }
519 
520 const struct ip_tunnel_encap_ops __rcu *
521 		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
522 
523 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
524 			    unsigned int num)
525 {
526 	if (num >= MAX_IPTUN_ENCAP_OPS)
527 		return -ERANGE;
528 
529 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
530 			&iptun_encaps[num],
531 			NULL, ops) ? 0 : -1;
532 }
533 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
534 
535 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
536 			    unsigned int num)
537 {
538 	int ret;
539 
540 	if (num >= MAX_IPTUN_ENCAP_OPS)
541 		return -ERANGE;
542 
543 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
544 		       &iptun_encaps[num],
545 		       ops, NULL) == ops) ? 0 : -1;
546 
547 	synchronize_net();
548 
549 	return ret;
550 }
551 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
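
/* Example (editor's sketch with hypothetical names): an encapsulation
 * module such as FOU registers its ops for a fixed slot and removes
 * them again on module unload:
 *
 *	static const struct ip_tunnel_encap_ops my_encap_ops = {
 *		.encap_hlen	= my_encap_hlen,
 *		.build_header	= my_build_header,
 *	};
 *
 *	err = ip_tunnel_encap_add_ops(&my_encap_ops, TUNNEL_ENCAP_FOU);
 *	...
 *	ip_tunnel_encap_del_ops(&my_encap_ops, TUNNEL_ENCAP_FOU);
 *
 * add_ops fails if the slot is already taken; del_ops only clears the
 * slot if it still holds the same ops pointer.
 */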
552 
553 int ip_tunnel_encap_setup(struct ip_tunnel *t,
554 			  struct ip_tunnel_encap *ipencap)
555 {
556 	int hlen;
557 
558 	memset(&t->encap, 0, sizeof(t->encap));
559 
560 	hlen = ip_encap_hlen(ipencap);
561 	if (hlen < 0)
562 		return hlen;
563 
564 	t->encap.type = ipencap->type;
565 	t->encap.sport = ipencap->sport;
566 	t->encap.dport = ipencap->dport;
567 	t->encap.flags = ipencap->flags;
568 
569 	t->encap_hlen = hlen;
570 	t->hlen = t->encap_hlen + t->tun_hlen;
571 
572 	return 0;
573 }
574 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
575 
576 int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
577 		    u8 *protocol, struct flowi4 *fl4)
578 {
579 	const struct ip_tunnel_encap_ops *ops;
580 	int ret = -EINVAL;
581 
582 	if (t->encap.type == TUNNEL_ENCAP_NONE)
583 		return 0;
584 
585 	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
586 		return -EINVAL;
587 
588 	rcu_read_lock();
589 	ops = rcu_dereference(iptun_encaps[t->encap.type]);
590 	if (likely(ops && ops->build_header))
591 		ret = ops->build_header(skb, &t->encap, protocol, fl4);
592 	rcu_read_unlock();
593 
594 	return ret;
595 }
596 EXPORT_SYMBOL(ip_tunnel_encap);
597 
598 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
599 			    struct rtable *rt, __be16 df,
600 			    const struct iphdr *inner_iph)
601 {
602 	struct ip_tunnel *tunnel = netdev_priv(dev);
603 	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
604 	int mtu;
605 
606 	if (df)
607 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
608 					- sizeof(struct iphdr) - tunnel->hlen;
609 	else
610 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
611 
612 	if (skb_dst(skb))
613 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
614 
615 	if (skb->protocol == htons(ETH_P_IP)) {
616 		if (!skb_is_gso(skb) &&
617 		    (inner_iph->frag_off & htons(IP_DF)) &&
618 		    mtu < pkt_size) {
619 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
620 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
621 			return -E2BIG;
622 		}
623 	}
624 #if IS_ENABLED(CONFIG_IPV6)
625 	else if (skb->protocol == htons(ETH_P_IPV6)) {
626 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
627 
628 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
629 			   mtu >= IPV6_MIN_MTU) {
630 			if ((tunnel->parms.iph.daddr &&
631 			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
632 			    rt6->rt6i_dst.plen == 128) {
633 				rt6->rt6i_flags |= RTF_MODIFIED;
634 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
635 			}
636 		}
637 
638 		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
639 					mtu < pkt_size) {
640 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
641 			return -E2BIG;
642 		}
643 	}
644 #endif
645 	return 0;
646 }
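
/* Worked example (editor's note): with df set and an underlay route
 * whose dst_mtu() is 1500, a GRE device (tunnel->hlen == 4,
 * hard_header_len == 0) computes mtu = 1500 - 0 - 20 - 4 = 1476.  A
 * 1500-byte inner IPv4 packet carrying IP_DF then fails the
 * mtu < pkt_size test and is answered with ICMP_FRAG_NEEDED
 * quoting 1476.
 */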
647 
648 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
649 		    const struct iphdr *tnl_params, u8 protocol)
650 {
651 	struct ip_tunnel *tunnel = netdev_priv(dev);
652 	const struct iphdr *inner_iph;
653 	struct flowi4 fl4;
654 	u8     tos, ttl;
655 	__be16 df;
656 	struct rtable *rt;		/* Route to the other host */
657 	unsigned int max_headroom;	/* The extra header space needed */
658 	__be32 dst;
659 	bool connected;
660 
661 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
662 	connected = (tunnel->parms.iph.daddr != 0);
663 
664 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
665 
666 	dst = tnl_params->daddr;
667 	if (dst == 0) {
668 		/* NBMA tunnel */
669 
670 		if (!skb_dst(skb)) {
671 			dev->stats.tx_fifo_errors++;
672 			goto tx_error;
673 		}
674 
675 		if (skb->protocol == htons(ETH_P_IP)) {
676 			rt = skb_rtable(skb);
677 			dst = rt_nexthop(rt, inner_iph->daddr);
678 		}
679 #if IS_ENABLED(CONFIG_IPV6)
680 		else if (skb->protocol == htons(ETH_P_IPV6)) {
681 			const struct in6_addr *addr6;
682 			struct neighbour *neigh;
683 			bool do_tx_error_icmp;
684 			int addr_type;
685 
686 			neigh = dst_neigh_lookup(skb_dst(skb),
687 						 &ipv6_hdr(skb)->daddr);
688 			if (!neigh)
689 				goto tx_error;
690 
691 			addr6 = (const struct in6_addr *)&neigh->primary_key;
692 			addr_type = ipv6_addr_type(addr6);
693 
694 			if (addr_type == IPV6_ADDR_ANY) {
695 				addr6 = &ipv6_hdr(skb)->daddr;
696 				addr_type = ipv6_addr_type(addr6);
697 			}
698 
699 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
700 				do_tx_error_icmp = true;
701 			else {
702 				do_tx_error_icmp = false;
703 				dst = addr6->s6_addr32[3];
704 			}
705 			neigh_release(neigh);
706 			if (do_tx_error_icmp)
707 				goto tx_error_icmp;
708 		}
709 #endif
710 		else
711 			goto tx_error;
712 
713 		connected = false;
714 	}
715 
716 	tos = tnl_params->tos;
717 	if (tos & 0x1) {
718 		tos &= ~0x1;
719 		if (skb->protocol == htons(ETH_P_IP)) {
720 			tos = inner_iph->tos;
721 			connected = false;
722 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
723 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
724 			connected = false;
725 		}
726 	}
727 
728 	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
729 			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
730 
731 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
732 		goto tx_error;
733 
734 	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
735 
736 	if (!rt) {
737 		rt = ip_route_output_key(tunnel->net, &fl4);
738 
739 		if (IS_ERR(rt)) {
740 			dev->stats.tx_carrier_errors++;
741 			goto tx_error;
742 		}
743 		if (connected)
744 			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
745 	}
746 
747 	if (rt->dst.dev == dev) {
748 		ip_rt_put(rt);
749 		dev->stats.collisions++;
750 		goto tx_error;
751 	}
752 
753 	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
754 		ip_rt_put(rt);
755 		goto tx_error;
756 	}
757 
758 	if (tunnel->err_count > 0) {
759 		if (time_before(jiffies,
760 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
761 			tunnel->err_count--;
762 
763 			dst_link_failure(skb);
764 		} else
765 			tunnel->err_count = 0;
766 	}
767 
768 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
769 	ttl = tnl_params->ttl;
770 	if (ttl == 0) {
771 		if (skb->protocol == htons(ETH_P_IP))
772 			ttl = inner_iph->ttl;
773 #if IS_ENABLED(CONFIG_IPV6)
774 		else if (skb->protocol == htons(ETH_P_IPV6))
775 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
776 #endif
777 		else
778 			ttl = ip4_dst_hoplimit(&rt->dst);
779 	}
780 
781 	df = tnl_params->frag_off;
782 	if (skb->protocol == htons(ETH_P_IP))
783 		df |= (inner_iph->frag_off&htons(IP_DF));
784 
785 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
786 			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
787 	if (max_headroom > dev->needed_headroom)
788 		dev->needed_headroom = max_headroom;
789 
790 	if (skb_cow_head(skb, dev->needed_headroom)) {
791 		ip_rt_put(rt);
792 		dev->stats.tx_dropped++;
793 		kfree_skb(skb);
794 		return;
795 	}
796 
797 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
798 		      df, !net_eq(tunnel->net, dev_net(dev)));
799 	return;
800 
801 #if IS_ENABLED(CONFIG_IPV6)
802 tx_error_icmp:
803 	dst_link_failure(skb);
804 #endif
805 tx_error:
806 	dev->stats.tx_errors++;
807 	kfree_skb(skb);
808 }
809 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
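
/* Example (editor's sketch, modeled on the GRE driver): a tunnel
 * driver's ndo_start_xmit typically builds its own encapsulation
 * header first and then hands the skb here, with the template outer
 * IP header taken from its parms:
 *
 *	static netdev_tx_t my_tunnel_xmit(struct sk_buff *skb,
 *					  struct net_device *dev)
 *	{
 *		struct ip_tunnel *tunnel = netdev_priv(dev);
 *		const struct iphdr *tnl_params = &tunnel->parms.iph;
 *
 *		ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
 *		return NETDEV_TX_OK;
 *	}
 *
 * (my_tunnel_xmit is a hypothetical name; real drivers also push their
 * protocol header and handle GSO before calling in.)
 */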
810 
811 static void ip_tunnel_update(struct ip_tunnel_net *itn,
812 			     struct ip_tunnel *t,
813 			     struct net_device *dev,
814 			     struct ip_tunnel_parm *p,
815 			     bool set_mtu)
816 {
817 	ip_tunnel_del(itn, t);
818 	t->parms.iph.saddr = p->iph.saddr;
819 	t->parms.iph.daddr = p->iph.daddr;
820 	t->parms.i_key = p->i_key;
821 	t->parms.o_key = p->o_key;
822 	if (dev->type != ARPHRD_ETHER) {
823 		memcpy(dev->dev_addr, &p->iph.saddr, 4);
824 		memcpy(dev->broadcast, &p->iph.daddr, 4);
825 	}
826 	ip_tunnel_add(itn, t);
827 
828 	t->parms.iph.ttl = p->iph.ttl;
829 	t->parms.iph.tos = p->iph.tos;
830 	t->parms.iph.frag_off = p->iph.frag_off;
831 
832 	if (t->parms.link != p->link) {
833 		int mtu;
834 
835 		t->parms.link = p->link;
836 		mtu = ip_tunnel_bind_dev(dev);
837 		if (set_mtu)
838 			dev->mtu = mtu;
839 	}
840 	ip_tunnel_dst_reset_all(t);
841 	netdev_state_change(dev);
842 }
843 
844 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
845 {
846 	int err = 0;
847 	struct ip_tunnel *t = netdev_priv(dev);
848 	struct net *net = t->net;
849 	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
850 
851 	BUG_ON(!itn->fb_tunnel_dev);
852 	switch (cmd) {
853 	case SIOCGETTUNNEL:
854 		if (dev == itn->fb_tunnel_dev) {
855 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
856 			if (!t)
857 				t = netdev_priv(dev);
858 		}
859 		memcpy(p, &t->parms, sizeof(*p));
860 		break;
861 
862 	case SIOCADDTUNNEL:
863 	case SIOCCHGTUNNEL:
864 		err = -EPERM;
865 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
866 			goto done;
867 		if (p->iph.ttl)
868 			p->iph.frag_off |= htons(IP_DF);
869 		if (!(p->i_flags & VTI_ISVTI)) {
870 			if (!(p->i_flags & TUNNEL_KEY))
871 				p->i_key = 0;
872 			if (!(p->o_flags & TUNNEL_KEY))
873 				p->o_key = 0;
874 		}
875 
876 		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
877 
878 		if (cmd == SIOCADDTUNNEL) {
879 			if (!t) {
880 				t = ip_tunnel_create(net, itn, p);
881 				err = PTR_ERR_OR_ZERO(t);
882 				break;
883 			}
884 
885 			err = -EEXIST;
886 			break;
887 		}
888 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
889 			if (t) {
890 				if (t->dev != dev) {
891 					err = -EEXIST;
892 					break;
893 				}
894 			} else {
895 				unsigned int nflags = 0;
896 
897 				if (ipv4_is_multicast(p->iph.daddr))
898 					nflags = IFF_BROADCAST;
899 				else if (p->iph.daddr)
900 					nflags = IFF_POINTOPOINT;
901 
902 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
903 					err = -EINVAL;
904 					break;
905 				}
906 
907 				t = netdev_priv(dev);
908 			}
909 		}
910 
911 		if (t) {
912 			err = 0;
913 			ip_tunnel_update(itn, t, dev, p, true);
914 		} else {
915 			err = -ENOENT;
916 		}
917 		break;
918 
919 	case SIOCDELTUNNEL:
920 		err = -EPERM;
921 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
922 			goto done;
923 
924 		if (dev == itn->fb_tunnel_dev) {
925 			err = -ENOENT;
926 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
927 			if (!t)
928 				goto done;
929 			err = -EPERM;
930 			if (t == netdev_priv(itn->fb_tunnel_dev))
931 				goto done;
932 			dev = t->dev;
933 		}
934 		unregister_netdevice(dev);
935 		err = 0;
936 		break;
937 
938 	default:
939 		err = -EINVAL;
940 	}
941 
942 done:
943 	return err;
944 }
945 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
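
/* Example (editor's sketch): drivers wrap this from ndo_do_ioctl,
 * copying the parameter block to and from user space:
 *
 *	struct ip_tunnel_parm p;
 *	int err;
 *
 *	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
 *		return -EFAULT;
 *	err = ip_tunnel_ioctl(dev, &p, cmd);
 *	if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
 *		return -EFAULT;
 *	return err;
 */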
946 
947 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
948 {
949 	struct ip_tunnel *tunnel = netdev_priv(dev);
950 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
951 	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
952 
953 	if (new_mtu < 68)
954 		return -EINVAL;
955 
956 	if (new_mtu > max_mtu) {
957 		if (strict)
958 			return -EINVAL;
959 
960 		new_mtu = max_mtu;
961 	}
962 
963 	dev->mtu = new_mtu;
964 	return 0;
965 }
966 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
967 
968 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
969 {
970 	return __ip_tunnel_change_mtu(dev, new_mtu, true);
971 }
972 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
973 
974 static void ip_tunnel_dev_free(struct net_device *dev)
975 {
976 	struct ip_tunnel *tunnel = netdev_priv(dev);
977 
978 	gro_cells_destroy(&tunnel->gro_cells);
979 	free_percpu(tunnel->dst_cache);
980 	free_percpu(dev->tstats);
981 	free_netdev(dev);
982 }
983 
984 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
985 {
986 	struct ip_tunnel *tunnel = netdev_priv(dev);
987 	struct ip_tunnel_net *itn;
988 
989 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
990 
991 	if (itn->fb_tunnel_dev != dev) {
992 		ip_tunnel_del(itn, netdev_priv(dev));
993 		unregister_netdevice_queue(dev, head);
994 	}
995 }
996 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
997 
998 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
999 {
1000 	struct ip_tunnel *tunnel = netdev_priv(dev);
1001 
1002 	return tunnel->net;
1003 }
1004 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1005 
1006 int ip_tunnel_get_iflink(const struct net_device *dev)
1007 {
1008 	struct ip_tunnel *tunnel = netdev_priv(dev);
1009 
1010 	return tunnel->parms.link;
1011 }
1012 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1013 
1014 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
1015 				  struct rtnl_link_ops *ops, char *devname)
1016 {
1017 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1018 	struct ip_tunnel_parm parms;
1019 	unsigned int i;
1020 
1021 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1022 		INIT_HLIST_HEAD(&itn->tunnels[i]);
1023 
1024 	if (!ops) {
1025 		itn->fb_tunnel_dev = NULL;
1026 		return 0;
1027 	}
1028 
1029 	memset(&parms, 0, sizeof(parms));
1030 	if (devname)
1031 		strlcpy(parms.name, devname, IFNAMSIZ);
1032 
1033 	rtnl_lock();
1034 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1035 	/* FB netdevice is special: we have one, and only one per netns.
1036 	 * Allowing it to be moved to another netns is clearly unsafe.
1037 	 */
1038 	if (!IS_ERR(itn->fb_tunnel_dev)) {
1039 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1040 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1041 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1042 	}
1043 	rtnl_unlock();
1044 
1045 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1046 }
1047 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
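
/* Example (editor's sketch, hypothetical identifiers): tunnel modules
 * call this from their pernet init op, naming the fallback device
 * (real examples are "gre0" and "tunl0"):
 *
 *	static int __net_init my_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, my_net_id,
 *					  &my_link_ops, "mytnl0");
 *	}
 *
 * Passing ops == NULL only initializes the hash table and skips
 * creating a fallback device, as handled above.
 */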
1048 
1049 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1050 			      struct rtnl_link_ops *ops)
1051 {
1052 	struct net *net = dev_net(itn->fb_tunnel_dev);
1053 	struct net_device *dev, *aux;
1054 	int h;
1055 
1056 	for_each_netdev_safe(net, dev, aux)
1057 		if (dev->rtnl_link_ops == ops)
1058 			unregister_netdevice_queue(dev, head);
1059 
1060 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1061 		struct ip_tunnel *t;
1062 		struct hlist_node *n;
1063 		struct hlist_head *thead = &itn->tunnels[h];
1064 
1065 		hlist_for_each_entry_safe(t, n, thead, hash_node)
1066 			/* If dev is in the same netns, it has already
1067 			 * been added to the list by the previous loop.
1068 			 */
1069 			if (!net_eq(dev_net(t->dev), net))
1070 				unregister_netdevice_queue(t->dev, head);
1071 	}
1072 }
1073 
1074 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1075 {
1076 	LIST_HEAD(list);
1077 
1078 	rtnl_lock();
1079 	ip_tunnel_destroy(itn, &list, ops);
1080 	unregister_netdevice_many(&list);
1081 	rtnl_unlock();
1082 }
1083 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1084 
1085 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1086 		      struct ip_tunnel_parm *p)
1087 {
1088 	struct ip_tunnel *nt;
1089 	struct net *net = dev_net(dev);
1090 	struct ip_tunnel_net *itn;
1091 	int mtu;
1092 	int err;
1093 
1094 	nt = netdev_priv(dev);
1095 	itn = net_generic(net, nt->ip_tnl_net_id);
1096 
1097 	if (nt->collect_md) {
1098 		if (rtnl_dereference(itn->collect_md_tun))
1099 			return -EEXIST;
1100 	} else {
1101 		if (ip_tunnel_find(itn, p, dev->type))
1102 			return -EEXIST;
1103 	}
1104 
1105 	nt->net = net;
1106 	nt->parms = *p;
1107 	err = register_netdevice(dev);
1108 	if (err)
1109 		goto out;
1110 
1111 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1112 		eth_hw_addr_random(dev);
1113 
1114 	mtu = ip_tunnel_bind_dev(dev);
1115 	if (!tb[IFLA_MTU])
1116 		dev->mtu = mtu;
1117 
1118 	ip_tunnel_add(itn, nt);
1119 out:
1120 	return err;
1121 }
1122 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1123 
1124 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1125 			 struct ip_tunnel_parm *p)
1126 {
1127 	struct ip_tunnel *t;
1128 	struct ip_tunnel *tunnel = netdev_priv(dev);
1129 	struct net *net = tunnel->net;
1130 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1131 
1132 	if (dev == itn->fb_tunnel_dev)
1133 		return -EINVAL;
1134 
1135 	t = ip_tunnel_find(itn, p, dev->type);
1136 
1137 	if (t) {
1138 		if (t->dev != dev)
1139 			return -EEXIST;
1140 	} else {
1141 		t = tunnel;
1142 
1143 		if (dev->type != ARPHRD_ETHER) {
1144 			unsigned int nflags = 0;
1145 
1146 			if (ipv4_is_multicast(p->iph.daddr))
1147 				nflags = IFF_BROADCAST;
1148 			else if (p->iph.daddr)
1149 				nflags = IFF_POINTOPOINT;
1150 
1151 			if ((dev->flags ^ nflags) &
1152 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1153 				return -EINVAL;
1154 		}
1155 	}
1156 
1157 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1158 	return 0;
1159 }
1160 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1161 
1162 int ip_tunnel_init(struct net_device *dev)
1163 {
1164 	struct ip_tunnel *tunnel = netdev_priv(dev);
1165 	struct iphdr *iph = &tunnel->parms.iph;
1166 	int err;
1167 
1168 	dev->destructor	= ip_tunnel_dev_free;
1169 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1170 	if (!dev->tstats)
1171 		return -ENOMEM;
1172 
1173 	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1174 	if (!tunnel->dst_cache) {
1175 		free_percpu(dev->tstats);
1176 		return -ENOMEM;
1177 	}
1178 
1179 	err = gro_cells_init(&tunnel->gro_cells, dev);
1180 	if (err) {
1181 		free_percpu(tunnel->dst_cache);
1182 		free_percpu(dev->tstats);
1183 		return err;
1184 	}
1185 
1186 	tunnel->dev = dev;
1187 	tunnel->net = dev_net(dev);
1188 	strcpy(tunnel->parms.name, dev->name);
1189 	iph->version		= 4;
1190 	iph->ihl		= 5;
1191 
1192 	if (tunnel->collect_md) {
1193 		dev->features |= NETIF_F_NETNS_LOCAL;
1194 		netif_keep_dst(dev);
1195 	}
1196 	return 0;
1197 }
1198 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1199 
1200 void ip_tunnel_uninit(struct net_device *dev)
1201 {
1202 	struct ip_tunnel *tunnel = netdev_priv(dev);
1203 	struct net *net = tunnel->net;
1204 	struct ip_tunnel_net *itn;
1205 
1206 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1207 	/* fb_tunnel_dev will be unregistered in the net-exit call. */
1208 	if (itn->fb_tunnel_dev != dev)
1209 		ip_tunnel_del(itn, netdev_priv(dev));
1210 
1211 	ip_tunnel_dst_reset_all(tunnel);
1212 }
1213 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1214 
1215 /* Do the least required initialization; the rest is done in the tunnel_init call */
1216 void ip_tunnel_setup(struct net_device *dev, int net_id)
1217 {
1218 	struct ip_tunnel *tunnel = netdev_priv(dev);
1219 	tunnel->ip_tnl_net_id = net_id;
1220 }
1221 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
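
/* Example (editor's sketch): a driver's rtnl ->setup callback stores
 * its pernet id here before register_netdevice() runs ip_tunnel_init:
 *
 *	static void my_tunnel_setup(struct net_device *dev)
 *	{
 *		dev->netdev_ops = &my_netdev_ops;
 *		ip_tunnel_setup(dev, my_net_id);
 *	}
 *
 * (my_netdev_ops and my_net_id are hypothetical names.)
 */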
1222 
1223 MODULE_LICENSE("GPL");
1224