xref: /linux/net/ipv4/ip_tunnel.c (revision 957e3facd147510f2cf8780e38606f1d707f0e33)
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst, __be32 saddr)
{
	struct dst_entry *old_dst;

	dst_clone(dst);
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
	idst->saddr = saddr;
}

static noinline void tunnel_dst_set(struct ip_tunnel *t,
			   struct dst_entry *dst, __be32 saddr)
{
	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL, 0);
}

void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);

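/* Look up the per-CPU cached route last stored by tunnel_dst_set().  The dst
 * is read under RCU and its refcount is taken with atomic_inc_not_zero() so
 * that a concurrently released entry is never resurrected; an obsolete dst
 * that fails its ops->check() validation is released and the cache reset.
 */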
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
					u32 cookie, __be32 *saddr)
{
	struct ip_tunnel_dst *idst;
	struct dst_entry *dst;

	rcu_read_lock();
	idst = raw_cpu_ptr(t->dst_cache);
	dst = rcu_dereference(idst->dst);
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
		dst = NULL;
	if (dst) {
		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
			*saddr = idst->saddr;
		} else {
			tunnel_dst_reset(t);
			dst_release(dst);
			dst = NULL;
		}
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if no key is present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against configured keyless tunnels,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
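
/*
 * Example (a sketch, not part of the original file): a protocol receive
 * handler resolving a tunnel and handing the packet to it, loosely modeled
 * on the GRE receive path.  "my_net_id", "tpi" and "log_ecn_error" are
 * assumed to be provided by the caller:
 *
 *	struct ip_tunnel_net *itn = net_generic(net, my_net_id);
 *	const struct iphdr *iph = ip_hdr(skb);
 *	struct ip_tunnel *tunnel;
 *
 *	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 *				  iph->saddr, iph->daddr, tpi->key);
 *	if (tunnel)
 *		return ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
 */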

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

static int ip_encap_hlen(struct ip_tunnel_encap *e)
{
	const struct ip_tunnel_encap_ops *ops;
	int hlen = -EINVAL;

	if (e->type == TUNNEL_ENCAP_NONE)
		return 0;

	if (e->type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[e->type]);
	if (likely(ops && ops->encap_hlen))
		hlen = ops->encap_hlen(e);
	rcu_read_unlock();

	return hlen;
}

const struct ip_tunnel_encap_ops __rcu *
		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
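
/*
 * Example (a sketch, not part of the original file): registering
 * encapsulation ops, in the style of the FOU module.  The ops structure and
 * function names here are hypothetical:
 *
 *	static const struct ip_tunnel_encap_ops my_encap_ops = {
 *		.encap_hlen	= my_encap_hlen,
 *		.build_header	= my_build_header,
 *	};
 *
 *	static int __init my_init(void)
 *	{
 *		return ip_tunnel_encap_add_ops(&my_encap_ops,
 *					       TUNNEL_ENCAP_FOU);
 *	}
 */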

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
		    u8 *protocol, struct flowi4 *fl4)
{
	const struct ip_tunnel_encap_ops *ops;
	int ret = -EINVAL;

	if (t->encap.type == TUNNEL_ENCAP_NONE)
		return 0;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[t->encap.type]);
	if (likely(ops && ops->build_header))
		ret = ops->build_header(skb, &t->encap, protocol, fl4);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

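/*
 * Worked example for the tnl_update_pmtu() arithmetic above (an
 * illustration, not from the original file): for a GRE tunnel carrying a
 * 4-byte key over a 1500-byte device, tunnel->hlen is 8 (4-byte GRE base
 * header + 4-byte key).  With DF set and assuming dev->hard_header_len is
 * 0, the inner packet sees mtu = 1500 - 20 (outer IPv4 header) - 8 = 1472.
 */
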
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
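
/*
 * Example (a sketch, not part of the original file): a driver's
 * ndo_start_xmit wrapping ip_tunnel_xmit(), loosely modeled on the ipip
 * driver:
 *
 *	static netdev_tx_t my_tunnel_xmit(struct sk_buff *skb,
 *					  struct net_device *dev)
 *	{
 *		struct ip_tunnel *tunnel = netdev_priv(dev);
 *		const struct iphdr *tiph = &tunnel->parms.iph;
 *
 *		skb_set_inner_ipproto(skb, IPPROTO_IPIP);
 *		ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
 *		return NETDEV_TX_OK;
 *	}
 */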

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
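
/*
 * Example (a sketch, not part of the original file): a driver's ndo_do_ioctl
 * marshalling the user's ip_tunnel_parm through ip_tunnel_ioctl(), in the
 * style of the ipip driver:
 *
 *	static int my_tunnel_ioctl(struct net_device *dev,
 *				   struct ifreq *ifr, int cmd)
 *	{
 *		struct ip_tunnel_parm p;
 *		int err;
 *
 *		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
 *			return -EFAULT;
 *		err = ip_tunnel_ioctl(dev, &p, cmd);
 *		if (err)
 *			return err;
 *		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
 *			return -EFAULT;
 *		return 0;
 *	}
 */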

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* The FB netdevice is special: there is one, and only one, per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
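
/*
 * Example (not part of the original file): a pernet init hook, as done by
 * the ipip driver for its fallback device "tunl0":
 *
 *	static int __net_init ipip_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, ipip_net_id,
 *					  &ipip_link_ops, "tunl0");
 *	}
 */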

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
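
/*
 * Example (a sketch, not part of the original file): an rtnl_link_ops
 * ->newlink handler filling ip_tunnel_parm from netlink attributes, loosely
 * modeled on the ipip driver ("ipip_netlink_parms" is that driver's helper;
 * encap attribute handling is omitted here):
 *
 *	static int ipip_newlink(struct net *src_net, struct net_device *dev,
 *				struct nlattr *tb[], struct nlattr *data[])
 *	{
 *		struct ip_tunnel_parm p;
 *
 *		ipip_netlink_parms(data, &p);
 *		return ip_tunnel_newlink(dev, tb, &p);
 *	}
 */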

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
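
/*
 * Example (a sketch, not part of the original file): wiring these helpers
 * into a driver's net_device_ops, roughly as the ipip driver does:
 *
 *	static const struct net_device_ops ipip_netdev_ops = {
 *		.ndo_init	= ipip_tunnel_init,   // calls ip_tunnel_init()
 *		.ndo_uninit	= ip_tunnel_uninit,
 *		.ndo_start_xmit	= ipip_tunnel_xmit,
 *		.ndo_do_ioctl	= ipip_tunnel_ioctl,
 *		.ndo_change_mtu	= ip_tunnel_change_mtu,
 *	};
 */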

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(netdev_priv(dev));

	ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
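
/*
 * Example (a sketch, not part of the original file): a driver's ->setup
 * callback binding the device to its pernet id, in the style of the ipip
 * driver:
 *
 *	static void ipip_tunnel_setup(struct net_device *dev)
 *	{
 *		dev->netdev_ops = &ipip_netdev_ops;
 *		dev->type = ARPHRD_TUNNEL;
 *		ip_tunnel_setup(dev, ipip_net_id);
 *	}
 */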

MODULE_LICENSE("GPL");