xref: /linux/net/ipv4/ip_tunnel.c (revision 110e6f26af80dfd90b6e5c645b1aed7228aa580d)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43 
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 
59 #if IS_ENABLED(CONFIG_IPV6)
60 #include <net/ipv6.h>
61 #include <net/ip6_fib.h>
62 #include <net/ip6_route.h>
63 #endif
64 
65 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
66 {
67 	return hash_32((__force u32)key ^ (__force u32)remote,
68 			 IP_TNL_HASH_BITS);
69 }
70 
71 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
72 				__be16 flags, __be32 key)
73 {
74 	if (p->i_flags & TUNNEL_KEY) {
75 		if (flags & TUNNEL_KEY)
76 			return key == p->i_key;
77 		else
78 			/* key expected, none present */
79 			return false;
80 	} else
81 		return !(flags & TUNNEL_KEY);
82 }
83 
84 /* Fallback tunnel: no source, no destination, no key, no options
85 
86    Tunnel hash table:
87    We require exact key match i.e. if a key is present in packet
88    it will match only tunnel with the same key; if it is not present,
89    it will match only keyless tunnel.
90 
91    All keysless packets, if not matched configured keyless tunnels
92    will match fallback tunnel.
93    Given src, dst and key, find appropriate for input tunnel.
94 */
95 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
96 				   int link, __be16 flags,
97 				   __be32 remote, __be32 local,
98 				   __be32 key)
99 {
100 	unsigned int hash;
101 	struct ip_tunnel *t, *cand = NULL;
102 	struct hlist_head *head;
103 
104 	hash = ip_tunnel_hash(key, remote);
105 	head = &itn->tunnels[hash];
106 
107 	hlist_for_each_entry_rcu(t, head, hash_node) {
108 		if (local != t->parms.iph.saddr ||
109 		    remote != t->parms.iph.daddr ||
110 		    !(t->dev->flags & IFF_UP))
111 			continue;
112 
113 		if (!ip_tunnel_key_match(&t->parms, flags, key))
114 			continue;
115 
116 		if (t->parms.link == link)
117 			return t;
118 		else
119 			cand = t;
120 	}
121 
122 	hlist_for_each_entry_rcu(t, head, hash_node) {
123 		if (remote != t->parms.iph.daddr ||
124 		    t->parms.iph.saddr != 0 ||
125 		    !(t->dev->flags & IFF_UP))
126 			continue;
127 
128 		if (!ip_tunnel_key_match(&t->parms, flags, key))
129 			continue;
130 
131 		if (t->parms.link == link)
132 			return t;
133 		else if (!cand)
134 			cand = t;
135 	}
136 
137 	hash = ip_tunnel_hash(key, 0);
138 	head = &itn->tunnels[hash];
139 
140 	hlist_for_each_entry_rcu(t, head, hash_node) {
141 		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
142 		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
143 			continue;
144 
145 		if (!(t->dev->flags & IFF_UP))
146 			continue;
147 
148 		if (!ip_tunnel_key_match(&t->parms, flags, key))
149 			continue;
150 
151 		if (t->parms.link == link)
152 			return t;
153 		else if (!cand)
154 			cand = t;
155 	}
156 
157 	if (flags & TUNNEL_NO_KEY)
158 		goto skip_key_lookup;
159 
160 	hlist_for_each_entry_rcu(t, head, hash_node) {
161 		if (t->parms.i_key != key ||
162 		    t->parms.iph.saddr != 0 ||
163 		    t->parms.iph.daddr != 0 ||
164 		    !(t->dev->flags & IFF_UP))
165 			continue;
166 
167 		if (t->parms.link == link)
168 			return t;
169 		else if (!cand)
170 			cand = t;
171 	}
172 
173 skip_key_lookup:
174 	if (cand)
175 		return cand;
176 
177 	t = rcu_dereference(itn->collect_md_tun);
178 	if (t)
179 		return t;
180 
181 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
182 		return netdev_priv(itn->fb_tunnel_dev);
183 
184 	return NULL;
185 }
186 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
187 
188 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
189 				    struct ip_tunnel_parm *parms)
190 {
191 	unsigned int h;
192 	__be32 remote;
193 	__be32 i_key = parms->i_key;
194 
195 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
196 		remote = parms->iph.daddr;
197 	else
198 		remote = 0;
199 
200 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
201 		i_key = 0;
202 
203 	h = ip_tunnel_hash(i_key, remote);
204 	return &itn->tunnels[h];
205 }
206 
207 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
208 {
209 	struct hlist_head *head = ip_bucket(itn, &t->parms);
210 
211 	if (t->collect_md)
212 		rcu_assign_pointer(itn->collect_md_tun, t);
213 	hlist_add_head_rcu(&t->hash_node, head);
214 }
215 
216 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
217 {
218 	if (t->collect_md)
219 		rcu_assign_pointer(itn->collect_md_tun, NULL);
220 	hlist_del_init_rcu(&t->hash_node);
221 }
222 
223 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
224 					struct ip_tunnel_parm *parms,
225 					int type)
226 {
227 	__be32 remote = parms->iph.daddr;
228 	__be32 local = parms->iph.saddr;
229 	__be32 key = parms->i_key;
230 	__be16 flags = parms->i_flags;
231 	int link = parms->link;
232 	struct ip_tunnel *t = NULL;
233 	struct hlist_head *head = ip_bucket(itn, parms);
234 
235 	hlist_for_each_entry_rcu(t, head, hash_node) {
236 		if (local == t->parms.iph.saddr &&
237 		    remote == t->parms.iph.daddr &&
238 		    link == t->parms.link &&
239 		    type == t->dev->type &&
240 		    ip_tunnel_key_match(&t->parms, flags, key))
241 			break;
242 	}
243 	return t;
244 }
245 
246 static struct net_device *__ip_tunnel_create(struct net *net,
247 					     const struct rtnl_link_ops *ops,
248 					     struct ip_tunnel_parm *parms)
249 {
250 	int err;
251 	struct ip_tunnel *tunnel;
252 	struct net_device *dev;
253 	char name[IFNAMSIZ];
254 
255 	if (parms->name[0])
256 		strlcpy(name, parms->name, IFNAMSIZ);
257 	else {
258 		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
259 			err = -E2BIG;
260 			goto failed;
261 		}
262 		strlcpy(name, ops->kind, IFNAMSIZ);
263 		strncat(name, "%d", 2);
264 	}
265 
266 	ASSERT_RTNL();
267 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
268 	if (!dev) {
269 		err = -ENOMEM;
270 		goto failed;
271 	}
272 	dev_net_set(dev, net);
273 
274 	dev->rtnl_link_ops = ops;
275 
276 	tunnel = netdev_priv(dev);
277 	tunnel->parms = *parms;
278 	tunnel->net = net;
279 
280 	err = register_netdevice(dev);
281 	if (err)
282 		goto failed_free;
283 
284 	return dev;
285 
286 failed_free:
287 	free_netdev(dev);
288 failed:
289 	return ERR_PTR(err);
290 }
291 
292 static inline void init_tunnel_flow(struct flowi4 *fl4,
293 				    int proto,
294 				    __be32 daddr, __be32 saddr,
295 				    __be32 key, __u8 tos, int oif)
296 {
297 	memset(fl4, 0, sizeof(*fl4));
298 	fl4->flowi4_oif = oif;
299 	fl4->daddr = daddr;
300 	fl4->saddr = saddr;
301 	fl4->flowi4_tos = tos;
302 	fl4->flowi4_proto = proto;
303 	fl4->fl4_gre_key = key;
304 }
305 
306 static int ip_tunnel_bind_dev(struct net_device *dev)
307 {
308 	struct net_device *tdev = NULL;
309 	struct ip_tunnel *tunnel = netdev_priv(dev);
310 	const struct iphdr *iph;
311 	int hlen = LL_MAX_HEADER;
312 	int mtu = ETH_DATA_LEN;
313 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
314 
315 	iph = &tunnel->parms.iph;
316 
317 	/* Guess output device to choose reasonable mtu and needed_headroom */
318 	if (iph->daddr) {
319 		struct flowi4 fl4;
320 		struct rtable *rt;
321 
322 		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
323 				 iph->saddr, tunnel->parms.o_key,
324 				 RT_TOS(iph->tos), tunnel->parms.link);
325 		rt = ip_route_output_key(tunnel->net, &fl4);
326 
327 		if (!IS_ERR(rt)) {
328 			tdev = rt->dst.dev;
329 			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
330 					  fl4.saddr);
331 			ip_rt_put(rt);
332 		}
333 		if (dev->type != ARPHRD_ETHER)
334 			dev->flags |= IFF_POINTOPOINT;
335 	}
336 
337 	if (!tdev && tunnel->parms.link)
338 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
339 
340 	if (tdev) {
341 		hlen = tdev->hard_header_len + tdev->needed_headroom;
342 		mtu = tdev->mtu;
343 	}
344 
345 	dev->needed_headroom = t_hlen + hlen;
346 	mtu -= (dev->hard_header_len + t_hlen);
347 
348 	if (mtu < 68)
349 		mtu = 68;
350 
351 	return mtu;
352 }
353 
354 static struct ip_tunnel *ip_tunnel_create(struct net *net,
355 					  struct ip_tunnel_net *itn,
356 					  struct ip_tunnel_parm *parms)
357 {
358 	struct ip_tunnel *nt;
359 	struct net_device *dev;
360 
361 	BUG_ON(!itn->fb_tunnel_dev);
362 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
363 	if (IS_ERR(dev))
364 		return ERR_CAST(dev);
365 
366 	dev->mtu = ip_tunnel_bind_dev(dev);
367 
368 	nt = netdev_priv(dev);
369 	ip_tunnel_add(itn, nt);
370 	return nt;
371 }
372 
373 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
374 		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
375 		  bool log_ecn_error)
376 {
377 	struct pcpu_sw_netstats *tstats;
378 	const struct iphdr *iph = ip_hdr(skb);
379 	int err;
380 
381 #ifdef CONFIG_NET_IPGRE_BROADCAST
382 	if (ipv4_is_multicast(iph->daddr)) {
383 		tunnel->dev->stats.multicast++;
384 		skb->pkt_type = PACKET_BROADCAST;
385 	}
386 #endif
387 
388 	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
389 	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
390 		tunnel->dev->stats.rx_crc_errors++;
391 		tunnel->dev->stats.rx_errors++;
392 		goto drop;
393 	}
394 
395 	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
396 		if (!(tpi->flags&TUNNEL_SEQ) ||
397 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
398 			tunnel->dev->stats.rx_fifo_errors++;
399 			tunnel->dev->stats.rx_errors++;
400 			goto drop;
401 		}
402 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
403 	}
404 
405 	skb_reset_network_header(skb);
406 
407 	err = IP_ECN_decapsulate(iph, skb);
408 	if (unlikely(err)) {
409 		if (log_ecn_error)
410 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
411 					&iph->saddr, iph->tos);
412 		if (err > 1) {
413 			++tunnel->dev->stats.rx_frame_errors;
414 			++tunnel->dev->stats.rx_errors;
415 			goto drop;
416 		}
417 	}
418 
419 	tstats = this_cpu_ptr(tunnel->dev->tstats);
420 	u64_stats_update_begin(&tstats->syncp);
421 	tstats->rx_packets++;
422 	tstats->rx_bytes += skb->len;
423 	u64_stats_update_end(&tstats->syncp);
424 
425 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
426 
427 	if (tunnel->dev->type == ARPHRD_ETHER) {
428 		skb->protocol = eth_type_trans(skb, tunnel->dev);
429 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
430 	} else {
431 		skb->dev = tunnel->dev;
432 	}
433 
434 	if (tun_dst)
435 		skb_dst_set(skb, (struct dst_entry *)tun_dst);
436 
437 	gro_cells_receive(&tunnel->gro_cells, skb);
438 	return 0;
439 
440 drop:
441 	kfree_skb(skb);
442 	return 0;
443 }
444 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
445 
446 static int ip_encap_hlen(struct ip_tunnel_encap *e)
447 {
448 	const struct ip_tunnel_encap_ops *ops;
449 	int hlen = -EINVAL;
450 
451 	if (e->type == TUNNEL_ENCAP_NONE)
452 		return 0;
453 
454 	if (e->type >= MAX_IPTUN_ENCAP_OPS)
455 		return -EINVAL;
456 
457 	rcu_read_lock();
458 	ops = rcu_dereference(iptun_encaps[e->type]);
459 	if (likely(ops && ops->encap_hlen))
460 		hlen = ops->encap_hlen(e);
461 	rcu_read_unlock();
462 
463 	return hlen;
464 }
465 
466 const struct ip_tunnel_encap_ops __rcu *
467 		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
468 
469 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
470 			    unsigned int num)
471 {
472 	if (num >= MAX_IPTUN_ENCAP_OPS)
473 		return -ERANGE;
474 
475 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
476 			&iptun_encaps[num],
477 			NULL, ops) ? 0 : -1;
478 }
479 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
480 
481 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
482 			    unsigned int num)
483 {
484 	int ret;
485 
486 	if (num >= MAX_IPTUN_ENCAP_OPS)
487 		return -ERANGE;
488 
489 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
490 		       &iptun_encaps[num],
491 		       ops, NULL) == ops) ? 0 : -1;
492 
493 	synchronize_net();
494 
495 	return ret;
496 }
497 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
498 
499 int ip_tunnel_encap_setup(struct ip_tunnel *t,
500 			  struct ip_tunnel_encap *ipencap)
501 {
502 	int hlen;
503 
504 	memset(&t->encap, 0, sizeof(t->encap));
505 
506 	hlen = ip_encap_hlen(ipencap);
507 	if (hlen < 0)
508 		return hlen;
509 
510 	t->encap.type = ipencap->type;
511 	t->encap.sport = ipencap->sport;
512 	t->encap.dport = ipencap->dport;
513 	t->encap.flags = ipencap->flags;
514 
515 	t->encap_hlen = hlen;
516 	t->hlen = t->encap_hlen + t->tun_hlen;
517 
518 	return 0;
519 }
520 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
521 
522 int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
523 		    u8 *protocol, struct flowi4 *fl4)
524 {
525 	const struct ip_tunnel_encap_ops *ops;
526 	int ret = -EINVAL;
527 
528 	if (t->encap.type == TUNNEL_ENCAP_NONE)
529 		return 0;
530 
531 	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
532 		return -EINVAL;
533 
534 	rcu_read_lock();
535 	ops = rcu_dereference(iptun_encaps[t->encap.type]);
536 	if (likely(ops && ops->build_header))
537 		ret = ops->build_header(skb, &t->encap, protocol, fl4);
538 	rcu_read_unlock();
539 
540 	return ret;
541 }
542 EXPORT_SYMBOL(ip_tunnel_encap);
543 
544 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
545 			    struct rtable *rt, __be16 df,
546 			    const struct iphdr *inner_iph)
547 {
548 	struct ip_tunnel *tunnel = netdev_priv(dev);
549 	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
550 	int mtu;
551 
552 	if (df)
553 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
554 					- sizeof(struct iphdr) - tunnel->hlen;
555 	else
556 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
557 
558 	if (skb_dst(skb))
559 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
560 
561 	if (skb->protocol == htons(ETH_P_IP)) {
562 		if (!skb_is_gso(skb) &&
563 		    (inner_iph->frag_off & htons(IP_DF)) &&
564 		    mtu < pkt_size) {
565 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
566 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
567 			return -E2BIG;
568 		}
569 	}
570 #if IS_ENABLED(CONFIG_IPV6)
571 	else if (skb->protocol == htons(ETH_P_IPV6)) {
572 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
573 
574 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
575 			   mtu >= IPV6_MIN_MTU) {
576 			if ((tunnel->parms.iph.daddr &&
577 			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
578 			    rt6->rt6i_dst.plen == 128) {
579 				rt6->rt6i_flags |= RTF_MODIFIED;
580 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
581 			}
582 		}
583 
584 		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
585 					mtu < pkt_size) {
586 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
587 			return -E2BIG;
588 		}
589 	}
590 #endif
591 	return 0;
592 }
593 
594 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
595 		    const struct iphdr *tnl_params, u8 protocol)
596 {
597 	struct ip_tunnel *tunnel = netdev_priv(dev);
598 	const struct iphdr *inner_iph;
599 	struct flowi4 fl4;
600 	u8     tos, ttl;
601 	__be16 df;
602 	struct rtable *rt;		/* Route to the other host */
603 	unsigned int max_headroom;	/* The extra header space needed */
604 	__be32 dst;
605 	bool connected;
606 
607 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
608 	connected = (tunnel->parms.iph.daddr != 0);
609 
610 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
611 
612 	dst = tnl_params->daddr;
613 	if (dst == 0) {
614 		/* NBMA tunnel */
615 
616 		if (!skb_dst(skb)) {
617 			dev->stats.tx_fifo_errors++;
618 			goto tx_error;
619 		}
620 
621 		if (skb->protocol == htons(ETH_P_IP)) {
622 			rt = skb_rtable(skb);
623 			dst = rt_nexthop(rt, inner_iph->daddr);
624 		}
625 #if IS_ENABLED(CONFIG_IPV6)
626 		else if (skb->protocol == htons(ETH_P_IPV6)) {
627 			const struct in6_addr *addr6;
628 			struct neighbour *neigh;
629 			bool do_tx_error_icmp;
630 			int addr_type;
631 
632 			neigh = dst_neigh_lookup(skb_dst(skb),
633 						 &ipv6_hdr(skb)->daddr);
634 			if (!neigh)
635 				goto tx_error;
636 
637 			addr6 = (const struct in6_addr *)&neigh->primary_key;
638 			addr_type = ipv6_addr_type(addr6);
639 
640 			if (addr_type == IPV6_ADDR_ANY) {
641 				addr6 = &ipv6_hdr(skb)->daddr;
642 				addr_type = ipv6_addr_type(addr6);
643 			}
644 
645 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
646 				do_tx_error_icmp = true;
647 			else {
648 				do_tx_error_icmp = false;
649 				dst = addr6->s6_addr32[3];
650 			}
651 			neigh_release(neigh);
652 			if (do_tx_error_icmp)
653 				goto tx_error_icmp;
654 		}
655 #endif
656 		else
657 			goto tx_error;
658 
659 		connected = false;
660 	}
661 
662 	tos = tnl_params->tos;
663 	if (tos & 0x1) {
664 		tos &= ~0x1;
665 		if (skb->protocol == htons(ETH_P_IP)) {
666 			tos = inner_iph->tos;
667 			connected = false;
668 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
669 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
670 			connected = false;
671 		}
672 	}
673 
674 	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
675 			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
676 
677 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
678 		goto tx_error;
679 
680 	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
681 			 NULL;
682 
683 	if (!rt) {
684 		rt = ip_route_output_key(tunnel->net, &fl4);
685 
686 		if (IS_ERR(rt)) {
687 			dev->stats.tx_carrier_errors++;
688 			goto tx_error;
689 		}
690 		if (connected)
691 			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
692 					  fl4.saddr);
693 	}
694 
695 	if (rt->dst.dev == dev) {
696 		ip_rt_put(rt);
697 		dev->stats.collisions++;
698 		goto tx_error;
699 	}
700 
701 	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
702 		ip_rt_put(rt);
703 		goto tx_error;
704 	}
705 
706 	if (tunnel->err_count > 0) {
707 		if (time_before(jiffies,
708 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
709 			tunnel->err_count--;
710 
711 			dst_link_failure(skb);
712 		} else
713 			tunnel->err_count = 0;
714 	}
715 
716 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
717 	ttl = tnl_params->ttl;
718 	if (ttl == 0) {
719 		if (skb->protocol == htons(ETH_P_IP))
720 			ttl = inner_iph->ttl;
721 #if IS_ENABLED(CONFIG_IPV6)
722 		else if (skb->protocol == htons(ETH_P_IPV6))
723 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
724 #endif
725 		else
726 			ttl = ip4_dst_hoplimit(&rt->dst);
727 	}
728 
729 	df = tnl_params->frag_off;
730 	if (skb->protocol == htons(ETH_P_IP))
731 		df |= (inner_iph->frag_off&htons(IP_DF));
732 
733 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
734 			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
735 	if (max_headroom > dev->needed_headroom)
736 		dev->needed_headroom = max_headroom;
737 
738 	if (skb_cow_head(skb, dev->needed_headroom)) {
739 		ip_rt_put(rt);
740 		dev->stats.tx_dropped++;
741 		kfree_skb(skb);
742 		return;
743 	}
744 
745 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
746 		      df, !net_eq(tunnel->net, dev_net(dev)));
747 	return;
748 
749 #if IS_ENABLED(CONFIG_IPV6)
750 tx_error_icmp:
751 	dst_link_failure(skb);
752 #endif
753 tx_error:
754 	dev->stats.tx_errors++;
755 	kfree_skb(skb);
756 }
757 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
758 
759 static void ip_tunnel_update(struct ip_tunnel_net *itn,
760 			     struct ip_tunnel *t,
761 			     struct net_device *dev,
762 			     struct ip_tunnel_parm *p,
763 			     bool set_mtu)
764 {
765 	ip_tunnel_del(itn, t);
766 	t->parms.iph.saddr = p->iph.saddr;
767 	t->parms.iph.daddr = p->iph.daddr;
768 	t->parms.i_key = p->i_key;
769 	t->parms.o_key = p->o_key;
770 	if (dev->type != ARPHRD_ETHER) {
771 		memcpy(dev->dev_addr, &p->iph.saddr, 4);
772 		memcpy(dev->broadcast, &p->iph.daddr, 4);
773 	}
774 	ip_tunnel_add(itn, t);
775 
776 	t->parms.iph.ttl = p->iph.ttl;
777 	t->parms.iph.tos = p->iph.tos;
778 	t->parms.iph.frag_off = p->iph.frag_off;
779 
780 	if (t->parms.link != p->link) {
781 		int mtu;
782 
783 		t->parms.link = p->link;
784 		mtu = ip_tunnel_bind_dev(dev);
785 		if (set_mtu)
786 			dev->mtu = mtu;
787 	}
788 	dst_cache_reset(&t->dst_cache);
789 	netdev_state_change(dev);
790 }
791 
792 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
793 {
794 	int err = 0;
795 	struct ip_tunnel *t = netdev_priv(dev);
796 	struct net *net = t->net;
797 	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
798 
799 	BUG_ON(!itn->fb_tunnel_dev);
800 	switch (cmd) {
801 	case SIOCGETTUNNEL:
802 		if (dev == itn->fb_tunnel_dev) {
803 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
804 			if (!t)
805 				t = netdev_priv(dev);
806 		}
807 		memcpy(p, &t->parms, sizeof(*p));
808 		break;
809 
810 	case SIOCADDTUNNEL:
811 	case SIOCCHGTUNNEL:
812 		err = -EPERM;
813 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
814 			goto done;
815 		if (p->iph.ttl)
816 			p->iph.frag_off |= htons(IP_DF);
817 		if (!(p->i_flags & VTI_ISVTI)) {
818 			if (!(p->i_flags & TUNNEL_KEY))
819 				p->i_key = 0;
820 			if (!(p->o_flags & TUNNEL_KEY))
821 				p->o_key = 0;
822 		}
823 
824 		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
825 
826 		if (cmd == SIOCADDTUNNEL) {
827 			if (!t) {
828 				t = ip_tunnel_create(net, itn, p);
829 				err = PTR_ERR_OR_ZERO(t);
830 				break;
831 			}
832 
833 			err = -EEXIST;
834 			break;
835 		}
836 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
837 			if (t) {
838 				if (t->dev != dev) {
839 					err = -EEXIST;
840 					break;
841 				}
842 			} else {
843 				unsigned int nflags = 0;
844 
845 				if (ipv4_is_multicast(p->iph.daddr))
846 					nflags = IFF_BROADCAST;
847 				else if (p->iph.daddr)
848 					nflags = IFF_POINTOPOINT;
849 
850 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
851 					err = -EINVAL;
852 					break;
853 				}
854 
855 				t = netdev_priv(dev);
856 			}
857 		}
858 
859 		if (t) {
860 			err = 0;
861 			ip_tunnel_update(itn, t, dev, p, true);
862 		} else {
863 			err = -ENOENT;
864 		}
865 		break;
866 
867 	case SIOCDELTUNNEL:
868 		err = -EPERM;
869 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
870 			goto done;
871 
872 		if (dev == itn->fb_tunnel_dev) {
873 			err = -ENOENT;
874 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
875 			if (!t)
876 				goto done;
877 			err = -EPERM;
878 			if (t == netdev_priv(itn->fb_tunnel_dev))
879 				goto done;
880 			dev = t->dev;
881 		}
882 		unregister_netdevice(dev);
883 		err = 0;
884 		break;
885 
886 	default:
887 		err = -EINVAL;
888 	}
889 
890 done:
891 	return err;
892 }
893 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
894 
895 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
896 {
897 	struct ip_tunnel *tunnel = netdev_priv(dev);
898 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
899 	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
900 
901 	if (new_mtu < 68)
902 		return -EINVAL;
903 
904 	if (new_mtu > max_mtu) {
905 		if (strict)
906 			return -EINVAL;
907 
908 		new_mtu = max_mtu;
909 	}
910 
911 	dev->mtu = new_mtu;
912 	return 0;
913 }
914 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
915 
916 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
917 {
918 	return __ip_tunnel_change_mtu(dev, new_mtu, true);
919 }
920 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
921 
922 static void ip_tunnel_dev_free(struct net_device *dev)
923 {
924 	struct ip_tunnel *tunnel = netdev_priv(dev);
925 
926 	gro_cells_destroy(&tunnel->gro_cells);
927 	dst_cache_destroy(&tunnel->dst_cache);
928 	free_percpu(dev->tstats);
929 	free_netdev(dev);
930 }
931 
932 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
933 {
934 	struct ip_tunnel *tunnel = netdev_priv(dev);
935 	struct ip_tunnel_net *itn;
936 
937 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
938 
939 	if (itn->fb_tunnel_dev != dev) {
940 		ip_tunnel_del(itn, netdev_priv(dev));
941 		unregister_netdevice_queue(dev, head);
942 	}
943 }
944 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
945 
946 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
947 {
948 	struct ip_tunnel *tunnel = netdev_priv(dev);
949 
950 	return tunnel->net;
951 }
952 EXPORT_SYMBOL(ip_tunnel_get_link_net);
953 
954 int ip_tunnel_get_iflink(const struct net_device *dev)
955 {
956 	struct ip_tunnel *tunnel = netdev_priv(dev);
957 
958 	return tunnel->parms.link;
959 }
960 EXPORT_SYMBOL(ip_tunnel_get_iflink);
961 
962 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
963 				  struct rtnl_link_ops *ops, char *devname)
964 {
965 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
966 	struct ip_tunnel_parm parms;
967 	unsigned int i;
968 
969 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
970 		INIT_HLIST_HEAD(&itn->tunnels[i]);
971 
972 	if (!ops) {
973 		itn->fb_tunnel_dev = NULL;
974 		return 0;
975 	}
976 
977 	memset(&parms, 0, sizeof(parms));
978 	if (devname)
979 		strlcpy(parms.name, devname, IFNAMSIZ);
980 
981 	rtnl_lock();
982 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
983 	/* FB netdevice is special: we have one, and only one per netns.
984 	 * Allowing to move it to another netns is clearly unsafe.
985 	 */
986 	if (!IS_ERR(itn->fb_tunnel_dev)) {
987 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
988 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
989 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
990 	}
991 	rtnl_unlock();
992 
993 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
994 }
995 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
996 
997 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
998 			      struct rtnl_link_ops *ops)
999 {
1000 	struct net *net = dev_net(itn->fb_tunnel_dev);
1001 	struct net_device *dev, *aux;
1002 	int h;
1003 
1004 	for_each_netdev_safe(net, dev, aux)
1005 		if (dev->rtnl_link_ops == ops)
1006 			unregister_netdevice_queue(dev, head);
1007 
1008 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1009 		struct ip_tunnel *t;
1010 		struct hlist_node *n;
1011 		struct hlist_head *thead = &itn->tunnels[h];
1012 
1013 		hlist_for_each_entry_safe(t, n, thead, hash_node)
1014 			/* If dev is in the same netns, it has already
1015 			 * been added to the list by the previous loop.
1016 			 */
1017 			if (!net_eq(dev_net(t->dev), net))
1018 				unregister_netdevice_queue(t->dev, head);
1019 	}
1020 }
1021 
1022 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1023 {
1024 	LIST_HEAD(list);
1025 
1026 	rtnl_lock();
1027 	ip_tunnel_destroy(itn, &list, ops);
1028 	unregister_netdevice_many(&list);
1029 	rtnl_unlock();
1030 }
1031 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1032 
1033 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1034 		      struct ip_tunnel_parm *p)
1035 {
1036 	struct ip_tunnel *nt;
1037 	struct net *net = dev_net(dev);
1038 	struct ip_tunnel_net *itn;
1039 	int mtu;
1040 	int err;
1041 
1042 	nt = netdev_priv(dev);
1043 	itn = net_generic(net, nt->ip_tnl_net_id);
1044 
1045 	if (nt->collect_md) {
1046 		if (rtnl_dereference(itn->collect_md_tun))
1047 			return -EEXIST;
1048 	} else {
1049 		if (ip_tunnel_find(itn, p, dev->type))
1050 			return -EEXIST;
1051 	}
1052 
1053 	nt->net = net;
1054 	nt->parms = *p;
1055 	err = register_netdevice(dev);
1056 	if (err)
1057 		goto out;
1058 
1059 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1060 		eth_hw_addr_random(dev);
1061 
1062 	mtu = ip_tunnel_bind_dev(dev);
1063 	if (!tb[IFLA_MTU])
1064 		dev->mtu = mtu;
1065 
1066 	ip_tunnel_add(itn, nt);
1067 out:
1068 	return err;
1069 }
1070 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1071 
1072 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1073 			 struct ip_tunnel_parm *p)
1074 {
1075 	struct ip_tunnel *t;
1076 	struct ip_tunnel *tunnel = netdev_priv(dev);
1077 	struct net *net = tunnel->net;
1078 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1079 
1080 	if (dev == itn->fb_tunnel_dev)
1081 		return -EINVAL;
1082 
1083 	t = ip_tunnel_find(itn, p, dev->type);
1084 
1085 	if (t) {
1086 		if (t->dev != dev)
1087 			return -EEXIST;
1088 	} else {
1089 		t = tunnel;
1090 
1091 		if (dev->type != ARPHRD_ETHER) {
1092 			unsigned int nflags = 0;
1093 
1094 			if (ipv4_is_multicast(p->iph.daddr))
1095 				nflags = IFF_BROADCAST;
1096 			else if (p->iph.daddr)
1097 				nflags = IFF_POINTOPOINT;
1098 
1099 			if ((dev->flags ^ nflags) &
1100 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1101 				return -EINVAL;
1102 		}
1103 	}
1104 
1105 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1106 	return 0;
1107 }
1108 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1109 
1110 int ip_tunnel_init(struct net_device *dev)
1111 {
1112 	struct ip_tunnel *tunnel = netdev_priv(dev);
1113 	struct iphdr *iph = &tunnel->parms.iph;
1114 	int err;
1115 
1116 	dev->destructor	= ip_tunnel_dev_free;
1117 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1118 	if (!dev->tstats)
1119 		return -ENOMEM;
1120 
1121 	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1122 	if (err) {
1123 		free_percpu(dev->tstats);
1124 		return err;
1125 	}
1126 
1127 	err = gro_cells_init(&tunnel->gro_cells, dev);
1128 	if (err) {
1129 		dst_cache_destroy(&tunnel->dst_cache);
1130 		free_percpu(dev->tstats);
1131 		return err;
1132 	}
1133 
1134 	tunnel->dev = dev;
1135 	tunnel->net = dev_net(dev);
1136 	strcpy(tunnel->parms.name, dev->name);
1137 	iph->version		= 4;
1138 	iph->ihl		= 5;
1139 
1140 	if (tunnel->collect_md) {
1141 		dev->features |= NETIF_F_NETNS_LOCAL;
1142 		netif_keep_dst(dev);
1143 	}
1144 	return 0;
1145 }
1146 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1147 
1148 void ip_tunnel_uninit(struct net_device *dev)
1149 {
1150 	struct ip_tunnel *tunnel = netdev_priv(dev);
1151 	struct net *net = tunnel->net;
1152 	struct ip_tunnel_net *itn;
1153 
1154 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1155 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1156 	if (itn->fb_tunnel_dev != dev)
1157 		ip_tunnel_del(itn, netdev_priv(dev));
1158 
1159 	dst_cache_reset(&tunnel->dst_cache);
1160 }
1161 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1162 
1163 /* Do least required initialization, rest of init is done in tunnel_init call */
1164 void ip_tunnel_setup(struct net_device *dev, int net_id)
1165 {
1166 	struct ip_tunnel *tunnel = netdev_priv(dev);
1167 	tunnel->ip_tnl_net_id = net_id;
1168 }
1169 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1170 
1171 MODULE_LICENSE("GPL");
1172