/* xref: /linux/net/ipv4/ip_tunnel.c (revision a5d9265e017f081f0dc133c0e2f45103d027b874) */
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}
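
/*
 * Illustrative note (not in the original source): ip_tunnel_hash()
 * folds the XOR of key and remote address into IP_TNL_HASH_BITS bits
 * (7 at this revision, i.e. 128 buckets), so ip_tunnel_lookup() and
 * ip_bucket() below can locate a tunnel's hlist bucket in O(1), e.g.:
 *
 *	struct hlist_head *head;
 *
 *	head = &itn->tunnels[ip_tunnel_hash(key, remote)];
 */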

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}
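
/*
 * Illustrative note (not in the original source): the cases handled
 * above are:
 *
 *	tunnel has key?   packet has key?   result
 *	yes               yes               key == p->i_key
 *	yes               no                false
 *	no                yes               false
 *	no                no                true
 */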

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against configured keyless tunnels,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
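
/*
 * Illustrative note (not in the original source): the four passes above
 * implement a best-match order of decreasing specificity:
 *
 *	1. (local, remote, key, link)    exact match
 *	2. (remote, key, link)           source wildcarded
 *	3. (local-or-mcast, key, link)   destination wildcarded
 *	4. (key, link)                   both addresses wildcarded
 *
 * (pass 4 is skipped entirely for TUNNEL_NO_KEY packets). A tunnel that
 * matches on everything except parms.link is remembered as "cand" and
 * returned only if no pass produces an exact link match; failing that,
 * the collect_md tunnel and then the fallback device are used.
 */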

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
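
/*
 * Illustrative note (not in the original source): for a plain GRE
 * tunnel over a 1500-byte-MTU underlay (tunnel->hlen == 4 for the
 * basic GRE header, and assuming dev->hard_header_len == 0), the
 * arithmetic above gives
 *
 *	t_hlen = 4 + 20 = 24
 *	mtu    = 1500 - (0 + 24) = 1476
 *
 * which is the conventional GRE tunnel MTU.
 */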

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
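
/*
 * Illustrative note (not in the original source): the TUNNEL_SEQ check
 * above uses wrap-safe serial arithmetic; the u32 difference is cast to
 * s32, so e.g. a packet with seq 0x00000001 arriving when i_seqno is
 * 0xffffffff is accepted ((s32)0x00000002 > 0) rather than rejected as
 * a huge backwards jump.
 */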

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
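
/*
 * Illustrative sketch (not in the original source): an encapsulation
 * module such as FOU registers itself against one of the fixed
 * TUNNEL_ENCAP_* slots, roughly:
 *
 *	static const struct ip_tunnel_encap_ops fou_iptun_ops = {
 *		.encap_hlen	= fou_encap_hlen,
 *		.build_header	= fou_build_header,
 *	};
 *
 *	err = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
 *
 * add_ops returns -1 if the slot is already taken (cmpxchg saw
 * non-NULL); del_ops calls synchronize_net() so that concurrent
 * readers drain before the module can go away.
 */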

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel_hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	skb_dst_update_pmtu(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
		__be32 daddr;

		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
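
/*
 * Illustrative note (not in the original source): with DF set, an
 * underlay path MTU of 1500 and a 4-byte GRE tunnel header, and
 * assuming dev->hard_header_len == 0, the arithmetic above yields
 *
 *	mtu = 1500 - 0 - 20 - 4 = 1476
 *
 * so a 1500-byte inner IPv4 packet carrying IP_DF is answered with
 * ICMP_FRAG_NEEDED advertising 1476, and -E2BIG tells the caller to
 * drop the packet instead of transmitting it.
 */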

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark);
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	if (!df && skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
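
/*
 * Illustrative note (not in the original source): this path serves
 * metadata-based ("external"/collect_md) devices, where the outer
 * addresses, key, tos and ttl come from the per-packet
 * struct ip_tunnel_info rather than from tunnel->parms, e.g. a device
 * created with
 *
 *	ip link add dev gre1 type gretap external
 *
 * and steered by OVS, eBPF or lwtunnel-set tunnel metadata.
 */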

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */
		struct ip_tunnel_info *tun_info;

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst)
			dst = tun_info->key.u.ipv4.dst;
		else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
			    0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
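
/*
 * Illustrative note (not in the original source): a tnl_params->tos
 * with bit 0 set is the "inherit" encoding (TOS value 1 as configured
 * from userspace): the bit is cleared and the outer TOS/DSFIELD is
 * copied from the inner IPv4 or IPv6 header. This also disqualifies
 * the flow from the connected-route dst_cache, since the route may
 * then vary per packet.
 */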

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
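
/*
 * Illustrative sketch (not in the original source): userspace reaches
 * this handler via the SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/
 * SIOCDELTUNNEL ioctls on the tunnel (or fallback) device, roughly as
 * the ip(8) tunnel code does:
 *
 *	struct ip_tunnel_parm p = { };
 *	struct ifreq ifr = { };
 *
 *	strncpy(ifr.ifr_name, "gre0", IFNAMSIZ - 1);
 *	ifr.ifr_data = (void *)&p;
 *	if (ioctl(sockfd, SIOCGETTUNNEL, &ifr) == 0)
 *		;	// p now holds the tunnel parameters
 *				// (sockfd: any AF_INET datagram socket)
 */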

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* The FB netdevice is special: there is one, and only one, per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
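
/*
 * Illustrative sketch (not in the original source): protocol modules
 * call this from their pernet init hook; e.g. net/ipv4/ipip.c does,
 * approximately:
 *
 *	static int __net_init ipip_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, ipip_net_id,
 *					  &ipip_link_ops, "tunl0");
 *	}
 */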

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
			    (unsigned int)(max - sizeof(struct iphdr)));
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization here; the rest is done in the tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");