xref: /linux/net/ipv4/ip_tunnel.c (revision 86287543715ac2a6d92d561cc105d79306511457)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2013 Nicira, Inc.
4  */
5 
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
16 #include <linux/in.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
30 
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
36 #include <net/arp.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
40 #include <net/xfrm.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
44 #include <net/udp.h>
45 #include <net/dst_metadata.h>
46 
47 #if IS_ENABLED(CONFIG_IPV6)
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52 
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
54 {
55 	return hash_32((__force u32)key ^ (__force u32)remote,
56 			 IP_TNL_HASH_BITS);
57 }
58 
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
60 				__be16 flags, __be32 key)
61 {
62 	if (p->i_flags & TUNNEL_KEY) {
63 		if (flags & TUNNEL_KEY)
64 			return key == p->i_key;
65 		else
66 			/* key expected, none present */
67 			return false;
68 	} else
69 		return !(flags & TUNNEL_KEY);
70 }
71 
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
84 				   int link, __be16 flags,
85 				   __be32 remote, __be32 local,
86 				   __be32 key)
87 {
88 	unsigned int hash;
89 	struct ip_tunnel *t, *cand = NULL;
90 	struct hlist_head *head;
91 
92 	hash = ip_tunnel_hash(key, remote);
93 	head = &itn->tunnels[hash];
94 
95 	hlist_for_each_entry_rcu(t, head, hash_node) {
96 		if (local != t->parms.iph.saddr ||
97 		    remote != t->parms.iph.daddr ||
98 		    !(t->dev->flags & IFF_UP))
99 			continue;
100 
101 		if (!ip_tunnel_key_match(&t->parms, flags, key))
102 			continue;
103 
104 		if (t->parms.link == link)
105 			return t;
106 		else
107 			cand = t;
108 	}
109 
110 	hlist_for_each_entry_rcu(t, head, hash_node) {
111 		if (remote != t->parms.iph.daddr ||
112 		    t->parms.iph.saddr != 0 ||
113 		    !(t->dev->flags & IFF_UP))
114 			continue;
115 
116 		if (!ip_tunnel_key_match(&t->parms, flags, key))
117 			continue;
118 
119 		if (t->parms.link == link)
120 			return t;
121 		else if (!cand)
122 			cand = t;
123 	}
124 
125 	hash = ip_tunnel_hash(key, 0);
126 	head = &itn->tunnels[hash];
127 
128 	hlist_for_each_entry_rcu(t, head, hash_node) {
129 		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
130 		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
131 			continue;
132 
133 		if (!(t->dev->flags & IFF_UP))
134 			continue;
135 
136 		if (!ip_tunnel_key_match(&t->parms, flags, key))
137 			continue;
138 
139 		if (t->parms.link == link)
140 			return t;
141 		else if (!cand)
142 			cand = t;
143 	}
144 
145 	hlist_for_each_entry_rcu(t, head, hash_node) {
146 		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
147 		    t->parms.iph.saddr != 0 ||
148 		    t->parms.iph.daddr != 0 ||
149 		    !(t->dev->flags & IFF_UP))
150 			continue;
151 
152 		if (t->parms.link == link)
153 			return t;
154 		else if (!cand)
155 			cand = t;
156 	}
157 
158 	if (cand)
159 		return cand;
160 
161 	t = rcu_dereference(itn->collect_md_tun);
162 	if (t && t->dev->flags & IFF_UP)
163 		return t;
164 
165 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
166 		return netdev_priv(itn->fb_tunnel_dev);
167 
168 	return NULL;
169 }
170 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
171 
172 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
173 				    struct ip_tunnel_parm *parms)
174 {
175 	unsigned int h;
176 	__be32 remote;
177 	__be32 i_key = parms->i_key;
178 
179 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
180 		remote = parms->iph.daddr;
181 	else
182 		remote = 0;
183 
184 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
185 		i_key = 0;
186 
187 	h = ip_tunnel_hash(i_key, remote);
188 	return &itn->tunnels[h];
189 }
190 
191 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
192 {
193 	struct hlist_head *head = ip_bucket(itn, &t->parms);
194 
195 	if (t->collect_md)
196 		rcu_assign_pointer(itn->collect_md_tun, t);
197 	hlist_add_head_rcu(&t->hash_node, head);
198 }
199 
200 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
201 {
202 	if (t->collect_md)
203 		rcu_assign_pointer(itn->collect_md_tun, NULL);
204 	hlist_del_init_rcu(&t->hash_node);
205 }
206 
207 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
208 					struct ip_tunnel_parm *parms,
209 					int type)
210 {
211 	__be32 remote = parms->iph.daddr;
212 	__be32 local = parms->iph.saddr;
213 	__be32 key = parms->i_key;
214 	__be16 flags = parms->i_flags;
215 	int link = parms->link;
216 	struct ip_tunnel *t = NULL;
217 	struct hlist_head *head = ip_bucket(itn, parms);
218 
219 	hlist_for_each_entry_rcu(t, head, hash_node) {
220 		if (local == t->parms.iph.saddr &&
221 		    remote == t->parms.iph.daddr &&
222 		    link == t->parms.link &&
223 		    type == t->dev->type &&
224 		    ip_tunnel_key_match(&t->parms, flags, key))
225 			break;
226 	}
227 	return t;
228 }
229 
230 static struct net_device *__ip_tunnel_create(struct net *net,
231 					     const struct rtnl_link_ops *ops,
232 					     struct ip_tunnel_parm *parms)
233 {
234 	int err;
235 	struct ip_tunnel *tunnel;
236 	struct net_device *dev;
237 	char name[IFNAMSIZ];
238 
239 	err = -E2BIG;
240 	if (parms->name[0]) {
241 		if (!dev_valid_name(parms->name))
242 			goto failed;
243 		strlcpy(name, parms->name, IFNAMSIZ);
244 	} else {
245 		if (strlen(ops->kind) > (IFNAMSIZ - 3))
246 			goto failed;
247 		strcpy(name, ops->kind);
248 		strcat(name, "%d");
249 	}
250 
251 	ASSERT_RTNL();
252 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
253 	if (!dev) {
254 		err = -ENOMEM;
255 		goto failed;
256 	}
257 	dev_net_set(dev, net);
258 
259 	dev->rtnl_link_ops = ops;
260 
261 	tunnel = netdev_priv(dev);
262 	tunnel->parms = *parms;
263 	tunnel->net = net;
264 
265 	err = register_netdevice(dev);
266 	if (err)
267 		goto failed_free;
268 
269 	return dev;
270 
271 failed_free:
272 	free_netdev(dev);
273 failed:
274 	return ERR_PTR(err);
275 }
276 
277 static int ip_tunnel_bind_dev(struct net_device *dev)
278 {
279 	struct net_device *tdev = NULL;
280 	struct ip_tunnel *tunnel = netdev_priv(dev);
281 	const struct iphdr *iph;
282 	int hlen = LL_MAX_HEADER;
283 	int mtu = ETH_DATA_LEN;
284 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
285 
286 	iph = &tunnel->parms.iph;
287 
288 	/* Guess output device to choose reasonable mtu and needed_headroom */
289 	if (iph->daddr) {
290 		struct flowi4 fl4;
291 		struct rtable *rt;
292 
293 		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
294 				    iph->saddr, tunnel->parms.o_key,
295 				    RT_TOS(iph->tos), tunnel->parms.link,
296 				    tunnel->fwmark, 0);
297 		rt = ip_route_output_key(tunnel->net, &fl4);
298 
299 		if (!IS_ERR(rt)) {
300 			tdev = rt->dst.dev;
301 			ip_rt_put(rt);
302 		}
303 		if (dev->type != ARPHRD_ETHER)
304 			dev->flags |= IFF_POINTOPOINT;
305 
306 		dst_cache_reset(&tunnel->dst_cache);
307 	}
308 
309 	if (!tdev && tunnel->parms.link)
310 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
311 
312 	if (tdev) {
313 		hlen = tdev->hard_header_len + tdev->needed_headroom;
314 		mtu = min(tdev->mtu, IP_MAX_MTU);
315 	}
316 
317 	dev->needed_headroom = t_hlen + hlen;
318 	mtu -= (dev->hard_header_len + t_hlen);
319 
320 	if (mtu < IPV4_MIN_MTU)
321 		mtu = IPV4_MIN_MTU;
322 
323 	return mtu;
324 }
325 
326 static struct ip_tunnel *ip_tunnel_create(struct net *net,
327 					  struct ip_tunnel_net *itn,
328 					  struct ip_tunnel_parm *parms)
329 {
330 	struct ip_tunnel *nt;
331 	struct net_device *dev;
332 	int t_hlen;
333 	int mtu;
334 	int err;
335 
336 	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
337 	if (IS_ERR(dev))
338 		return ERR_CAST(dev);
339 
340 	mtu = ip_tunnel_bind_dev(dev);
341 	err = dev_set_mtu(dev, mtu);
342 	if (err)
343 		goto err_dev_set_mtu;
344 
345 	nt = netdev_priv(dev);
346 	t_hlen = nt->hlen + sizeof(struct iphdr);
347 	dev->min_mtu = ETH_MIN_MTU;
348 	dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
349 	ip_tunnel_add(itn, nt);
350 	return nt;
351 
352 err_dev_set_mtu:
353 	unregister_netdevice(dev);
354 	return ERR_PTR(err);
355 }
356 
357 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
358 		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
359 		  bool log_ecn_error)
360 {
361 	struct pcpu_sw_netstats *tstats;
362 	const struct iphdr *iph = ip_hdr(skb);
363 	int err;
364 
365 #ifdef CONFIG_NET_IPGRE_BROADCAST
366 	if (ipv4_is_multicast(iph->daddr)) {
367 		tunnel->dev->stats.multicast++;
368 		skb->pkt_type = PACKET_BROADCAST;
369 	}
370 #endif
371 
372 	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
373 	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
374 		tunnel->dev->stats.rx_crc_errors++;
375 		tunnel->dev->stats.rx_errors++;
376 		goto drop;
377 	}
378 
379 	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
380 		if (!(tpi->flags&TUNNEL_SEQ) ||
381 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
382 			tunnel->dev->stats.rx_fifo_errors++;
383 			tunnel->dev->stats.rx_errors++;
384 			goto drop;
385 		}
386 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
387 	}
388 
389 	skb_reset_network_header(skb);
390 
391 	err = IP_ECN_decapsulate(iph, skb);
392 	if (unlikely(err)) {
393 		if (log_ecn_error)
394 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
395 					&iph->saddr, iph->tos);
396 		if (err > 1) {
397 			++tunnel->dev->stats.rx_frame_errors;
398 			++tunnel->dev->stats.rx_errors;
399 			goto drop;
400 		}
401 	}
402 
403 	tstats = this_cpu_ptr(tunnel->dev->tstats);
404 	u64_stats_update_begin(&tstats->syncp);
405 	tstats->rx_packets++;
406 	tstats->rx_bytes += skb->len;
407 	u64_stats_update_end(&tstats->syncp);
408 
409 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
410 
411 	if (tunnel->dev->type == ARPHRD_ETHER) {
412 		skb->protocol = eth_type_trans(skb, tunnel->dev);
413 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
414 	} else {
415 		skb->dev = tunnel->dev;
416 	}
417 
418 	if (tun_dst)
419 		skb_dst_set(skb, (struct dst_entry *)tun_dst);
420 
421 	gro_cells_receive(&tunnel->gro_cells, skb);
422 	return 0;
423 
424 drop:
425 	if (tun_dst)
426 		dst_release((struct dst_entry *)tun_dst);
427 	kfree_skb(skb);
428 	return 0;
429 }
430 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
431 
432 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
433 			    unsigned int num)
434 {
435 	if (num >= MAX_IPTUN_ENCAP_OPS)
436 		return -ERANGE;
437 
438 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
439 			&iptun_encaps[num],
440 			NULL, ops) ? 0 : -1;
441 }
442 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
443 
444 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
445 			    unsigned int num)
446 {
447 	int ret;
448 
449 	if (num >= MAX_IPTUN_ENCAP_OPS)
450 		return -ERANGE;
451 
452 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
453 		       &iptun_encaps[num],
454 		       ops, NULL) == ops) ? 0 : -1;
455 
456 	synchronize_net();
457 
458 	return ret;
459 }
460 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
461 
462 int ip_tunnel_encap_setup(struct ip_tunnel *t,
463 			  struct ip_tunnel_encap *ipencap)
464 {
465 	int hlen;
466 
467 	memset(&t->encap, 0, sizeof(t->encap));
468 
469 	hlen = ip_encap_hlen(ipencap);
470 	if (hlen < 0)
471 		return hlen;
472 
473 	t->encap.type = ipencap->type;
474 	t->encap.sport = ipencap->sport;
475 	t->encap.dport = ipencap->dport;
476 	t->encap.flags = ipencap->flags;
477 
478 	t->encap_hlen = hlen;
479 	t->hlen = t->encap_hlen + t->tun_hlen;
480 
481 	return 0;
482 }
483 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
484 
485 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
486 			    struct rtable *rt, __be16 df,
487 			    const struct iphdr *inner_iph,
488 			    int tunnel_hlen, __be32 dst, bool md)
489 {
490 	struct ip_tunnel *tunnel = netdev_priv(dev);
491 	int pkt_size;
492 	int mtu;
493 
494 	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
495 	pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;
496 
497 	if (df)
498 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
499 					- sizeof(struct iphdr) - tunnel_hlen;
500 	else
501 		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
502 
503 	if (skb_valid_dst(skb))
504 		skb_dst_update_pmtu_no_confirm(skb, mtu);
505 
506 	if (skb->protocol == htons(ETH_P_IP)) {
507 		if (!skb_is_gso(skb) &&
508 		    (inner_iph->frag_off & htons(IP_DF)) &&
509 		    mtu < pkt_size) {
510 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
511 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
512 			return -E2BIG;
513 		}
514 	}
515 #if IS_ENABLED(CONFIG_IPV6)
516 	else if (skb->protocol == htons(ETH_P_IPV6)) {
517 		struct rt6_info *rt6;
518 		__be32 daddr;
519 
520 		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
521 					   NULL;
522 		daddr = md ? dst : tunnel->parms.iph.daddr;
523 
524 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
525 			   mtu >= IPV6_MIN_MTU) {
526 			if ((daddr && !ipv4_is_multicast(daddr)) ||
527 			    rt6->rt6i_dst.plen == 128) {
528 				rt6->rt6i_flags |= RTF_MODIFIED;
529 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
530 			}
531 		}
532 
533 		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
534 					mtu < pkt_size) {
535 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
536 			return -E2BIG;
537 		}
538 	}
539 #endif
540 	return 0;
541 }
542 
543 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
544 		       u8 proto, int tunnel_hlen)
545 {
546 	struct ip_tunnel *tunnel = netdev_priv(dev);
547 	u32 headroom = sizeof(struct iphdr);
548 	struct ip_tunnel_info *tun_info;
549 	const struct ip_tunnel_key *key;
550 	const struct iphdr *inner_iph;
551 	struct rtable *rt = NULL;
552 	struct flowi4 fl4;
553 	__be16 df = 0;
554 	u8 tos, ttl;
555 	bool use_cache;
556 
557 	tun_info = skb_tunnel_info(skb);
558 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
559 		     ip_tunnel_info_af(tun_info) != AF_INET))
560 		goto tx_error;
561 	key = &tun_info->key;
562 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
563 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
564 	tos = key->tos;
565 	if (tos == 1) {
566 		if (skb->protocol == htons(ETH_P_IP))
567 			tos = inner_iph->tos;
568 		else if (skb->protocol == htons(ETH_P_IPV6))
569 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
570 	}
571 	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
572 			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
573 			    0, skb->mark, skb_get_hash(skb));
574 	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
575 		goto tx_error;
576 
577 	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
578 	if (use_cache)
579 		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
580 	if (!rt) {
581 		rt = ip_route_output_key(tunnel->net, &fl4);
582 		if (IS_ERR(rt)) {
583 			dev->stats.tx_carrier_errors++;
584 			goto tx_error;
585 		}
586 		if (use_cache)
587 			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
588 					  fl4.saddr);
589 	}
590 	if (rt->dst.dev == dev) {
591 		ip_rt_put(rt);
592 		dev->stats.collisions++;
593 		goto tx_error;
594 	}
595 
596 	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
597 		df = htons(IP_DF);
598 	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
599 			    key->u.ipv4.dst, true)) {
600 		ip_rt_put(rt);
601 		goto tx_error;
602 	}
603 
604 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
605 	ttl = key->ttl;
606 	if (ttl == 0) {
607 		if (skb->protocol == htons(ETH_P_IP))
608 			ttl = inner_iph->ttl;
609 		else if (skb->protocol == htons(ETH_P_IPV6))
610 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
611 		else
612 			ttl = ip4_dst_hoplimit(&rt->dst);
613 	}
614 
615 	if (!df && skb->protocol == htons(ETH_P_IP))
616 		df = inner_iph->frag_off & htons(IP_DF);
617 
618 	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
619 	if (headroom > dev->needed_headroom)
620 		dev->needed_headroom = headroom;
621 
622 	if (skb_cow_head(skb, dev->needed_headroom)) {
623 		ip_rt_put(rt);
624 		goto tx_dropped;
625 	}
626 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
627 		      df, !net_eq(tunnel->net, dev_net(dev)));
628 	return;
629 tx_error:
630 	dev->stats.tx_errors++;
631 	goto kfree;
632 tx_dropped:
633 	dev->stats.tx_dropped++;
634 kfree:
635 	kfree_skb(skb);
636 }
637 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
638 
639 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
640 		    const struct iphdr *tnl_params, u8 protocol)
641 {
642 	struct ip_tunnel *tunnel = netdev_priv(dev);
643 	struct ip_tunnel_info *tun_info = NULL;
644 	const struct iphdr *inner_iph;
645 	unsigned int max_headroom;	/* The extra header space needed */
646 	struct rtable *rt = NULL;		/* Route to the other host */
647 	bool use_cache = false;
648 	struct flowi4 fl4;
649 	bool md = false;
650 	bool connected;
651 	u8 tos, ttl;
652 	__be32 dst;
653 	__be16 df;
654 
655 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
656 	connected = (tunnel->parms.iph.daddr != 0);
657 
658 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
659 
660 	dst = tnl_params->daddr;
661 	if (dst == 0) {
662 		/* NBMA tunnel */
663 
664 		if (!skb_dst(skb)) {
665 			dev->stats.tx_fifo_errors++;
666 			goto tx_error;
667 		}
668 
669 		tun_info = skb_tunnel_info(skb);
670 		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
671 		    ip_tunnel_info_af(tun_info) == AF_INET &&
672 		    tun_info->key.u.ipv4.dst) {
673 			dst = tun_info->key.u.ipv4.dst;
674 			md = true;
675 			connected = true;
676 		}
677 		else if (skb->protocol == htons(ETH_P_IP)) {
678 			rt = skb_rtable(skb);
679 			dst = rt_nexthop(rt, inner_iph->daddr);
680 		}
681 #if IS_ENABLED(CONFIG_IPV6)
682 		else if (skb->protocol == htons(ETH_P_IPV6)) {
683 			const struct in6_addr *addr6;
684 			struct neighbour *neigh;
685 			bool do_tx_error_icmp;
686 			int addr_type;
687 
688 			neigh = dst_neigh_lookup(skb_dst(skb),
689 						 &ipv6_hdr(skb)->daddr);
690 			if (!neigh)
691 				goto tx_error;
692 
693 			addr6 = (const struct in6_addr *)&neigh->primary_key;
694 			addr_type = ipv6_addr_type(addr6);
695 
696 			if (addr_type == IPV6_ADDR_ANY) {
697 				addr6 = &ipv6_hdr(skb)->daddr;
698 				addr_type = ipv6_addr_type(addr6);
699 			}
700 
701 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
702 				do_tx_error_icmp = true;
703 			else {
704 				do_tx_error_icmp = false;
705 				dst = addr6->s6_addr32[3];
706 			}
707 			neigh_release(neigh);
708 			if (do_tx_error_icmp)
709 				goto tx_error_icmp;
710 		}
711 #endif
712 		else
713 			goto tx_error;
714 
715 		if (!md)
716 			connected = false;
717 	}
718 
719 	tos = tnl_params->tos;
720 	if (tos & 0x1) {
721 		tos &= ~0x1;
722 		if (skb->protocol == htons(ETH_P_IP)) {
723 			tos = inner_iph->tos;
724 			connected = false;
725 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
726 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
727 			connected = false;
728 		}
729 	}
730 
731 	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
732 			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
733 			    tunnel->fwmark, skb_get_hash(skb));
734 
735 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
736 		goto tx_error;
737 
738 	if (connected && md) {
739 		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
740 		if (use_cache)
741 			rt = dst_cache_get_ip4(&tun_info->dst_cache,
742 					       &fl4.saddr);
743 	} else {
744 		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
745 						&fl4.saddr) : NULL;
746 	}
747 
748 	if (!rt) {
749 		rt = ip_route_output_key(tunnel->net, &fl4);
750 
751 		if (IS_ERR(rt)) {
752 			dev->stats.tx_carrier_errors++;
753 			goto tx_error;
754 		}
755 		if (use_cache)
756 			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
757 					  fl4.saddr);
758 		else if (!md && connected)
759 			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
760 					  fl4.saddr);
761 	}
762 
763 	if (rt->dst.dev == dev) {
764 		ip_rt_put(rt);
765 		dev->stats.collisions++;
766 		goto tx_error;
767 	}
768 
769 	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
770 			    0, 0, false)) {
771 		ip_rt_put(rt);
772 		goto tx_error;
773 	}
774 
775 	if (tunnel->err_count > 0) {
776 		if (time_before(jiffies,
777 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
778 			tunnel->err_count--;
779 
780 			dst_link_failure(skb);
781 		} else
782 			tunnel->err_count = 0;
783 	}
784 
785 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
786 	ttl = tnl_params->ttl;
787 	if (ttl == 0) {
788 		if (skb->protocol == htons(ETH_P_IP))
789 			ttl = inner_iph->ttl;
790 #if IS_ENABLED(CONFIG_IPV6)
791 		else if (skb->protocol == htons(ETH_P_IPV6))
792 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
793 #endif
794 		else
795 			ttl = ip4_dst_hoplimit(&rt->dst);
796 	}
797 
798 	df = tnl_params->frag_off;
799 	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
800 		df |= (inner_iph->frag_off&htons(IP_DF));
801 
802 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
803 			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
804 	if (max_headroom > dev->needed_headroom)
805 		dev->needed_headroom = max_headroom;
806 
807 	if (skb_cow_head(skb, dev->needed_headroom)) {
808 		ip_rt_put(rt);
809 		dev->stats.tx_dropped++;
810 		kfree_skb(skb);
811 		return;
812 	}
813 
814 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
815 		      df, !net_eq(tunnel->net, dev_net(dev)));
816 	return;
817 
818 #if IS_ENABLED(CONFIG_IPV6)
819 tx_error_icmp:
820 	dst_link_failure(skb);
821 #endif
822 tx_error:
823 	dev->stats.tx_errors++;
824 	kfree_skb(skb);
825 }
826 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
827 
828 static void ip_tunnel_update(struct ip_tunnel_net *itn,
829 			     struct ip_tunnel *t,
830 			     struct net_device *dev,
831 			     struct ip_tunnel_parm *p,
832 			     bool set_mtu,
833 			     __u32 fwmark)
834 {
835 	ip_tunnel_del(itn, t);
836 	t->parms.iph.saddr = p->iph.saddr;
837 	t->parms.iph.daddr = p->iph.daddr;
838 	t->parms.i_key = p->i_key;
839 	t->parms.o_key = p->o_key;
840 	if (dev->type != ARPHRD_ETHER) {
841 		memcpy(dev->dev_addr, &p->iph.saddr, 4);
842 		memcpy(dev->broadcast, &p->iph.daddr, 4);
843 	}
844 	ip_tunnel_add(itn, t);
845 
846 	t->parms.iph.ttl = p->iph.ttl;
847 	t->parms.iph.tos = p->iph.tos;
848 	t->parms.iph.frag_off = p->iph.frag_off;
849 
850 	if (t->parms.link != p->link || t->fwmark != fwmark) {
851 		int mtu;
852 
853 		t->parms.link = p->link;
854 		t->fwmark = fwmark;
855 		mtu = ip_tunnel_bind_dev(dev);
856 		if (set_mtu)
857 			dev->mtu = mtu;
858 	}
859 	dst_cache_reset(&t->dst_cache);
860 	netdev_state_change(dev);
861 }
862 
863 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
864 {
865 	int err = 0;
866 	struct ip_tunnel *t = netdev_priv(dev);
867 	struct net *net = t->net;
868 	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
869 
870 	switch (cmd) {
871 	case SIOCGETTUNNEL:
872 		if (dev == itn->fb_tunnel_dev) {
873 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
874 			if (!t)
875 				t = netdev_priv(dev);
876 		}
877 		memcpy(p, &t->parms, sizeof(*p));
878 		break;
879 
880 	case SIOCADDTUNNEL:
881 	case SIOCCHGTUNNEL:
882 		err = -EPERM;
883 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
884 			goto done;
885 		if (p->iph.ttl)
886 			p->iph.frag_off |= htons(IP_DF);
887 		if (!(p->i_flags & VTI_ISVTI)) {
888 			if (!(p->i_flags & TUNNEL_KEY))
889 				p->i_key = 0;
890 			if (!(p->o_flags & TUNNEL_KEY))
891 				p->o_key = 0;
892 		}
893 
894 		t = ip_tunnel_find(itn, p, itn->type);
895 
896 		if (cmd == SIOCADDTUNNEL) {
897 			if (!t) {
898 				t = ip_tunnel_create(net, itn, p);
899 				err = PTR_ERR_OR_ZERO(t);
900 				break;
901 			}
902 
903 			err = -EEXIST;
904 			break;
905 		}
906 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
907 			if (t) {
908 				if (t->dev != dev) {
909 					err = -EEXIST;
910 					break;
911 				}
912 			} else {
913 				unsigned int nflags = 0;
914 
915 				if (ipv4_is_multicast(p->iph.daddr))
916 					nflags = IFF_BROADCAST;
917 				else if (p->iph.daddr)
918 					nflags = IFF_POINTOPOINT;
919 
920 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
921 					err = -EINVAL;
922 					break;
923 				}
924 
925 				t = netdev_priv(dev);
926 			}
927 		}
928 
929 		if (t) {
930 			err = 0;
931 			ip_tunnel_update(itn, t, dev, p, true, 0);
932 		} else {
933 			err = -ENOENT;
934 		}
935 		break;
936 
937 	case SIOCDELTUNNEL:
938 		err = -EPERM;
939 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
940 			goto done;
941 
942 		if (dev == itn->fb_tunnel_dev) {
943 			err = -ENOENT;
944 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
945 			if (!t)
946 				goto done;
947 			err = -EPERM;
948 			if (t == netdev_priv(itn->fb_tunnel_dev))
949 				goto done;
950 			dev = t->dev;
951 		}
952 		unregister_netdevice(dev);
953 		err = 0;
954 		break;
955 
956 	default:
957 		err = -EINVAL;
958 	}
959 
960 done:
961 	return err;
962 }
963 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
964 
965 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
966 {
967 	struct ip_tunnel *tunnel = netdev_priv(dev);
968 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
969 	int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
970 
971 	if (new_mtu < ETH_MIN_MTU)
972 		return -EINVAL;
973 
974 	if (new_mtu > max_mtu) {
975 		if (strict)
976 			return -EINVAL;
977 
978 		new_mtu = max_mtu;
979 	}
980 
981 	dev->mtu = new_mtu;
982 	return 0;
983 }
984 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
985 
986 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
987 {
988 	return __ip_tunnel_change_mtu(dev, new_mtu, true);
989 }
990 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
991 
992 static void ip_tunnel_dev_free(struct net_device *dev)
993 {
994 	struct ip_tunnel *tunnel = netdev_priv(dev);
995 
996 	gro_cells_destroy(&tunnel->gro_cells);
997 	dst_cache_destroy(&tunnel->dst_cache);
998 	free_percpu(dev->tstats);
999 }
1000 
1001 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1002 {
1003 	struct ip_tunnel *tunnel = netdev_priv(dev);
1004 	struct ip_tunnel_net *itn;
1005 
1006 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1007 
1008 	if (itn->fb_tunnel_dev != dev) {
1009 		ip_tunnel_del(itn, netdev_priv(dev));
1010 		unregister_netdevice_queue(dev, head);
1011 	}
1012 }
1013 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1014 
1015 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1016 {
1017 	struct ip_tunnel *tunnel = netdev_priv(dev);
1018 
1019 	return tunnel->net;
1020 }
1021 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1022 
1023 int ip_tunnel_get_iflink(const struct net_device *dev)
1024 {
1025 	struct ip_tunnel *tunnel = netdev_priv(dev);
1026 
1027 	return tunnel->parms.link;
1028 }
1029 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1030 
1031 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1032 				  struct rtnl_link_ops *ops, char *devname)
1033 {
1034 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1035 	struct ip_tunnel_parm parms;
1036 	unsigned int i;
1037 
1038 	itn->rtnl_link_ops = ops;
1039 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1040 		INIT_HLIST_HEAD(&itn->tunnels[i]);
1041 
1042 	if (!ops || !net_has_fallback_tunnels(net)) {
1043 		struct ip_tunnel_net *it_init_net;
1044 
1045 		it_init_net = net_generic(&init_net, ip_tnl_net_id);
1046 		itn->type = it_init_net->type;
1047 		itn->fb_tunnel_dev = NULL;
1048 		return 0;
1049 	}
1050 
1051 	memset(&parms, 0, sizeof(parms));
1052 	if (devname)
1053 		strlcpy(parms.name, devname, IFNAMSIZ);
1054 
1055 	rtnl_lock();
1056 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1057 	/* FB netdevice is special: we have one, and only one per netns.
1058 	 * Allowing to move it to another netns is clearly unsafe.
1059 	 */
1060 	if (!IS_ERR(itn->fb_tunnel_dev)) {
1061 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1062 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1063 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1064 		itn->type = itn->fb_tunnel_dev->type;
1065 	}
1066 	rtnl_unlock();
1067 
1068 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1069 }
1070 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1071 
1072 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1073 			      struct list_head *head,
1074 			      struct rtnl_link_ops *ops)
1075 {
1076 	struct net_device *dev, *aux;
1077 	int h;
1078 
1079 	for_each_netdev_safe(net, dev, aux)
1080 		if (dev->rtnl_link_ops == ops)
1081 			unregister_netdevice_queue(dev, head);
1082 
1083 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1084 		struct ip_tunnel *t;
1085 		struct hlist_node *n;
1086 		struct hlist_head *thead = &itn->tunnels[h];
1087 
1088 		hlist_for_each_entry_safe(t, n, thead, hash_node)
1089 			/* If dev is in the same netns, it has already
1090 			 * been added to the list by the previous loop.
1091 			 */
1092 			if (!net_eq(dev_net(t->dev), net))
1093 				unregister_netdevice_queue(t->dev, head);
1094 	}
1095 }
1096 
1097 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1098 			   struct rtnl_link_ops *ops)
1099 {
1100 	struct ip_tunnel_net *itn;
1101 	struct net *net;
1102 	LIST_HEAD(list);
1103 
1104 	rtnl_lock();
1105 	list_for_each_entry(net, net_list, exit_list) {
1106 		itn = net_generic(net, id);
1107 		ip_tunnel_destroy(net, itn, &list, ops);
1108 	}
1109 	unregister_netdevice_many(&list);
1110 	rtnl_unlock();
1111 }
1112 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1113 
1114 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1115 		      struct ip_tunnel_parm *p, __u32 fwmark)
1116 {
1117 	struct ip_tunnel *nt;
1118 	struct net *net = dev_net(dev);
1119 	struct ip_tunnel_net *itn;
1120 	int mtu;
1121 	int err;
1122 
1123 	nt = netdev_priv(dev);
1124 	itn = net_generic(net, nt->ip_tnl_net_id);
1125 
1126 	if (nt->collect_md) {
1127 		if (rtnl_dereference(itn->collect_md_tun))
1128 			return -EEXIST;
1129 	} else {
1130 		if (ip_tunnel_find(itn, p, dev->type))
1131 			return -EEXIST;
1132 	}
1133 
1134 	nt->net = net;
1135 	nt->parms = *p;
1136 	nt->fwmark = fwmark;
1137 	err = register_netdevice(dev);
1138 	if (err)
1139 		goto err_register_netdevice;
1140 
1141 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1142 		eth_hw_addr_random(dev);
1143 
1144 	mtu = ip_tunnel_bind_dev(dev);
1145 	if (tb[IFLA_MTU]) {
1146 		unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
1147 
1148 		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
1149 			    (unsigned int)(max - sizeof(struct iphdr)));
1150 	}
1151 
1152 	err = dev_set_mtu(dev, mtu);
1153 	if (err)
1154 		goto err_dev_set_mtu;
1155 
1156 	ip_tunnel_add(itn, nt);
1157 	return 0;
1158 
1159 err_dev_set_mtu:
1160 	unregister_netdevice(dev);
1161 err_register_netdevice:
1162 	return err;
1163 }
1164 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1165 
1166 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1167 			 struct ip_tunnel_parm *p, __u32 fwmark)
1168 {
1169 	struct ip_tunnel *t;
1170 	struct ip_tunnel *tunnel = netdev_priv(dev);
1171 	struct net *net = tunnel->net;
1172 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1173 
1174 	if (dev == itn->fb_tunnel_dev)
1175 		return -EINVAL;
1176 
1177 	t = ip_tunnel_find(itn, p, dev->type);
1178 
1179 	if (t) {
1180 		if (t->dev != dev)
1181 			return -EEXIST;
1182 	} else {
1183 		t = tunnel;
1184 
1185 		if (dev->type != ARPHRD_ETHER) {
1186 			unsigned int nflags = 0;
1187 
1188 			if (ipv4_is_multicast(p->iph.daddr))
1189 				nflags = IFF_BROADCAST;
1190 			else if (p->iph.daddr)
1191 				nflags = IFF_POINTOPOINT;
1192 
1193 			if ((dev->flags ^ nflags) &
1194 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1195 				return -EINVAL;
1196 		}
1197 	}
1198 
1199 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1200 	return 0;
1201 }
1202 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1203 
1204 int ip_tunnel_init(struct net_device *dev)
1205 {
1206 	struct ip_tunnel *tunnel = netdev_priv(dev);
1207 	struct iphdr *iph = &tunnel->parms.iph;
1208 	int err;
1209 
1210 	dev->needs_free_netdev = true;
1211 	dev->priv_destructor = ip_tunnel_dev_free;
1212 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1213 	if (!dev->tstats)
1214 		return -ENOMEM;
1215 
1216 	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1217 	if (err) {
1218 		free_percpu(dev->tstats);
1219 		return err;
1220 	}
1221 
1222 	err = gro_cells_init(&tunnel->gro_cells, dev);
1223 	if (err) {
1224 		dst_cache_destroy(&tunnel->dst_cache);
1225 		free_percpu(dev->tstats);
1226 		return err;
1227 	}
1228 
1229 	tunnel->dev = dev;
1230 	tunnel->net = dev_net(dev);
1231 	strcpy(tunnel->parms.name, dev->name);
1232 	iph->version		= 4;
1233 	iph->ihl		= 5;
1234 
1235 	if (tunnel->collect_md)
1236 		netif_keep_dst(dev);
1237 	return 0;
1238 }
1239 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1240 
1241 void ip_tunnel_uninit(struct net_device *dev)
1242 {
1243 	struct ip_tunnel *tunnel = netdev_priv(dev);
1244 	struct net *net = tunnel->net;
1245 	struct ip_tunnel_net *itn;
1246 
1247 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1248 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1249 	if (itn->fb_tunnel_dev != dev)
1250 		ip_tunnel_del(itn, netdev_priv(dev));
1251 
1252 	dst_cache_reset(&tunnel->dst_cache);
1253 }
1254 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1255 
1256 /* Do least required initialization, rest of init is done in tunnel_init call */
1257 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1258 {
1259 	struct ip_tunnel *tunnel = netdev_priv(dev);
1260 	tunnel->ip_tnl_net_id = net_id;
1261 }
1262 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1263 
1264 MODULE_LICENSE("GPL");
1265