xref: /linux/net/ipv4/ip_tunnel.c (revision 00a6d7b6762c27d441e9ac8faff36384bc0fc180)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44 
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58 
59 #if IS_ENABLED(CONFIG_IPV6)
60 #include <net/ipv6.h>
61 #include <net/ip6_fib.h>
62 #include <net/ip6_route.h>
63 #endif
64 
65 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
66 {
67 	return hash_32((__force u32)key ^ (__force u32)remote,
68 			 IP_TNL_HASH_BITS);
69 }
70 
/* Replace the cached route in @idst with @dst.
 *
 * A reference on the new dst is taken with dst_clone() before it is
 * published.  Routes flagged DST_NOCACHE must not be cached, so NULL is
 * stored instead.  The old entry is swapped out atomically with xchg()
 * so concurrent RCU readers never observe a torn pointer; its reference
 * is dropped afterwards.
 */
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	if (dst) {
		/* One-shot routes must not outlive their intended use. */
		if (dst->flags & DST_NOCACHE)
			dst = NULL;
		else
			dst_clone(dst);
	}
	/* Atomic publish: readers see either the old or the new pointer. */
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
}
85 
86 static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
87 {
88 	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
89 }
90 
91 static void tunnel_dst_reset(struct ip_tunnel *t)
92 {
93 	tunnel_dst_set(t, NULL);
94 }
95 
96 void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
97 {
98 	int i;
99 
100 	for_each_possible_cpu(i)
101 		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
102 }
103 EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
104 
/* Fetch the route cached on the current CPU, if still valid.
 *
 * Returns the cached rtable with an extra reference held, or NULL when
 * nothing is cached or the cached entry is stale (in which case the
 * per-cpu slot is also cleared).  @cookie is passed to the dst's
 * ->check() op to validate obsolete entries.
 */
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
{
	struct dst_entry *dst;

	rcu_read_lock();
	dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
	if (dst) {
		if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
			/* Stale: drop the rcu section before resetting,
			 * since tunnel_dst_reset() releases a reference.
			 */
			rcu_read_unlock();
			tunnel_dst_reset(t);
			return NULL;
		}
		/* NOTE(review): dst_hold() here can race with a concurrent
		 * __tunnel_dst_set() dropping the cache's reference; later
		 * kernels take this reference more defensively — confirm
		 * whether that hardening is needed on this baseline.
		 */
		dst_hold(dst);
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}
122 
123 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
124 				__be16 flags, __be32 key)
125 {
126 	if (p->i_flags & TUNNEL_KEY) {
127 		if (flags & TUNNEL_KEY)
128 			return key == p->i_key;
129 		else
130 			/* key expected, none present */
131 			return false;
132 	} else
133 		return !(flags & TUNNEL_KEY);
134 }
135 
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched to a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the input packet.
*/
147 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
148 				   int link, __be16 flags,
149 				   __be32 remote, __be32 local,
150 				   __be32 key)
151 {
152 	unsigned int hash;
153 	struct ip_tunnel *t, *cand = NULL;
154 	struct hlist_head *head;
155 
156 	hash = ip_tunnel_hash(key, remote);
157 	head = &itn->tunnels[hash];
158 
159 	hlist_for_each_entry_rcu(t, head, hash_node) {
160 		if (local != t->parms.iph.saddr ||
161 		    remote != t->parms.iph.daddr ||
162 		    !(t->dev->flags & IFF_UP))
163 			continue;
164 
165 		if (!ip_tunnel_key_match(&t->parms, flags, key))
166 			continue;
167 
168 		if (t->parms.link == link)
169 			return t;
170 		else
171 			cand = t;
172 	}
173 
174 	hlist_for_each_entry_rcu(t, head, hash_node) {
175 		if (remote != t->parms.iph.daddr ||
176 		    !(t->dev->flags & IFF_UP))
177 			continue;
178 
179 		if (!ip_tunnel_key_match(&t->parms, flags, key))
180 			continue;
181 
182 		if (t->parms.link == link)
183 			return t;
184 		else if (!cand)
185 			cand = t;
186 	}
187 
188 	hash = ip_tunnel_hash(key, 0);
189 	head = &itn->tunnels[hash];
190 
191 	hlist_for_each_entry_rcu(t, head, hash_node) {
192 		if ((local != t->parms.iph.saddr &&
193 		     (local != t->parms.iph.daddr ||
194 		      !ipv4_is_multicast(local))) ||
195 		    !(t->dev->flags & IFF_UP))
196 			continue;
197 
198 		if (!ip_tunnel_key_match(&t->parms, flags, key))
199 			continue;
200 
201 		if (t->parms.link == link)
202 			return t;
203 		else if (!cand)
204 			cand = t;
205 	}
206 
207 	if (flags & TUNNEL_NO_KEY)
208 		goto skip_key_lookup;
209 
210 	hlist_for_each_entry_rcu(t, head, hash_node) {
211 		if (t->parms.i_key != key ||
212 		    !(t->dev->flags & IFF_UP))
213 			continue;
214 
215 		if (t->parms.link == link)
216 			return t;
217 		else if (!cand)
218 			cand = t;
219 	}
220 
221 skip_key_lookup:
222 	if (cand)
223 		return cand;
224 
225 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
226 		return netdev_priv(itn->fb_tunnel_dev);
227 
228 
229 	return NULL;
230 }
231 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
232 
233 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
234 				    struct ip_tunnel_parm *parms)
235 {
236 	unsigned int h;
237 	__be32 remote;
238 	__be32 i_key = parms->i_key;
239 
240 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
241 		remote = parms->iph.daddr;
242 	else
243 		remote = 0;
244 
245 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
246 		i_key = 0;
247 
248 	h = ip_tunnel_hash(i_key, remote);
249 	return &itn->tunnels[h];
250 }
251 
252 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
253 {
254 	struct hlist_head *head = ip_bucket(itn, &t->parms);
255 
256 	hlist_add_head_rcu(&t->hash_node, head);
257 }
258 
259 static void ip_tunnel_del(struct ip_tunnel *t)
260 {
261 	hlist_del_init_rcu(&t->hash_node);
262 }
263 
264 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
265 					struct ip_tunnel_parm *parms,
266 					int type)
267 {
268 	__be32 remote = parms->iph.daddr;
269 	__be32 local = parms->iph.saddr;
270 	__be32 key = parms->i_key;
271 	int link = parms->link;
272 	struct ip_tunnel *t = NULL;
273 	struct hlist_head *head = ip_bucket(itn, parms);
274 
275 	hlist_for_each_entry_rcu(t, head, hash_node) {
276 		if (local == t->parms.iph.saddr &&
277 		    remote == t->parms.iph.daddr &&
278 		    key == t->parms.i_key &&
279 		    link == t->parms.link &&
280 		    type == t->dev->type)
281 			break;
282 	}
283 	return t;
284 }
285 
286 static struct net_device *__ip_tunnel_create(struct net *net,
287 					     const struct rtnl_link_ops *ops,
288 					     struct ip_tunnel_parm *parms)
289 {
290 	int err;
291 	struct ip_tunnel *tunnel;
292 	struct net_device *dev;
293 	char name[IFNAMSIZ];
294 
295 	if (parms->name[0])
296 		strlcpy(name, parms->name, IFNAMSIZ);
297 	else {
298 		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
299 			err = -E2BIG;
300 			goto failed;
301 		}
302 		strlcpy(name, ops->kind, IFNAMSIZ);
303 		strncat(name, "%d", 2);
304 	}
305 
306 	ASSERT_RTNL();
307 	dev = alloc_netdev(ops->priv_size, name, ops->setup);
308 	if (!dev) {
309 		err = -ENOMEM;
310 		goto failed;
311 	}
312 	dev_net_set(dev, net);
313 
314 	dev->rtnl_link_ops = ops;
315 
316 	tunnel = netdev_priv(dev);
317 	tunnel->parms = *parms;
318 	tunnel->net = net;
319 
320 	err = register_netdevice(dev);
321 	if (err)
322 		goto failed_free;
323 
324 	return dev;
325 
326 failed_free:
327 	free_netdev(dev);
328 failed:
329 	return ERR_PTR(err);
330 }
331 
332 static inline void init_tunnel_flow(struct flowi4 *fl4,
333 				    int proto,
334 				    __be32 daddr, __be32 saddr,
335 				    __be32 key, __u8 tos, int oif)
336 {
337 	memset(fl4, 0, sizeof(*fl4));
338 	fl4->flowi4_oif = oif;
339 	fl4->daddr = daddr;
340 	fl4->saddr = saddr;
341 	fl4->flowi4_tos = tos;
342 	fl4->flowi4_proto = proto;
343 	fl4->fl4_gre_key = key;
344 }
345 
/* Bind tunnel device @dev to an underlying device and compute its MTU.
 *
 * The underlay is guessed by routing toward the configured remote
 * address (caching the route on success), falling back to the device
 * named by parms.link.  needed_headroom is sized from the underlay plus
 * the tunnel header.  Returns the resulting MTU, clamped to at least 68
 * (the IPv4 minimum).
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			/* Warm the per-cpu dst cache while we are here. */
			tunnel_dst_set(tunnel, &rt->dst);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route found (or no remote): fall back to the bound link. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}
393 
394 static struct ip_tunnel *ip_tunnel_create(struct net *net,
395 					  struct ip_tunnel_net *itn,
396 					  struct ip_tunnel_parm *parms)
397 {
398 	struct ip_tunnel *nt, *fbt;
399 	struct net_device *dev;
400 
401 	BUG_ON(!itn->fb_tunnel_dev);
402 	fbt = netdev_priv(itn->fb_tunnel_dev);
403 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
404 	if (IS_ERR(dev))
405 		return ERR_CAST(dev);
406 
407 	dev->mtu = ip_tunnel_bind_dev(dev);
408 
409 	nt = netdev_priv(dev);
410 	ip_tunnel_add(itn, nt);
411 	return nt;
412 }
413 
/* Common receive path for decapsulated tunnel packets.
 *
 * Validates the checksum-present and sequence-number expectations from
 * the tunnel configuration against the parsed header info @tpi, undoes
 * ECN encapsulation, updates per-cpu rx stats, scrubs the skb when it
 * crosses netns, and hands the packet to GRO.  Always consumes @skb
 * (dropped packets are freed here) and returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* Checksum presence must match the tunnel's TUNNEL_CSUM config
	 * in both directions (present-but-unexpected is also an error).
	 */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* Enforce in-order delivery when TUNNEL_SEQ is configured; the
	 * (s32) cast makes the comparison robust to sequence wraparound.
	 */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	/* Propagate ECN from the outer header; err > 1 means the frame
	 * must be dropped, err == 1 is log-only.
	 */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Forget per-netns state when the packet changes namespaces. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
482 
/* Path-MTU handling on the tunnel transmit path.
 *
 * Computes the effective MTU toward the tunnel endpoint (honoring @df),
 * propagates it to the inner route, and, when the inner packet is too
 * big and may not be fragmented, sends the appropriate ICMP(v6)
 * too-big/frag-needed error back.  Returns 0 to continue transmission
 * or -E2BIG when the caller must drop the packet.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		/* DF set: usable MTU is the outer path MTU minus all
		 * encapsulation overhead.
		 */
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the lowered MTU on host routes (or when the
		 * tunnel has a fixed unicast endpoint).
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
530 
/* Common transmit path for IP-in-IP style tunnels.
 *
 * Resolves the outer destination (for NBMA tunnels with no configured
 * remote, it is derived from the inner packet's routing/neighbour
 * state), picks TOS/TTL/DF for the outer header, routes the packet
 * (using the per-cpu dst cache when the flow is "connected", i.e. fully
 * determined by tunnel config), enforces PMTU, and hands off to
 * iptunnel_xmit().  Consumes @skb on every path.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, const u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected = true;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel: no fixed remote, so take the outer
		 * destination from the inner packet's next hop.
		 */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only v4-compatible v6 addresses embed an IPv4
			 * address we can tunnel to.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		/* Destination varies per packet: skip the dst cache. */
		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		/* Low bit set means "inherit TOS from the inner packet". */
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst);
	}

	/* Routing back to ourselves would loop forever. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* Propagate recent ICMP link failures back to senders for a
	 * short window (IPTUNNEL_ERR_TIMEO) after they were observed.
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		/* TTL 0 means "inherit from the inner packet". */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len;
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
690 
/* Apply new parameters @p to an existing tunnel @t.
 *
 * The tunnel is unhashed before the address/key fields change and
 * rehashed afterwards, since those fields determine its bucket.  A
 * changed underlay link re-binds the device and optionally (when
 * @set_mtu) adopts the recomputed MTU.  Cached routes are invalidated
 * and userspace is notified of the device change.  Caller holds RTNL.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		/* Non-Ethernet tunnels expose the endpoints as the
		 * device's hardware/broadcast addresses.
		 */
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}
723 
/* Legacy ioctl interface for tunnel management.
 *
 * SIOCGETTUNNEL reads a tunnel's parameters (resolved via lookup when
 * issued on the fallback device); SIOCADD/SIOCCHGTUNNEL create or
 * update tunnels (CAP_NET_ADMIN required); SIOCDELTUNNEL removes a
 * tunnel, refusing to delete the per-netns fallback device.  Returns 0
 * or a negative errno.  Caller holds RTNL.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			/* On the fallback device the request names the
			 * tunnel via @p; fall back to the fallback
			 * device's own parms when none matches.
			 */
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A fixed TTL only makes sense with DF set (PMTU). */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* Ignore keys the caller did not actually enable. */
		if (!(p->i_flags&TUNNEL_KEY))
			p->i_key = 0;
		if (!(p->o_flags&TUNNEL_KEY))
			p->o_key = 0;

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (!t && (cmd == SIOCADDTUNNEL)) {
			t = ip_tunnel_create(net, itn, p);
			if (IS_ERR(t)) {
				err = PTR_ERR(t);
				break;
			}
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* New parms collide with another tunnel. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				/* The device's broadcast/p2p nature is
				 * fixed at creation; reject changes that
				 * would require flipping it.
				 */
				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			/* Never delete the fallback device itself. */
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
821 
822 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
823 {
824 	struct ip_tunnel *tunnel = netdev_priv(dev);
825 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
826 
827 	if (new_mtu < 68 ||
828 	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
829 		return -EINVAL;
830 	dev->mtu = new_mtu;
831 	return 0;
832 }
833 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
834 
835 static void ip_tunnel_dev_free(struct net_device *dev)
836 {
837 	struct ip_tunnel *tunnel = netdev_priv(dev);
838 
839 	gro_cells_destroy(&tunnel->gro_cells);
840 	free_percpu(tunnel->dst_cache);
841 	free_percpu(dev->tstats);
842 	free_netdev(dev);
843 }
844 
845 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
846 {
847 	struct ip_tunnel *tunnel = netdev_priv(dev);
848 	struct ip_tunnel_net *itn;
849 
850 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
851 
852 	if (itn->fb_tunnel_dev != dev) {
853 		ip_tunnel_del(netdev_priv(dev));
854 		unregister_netdevice_queue(dev, head);
855 	}
856 }
857 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
858 
859 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
860 				  struct rtnl_link_ops *ops, char *devname)
861 {
862 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
863 	struct ip_tunnel_parm parms;
864 	unsigned int i;
865 
866 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
867 		INIT_HLIST_HEAD(&itn->tunnels[i]);
868 
869 	if (!ops) {
870 		itn->fb_tunnel_dev = NULL;
871 		return 0;
872 	}
873 
874 	memset(&parms, 0, sizeof(parms));
875 	if (devname)
876 		strlcpy(parms.name, devname, IFNAMSIZ);
877 
878 	rtnl_lock();
879 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
880 	/* FB netdevice is special: we have one, and only one per netns.
881 	 * Allowing to move it to another netns is clearly unsafe.
882 	 */
883 	if (!IS_ERR(itn->fb_tunnel_dev)) {
884 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
885 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
886 	}
887 	rtnl_unlock();
888 
889 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
890 }
891 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
892 
/* Queue every tunnel device belonging to @itn for unregistration.
 *
 * Two passes: all devices of this link type living in @itn's own netns,
 * then hashed tunnels whose device was moved into a different netns
 * (which the first per-netns walk cannot see).  Caller holds RTNL and
 * later flushes @head via unregister_netdevice_many().
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
917 
918 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
919 {
920 	LIST_HEAD(list);
921 
922 	rtnl_lock();
923 	ip_tunnel_destroy(itn, &list, ops);
924 	unregister_netdevice_many(&list);
925 	rtnl_unlock();
926 }
927 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
928 
929 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
930 		      struct ip_tunnel_parm *p)
931 {
932 	struct ip_tunnel *nt;
933 	struct net *net = dev_net(dev);
934 	struct ip_tunnel_net *itn;
935 	int mtu;
936 	int err;
937 
938 	nt = netdev_priv(dev);
939 	itn = net_generic(net, nt->ip_tnl_net_id);
940 
941 	if (ip_tunnel_find(itn, p, dev->type))
942 		return -EEXIST;
943 
944 	nt->net = net;
945 	nt->parms = *p;
946 	err = register_netdevice(dev);
947 	if (err)
948 		goto out;
949 
950 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
951 		eth_hw_addr_random(dev);
952 
953 	mtu = ip_tunnel_bind_dev(dev);
954 	if (!tb[IFLA_MTU])
955 		dev->mtu = mtu;
956 
957 	ip_tunnel_add(itn, nt);
958 
959 out:
960 	return err;
961 }
962 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
963 
/* rtnl changelink handler: reconfigure an existing tunnel from @p.
 *
 * Refuses to change the fallback device.  When another tunnel already
 * matches the new parameters, the change is rejected with -EEXIST; for
 * non-Ethernet devices the broadcast/point-to-point nature (fixed at
 * creation) must not change.  MTU is recomputed unless IFLA_MTU was
 * given explicitly.  Returns 0 or a negative errno.
 */
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		/* Parameters collide with a different tunnel. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* Device mode is fixed at creation time. */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1001 
1002 int ip_tunnel_init(struct net_device *dev)
1003 {
1004 	struct ip_tunnel *tunnel = netdev_priv(dev);
1005 	struct iphdr *iph = &tunnel->parms.iph;
1006 	int err;
1007 
1008 	dev->destructor	= ip_tunnel_dev_free;
1009 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1010 	if (!dev->tstats)
1011 		return -ENOMEM;
1012 
1013 	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1014 	if (!tunnel->dst_cache) {
1015 		free_percpu(dev->tstats);
1016 		return -ENOMEM;
1017 	}
1018 
1019 	err = gro_cells_init(&tunnel->gro_cells, dev);
1020 	if (err) {
1021 		free_percpu(tunnel->dst_cache);
1022 		free_percpu(dev->tstats);
1023 		return err;
1024 	}
1025 
1026 	tunnel->dev = dev;
1027 	tunnel->net = dev_net(dev);
1028 	strcpy(tunnel->parms.name, dev->name);
1029 	iph->version		= 4;
1030 	iph->ihl		= 5;
1031 
1032 	return 0;
1033 }
1034 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1035 
1036 void ip_tunnel_uninit(struct net_device *dev)
1037 {
1038 	struct ip_tunnel *tunnel = netdev_priv(dev);
1039 	struct net *net = tunnel->net;
1040 	struct ip_tunnel_net *itn;
1041 
1042 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1043 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1044 	if (itn->fb_tunnel_dev != dev)
1045 		ip_tunnel_del(netdev_priv(dev));
1046 
1047 	ip_tunnel_dst_reset_all(tunnel);
1048 }
1049 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1050 
1051 /* Do least required initialization, rest of init is done in tunnel_init call */
1052 void ip_tunnel_setup(struct net_device *dev, int net_id)
1053 {
1054 	struct ip_tunnel *tunnel = netdev_priv(dev);
1055 	tunnel->ip_tnl_net_id = net_id;
1056 }
1057 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1058 
1059 MODULE_LICENSE("GPL");
1060