/* xref: /linux/net/ipv4/ip_tunnel.c (revision 3f2fb9a834cb1fcddbae22deca7fde136944dc89) */
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43 
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 
59 #if IS_ENABLED(CONFIG_IPV6)
60 #include <net/ipv6.h>
61 #include <net/ip6_fib.h>
62 #include <net/ip6_route.h>
63 #endif
64 
65 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
66 {
67 	return hash_32((__force u32)key ^ (__force u32)remote,
68 			 IP_TNL_HASH_BITS);
69 }
70 
71 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
72 				__be16 flags, __be32 key)
73 {
74 	if (p->i_flags & TUNNEL_KEY) {
75 		if (flags & TUNNEL_KEY)
76 			return key == p->i_key;
77 		else
78 			/* key expected, none present */
79 			return false;
80 	} else
81 		return !(flags & TUNNEL_KEY);
82 }
83 
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require exact key match i.e. if a key is present in packet
   it will match only tunnel with the same key; if it is not present,
   it will match only keyless tunnel.

   All keysless packets, if not matched configured keyless tunnels
   will match fallback tunnel.
   Given src, dst and key, find appropriate for input tunnel.

   Lookup order (most to least specific): exact (remote, local) match,
   then remote with wildcard source, then local-only / multicast, then
   key-only wildcard; a tunnel bound to the ingress link always wins
   over an otherwise-equal candidate on another link.  Runs under RCU.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	/* Pass 1: exact (remote, local) endpoint match. */
	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		/* A link-bound match is final; otherwise keep it as a
		 * candidate and continue looking for a better one.
		 */
		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: remote matches, tunnel source is a wildcard (zero). */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Pass 3: wildcard-remote bucket — match on local address (or a
	 * multicast destination addressed to us).
	 */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	/* Pass 4: key-only match — both endpoints are wildcards. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	/* A collect_md tunnel (one per netns) accepts anything. */
	t = rcu_dereference(itn->collect_md_tun);
	if (t)
		return t;

	/* Last resort: the per-netns fallback device, if it is up. */
	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
187 
188 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
189 				    struct ip_tunnel_parm *parms)
190 {
191 	unsigned int h;
192 	__be32 remote;
193 	__be32 i_key = parms->i_key;
194 
195 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
196 		remote = parms->iph.daddr;
197 	else
198 		remote = 0;
199 
200 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
201 		i_key = 0;
202 
203 	h = ip_tunnel_hash(i_key, remote);
204 	return &itn->tunnels[h];
205 }
206 
/* Publish tunnel @t for RCU lookup: hash it into its bucket and, for
 * collect_md tunnels, install it as the per-netns metadata tunnel.
 * NOTE(review): assumes RTNL serializes writers — confirm against callers.
 */
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}
215 
/* Unpublish tunnel @t: remove it from its hash bucket and clear the
 * per-netns collect_md slot if it held it.  RCU readers may still see
 * the tunnel until a grace period elapses.
 */
static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
222 
223 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
224 					struct ip_tunnel_parm *parms,
225 					int type)
226 {
227 	__be32 remote = parms->iph.daddr;
228 	__be32 local = parms->iph.saddr;
229 	__be32 key = parms->i_key;
230 	__be16 flags = parms->i_flags;
231 	int link = parms->link;
232 	struct ip_tunnel *t = NULL;
233 	struct hlist_head *head = ip_bucket(itn, parms);
234 
235 	hlist_for_each_entry_rcu(t, head, hash_node) {
236 		if (local == t->parms.iph.saddr &&
237 		    remote == t->parms.iph.daddr &&
238 		    link == t->parms.link &&
239 		    type == t->dev->type &&
240 		    ip_tunnel_key_match(&t->parms, flags, key))
241 			break;
242 	}
243 	return t;
244 }
245 
246 static struct net_device *__ip_tunnel_create(struct net *net,
247 					     const struct rtnl_link_ops *ops,
248 					     struct ip_tunnel_parm *parms)
249 {
250 	int err;
251 	struct ip_tunnel *tunnel;
252 	struct net_device *dev;
253 	char name[IFNAMSIZ];
254 
255 	if (parms->name[0])
256 		strlcpy(name, parms->name, IFNAMSIZ);
257 	else {
258 		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
259 			err = -E2BIG;
260 			goto failed;
261 		}
262 		strlcpy(name, ops->kind, IFNAMSIZ);
263 		strncat(name, "%d", 2);
264 	}
265 
266 	ASSERT_RTNL();
267 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
268 	if (!dev) {
269 		err = -ENOMEM;
270 		goto failed;
271 	}
272 	dev_net_set(dev, net);
273 
274 	dev->rtnl_link_ops = ops;
275 
276 	tunnel = netdev_priv(dev);
277 	tunnel->parms = *parms;
278 	tunnel->net = net;
279 
280 	err = register_netdevice(dev);
281 	if (err)
282 		goto failed_free;
283 
284 	return dev;
285 
286 failed_free:
287 	free_netdev(dev);
288 failed:
289 	return ERR_PTR(err);
290 }
291 
292 static inline void init_tunnel_flow(struct flowi4 *fl4,
293 				    int proto,
294 				    __be32 daddr, __be32 saddr,
295 				    __be32 key, __u8 tos, int oif)
296 {
297 	memset(fl4, 0, sizeof(*fl4));
298 	fl4->flowi4_oif = oif;
299 	fl4->daddr = daddr;
300 	fl4->saddr = saddr;
301 	fl4->flowi4_tos = tos;
302 	fl4->flowi4_proto = proto;
303 	fl4->fl4_gre_key = key;
304 }
305 
/* Derive a sensible MTU and needed_headroom for @dev by probing the
 * route toward the tunnel destination (or falling back to the bound
 * link).  Also primes the tunnel's dst cache as a side effect.  Returns
 * the MTU to use, never less than 68 (the minimum IPv4 MTU).
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			/* Cache the route while we hold it. */
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No usable route: fall back to the explicitly bound link, if any. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	/* Reserve space for underlay headers plus our own tunnel header. */
	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}
353 
354 static struct ip_tunnel *ip_tunnel_create(struct net *net,
355 					  struct ip_tunnel_net *itn,
356 					  struct ip_tunnel_parm *parms)
357 {
358 	struct ip_tunnel *nt;
359 	struct net_device *dev;
360 
361 	BUG_ON(!itn->fb_tunnel_dev);
362 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
363 	if (IS_ERR(dev))
364 		return ERR_CAST(dev);
365 
366 	dev->mtu = ip_tunnel_bind_dev(dev);
367 
368 	nt = netdev_priv(dev);
369 	ip_tunnel_add(itn, nt);
370 	return nt;
371 }
372 
/* Common receive path for IPv4 tunnels: enforce checksum/sequence
 * expectations, undo ECN encapsulation, account per-CPU stats and hand
 * the inner packet to GRO.  Consumes @skb on every path; always
 * returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* Packet and tunnel must agree on whether a checksum is present. */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* With sequencing enabled, drop packets lacking a sequence number
	 * or arriving out of order (signed diff handles wraparound).
	 */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	/* err > 1 means an invalid ECN combination: drop the frame. */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Scrub skb state when the packet crosses a netns boundary. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	/* Attach collected tunnel metadata (if any) for upper layers. */
	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
445 
446 static int ip_encap_hlen(struct ip_tunnel_encap *e)
447 {
448 	const struct ip_tunnel_encap_ops *ops;
449 	int hlen = -EINVAL;
450 
451 	if (e->type == TUNNEL_ENCAP_NONE)
452 		return 0;
453 
454 	if (e->type >= MAX_IPTUN_ENCAP_OPS)
455 		return -EINVAL;
456 
457 	rcu_read_lock();
458 	ops = rcu_dereference(iptun_encaps[e->type]);
459 	if (likely(ops && ops->encap_hlen))
460 		hlen = ops->encap_hlen(e);
461 	rcu_read_unlock();
462 
463 	return hlen;
464 }
465 
/* Registered encapsulation handlers, indexed by TUNNEL_ENCAP_* type;
 * written via cmpxchg in the add/del helpers, read under RCU.
 */
const struct ip_tunnel_encap_ops __rcu *
		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
468 
469 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
470 			    unsigned int num)
471 {
472 	if (num >= MAX_IPTUN_ENCAP_OPS)
473 		return -ERANGE;
474 
475 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
476 			&iptun_encaps[num],
477 			NULL, ops) ? 0 : -1;
478 }
479 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
480 
481 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
482 			    unsigned int num)
483 {
484 	int ret;
485 
486 	if (num >= MAX_IPTUN_ENCAP_OPS)
487 		return -ERANGE;
488 
489 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
490 		       &iptun_encaps[num],
491 		       ops, NULL) == ops) ? 0 : -1;
492 
493 	synchronize_net();
494 
495 	return ret;
496 }
497 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
498 
499 int ip_tunnel_encap_setup(struct ip_tunnel *t,
500 			  struct ip_tunnel_encap *ipencap)
501 {
502 	int hlen;
503 
504 	memset(&t->encap, 0, sizeof(t->encap));
505 
506 	hlen = ip_encap_hlen(ipencap);
507 	if (hlen < 0)
508 		return hlen;
509 
510 	t->encap.type = ipencap->type;
511 	t->encap.sport = ipencap->sport;
512 	t->encap.dport = ipencap->dport;
513 	t->encap.flags = ipencap->flags;
514 
515 	t->encap_hlen = hlen;
516 	t->hlen = t->encap_hlen + t->tun_hlen;
517 
518 	return 0;
519 }
520 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
521 
522 int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
523 		    u8 *protocol, struct flowi4 *fl4)
524 {
525 	const struct ip_tunnel_encap_ops *ops;
526 	int ret = -EINVAL;
527 
528 	if (t->encap.type == TUNNEL_ENCAP_NONE)
529 		return 0;
530 
531 	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
532 		return -EINVAL;
533 
534 	rcu_read_lock();
535 	ops = rcu_dereference(iptun_encaps[t->encap.type]);
536 	if (likely(ops && ops->build_header))
537 		ret = ops->build_header(skb, &t->encap, protocol, fl4);
538 	rcu_read_unlock();
539 
540 	return ret;
541 }
542 EXPORT_SYMBOL(ip_tunnel_encap);
543 
/* Propagate path-MTU information from the tunnel route to the inner
 * flow.  Returns -E2BIG after emitting the appropriate "packet too
 * big" ICMP/ICMPv6 toward the sender when the packet does not fit,
 * 0 otherwise.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	/* With DF, the usable MTU is the route MTU minus all tunnel
	 * overhead; without it, use the inner dst's MTU (or the device's).
	 */
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		/* Inner DF set and the packet is too large: bounce it. */
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the reduced MTU on host routes, or whenever the
		 * tunnel has a fixed unicast destination.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
593 
/* Common transmit path for IPv4 tunnels: resolve the outer destination
 * (including NBMA tunnels with no fixed remote), pick TOS/TTL/DF from
 * the tunnel config or the inner packet, route, enforce PMTU and emit
 * the encapsulated packet.  Consumes @skb on every path.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;			/* eligible for the per-tunnel dst cache */

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel: derive the outer destination from the inner
		 * packet's routing state instead of tunnel config.
		 */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible IPv6 addresses embed a v4
			 * destination we can tunnel to.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		/* Destination varies per packet: never use the dst cache. */
		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		/* Low bit set means "inherit TOS from the inner packet". */
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	/* Routing back out of ourselves would loop forever. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* Recent ICMP errors on this tunnel: report link failure to the
	 * sender for a while (err_count budget within IPTUNNEL_ERR_TIMEO).
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		/* TTL 0 means "inherit" from the inner packet (or route). */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
757 
/* Apply new parameters @p to existing tunnel @t.  The tunnel must be
 * unhashed and re-added because saddr/daddr/i_key determine its hash
 * bucket.  NOTE(review): callers appear to run under RTNL (ioctl and
 * changelink paths) — confirm before adding new call sites.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		/* Non-Ethernet tunnels expose the endpoints as the device
		 * hardware and broadcast addresses.
		 */
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	/* A new underlying link changes headroom/MTU: rebind the device. */
	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	/* Any cached route may no longer match the new parameters. */
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
790 
/* Legacy SIOC{GET,ADD,CHG,DEL}TUNNEL ioctl backend shared by tunnel
 * drivers.  @p is the user-supplied parameter block (already copied
 * in); on SIOCGETTUNNEL it is filled with the tunnel's parameters.
 * Returns 0 or a negative errno.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device, look up the tunnel matching @p;
		 * otherwise report this device's own parameters.
		 */
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* For non-VTI tunnels, a key without TUNNEL_KEY is noise. */
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				/* @p matches a different tunnel: refuse. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				/* Changing this device's own parameters:
				 * the point-to-point/broadcast nature of
				 * the device must not change.
				 */
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			/* The fallback device itself may not be deleted. */
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
893 
894 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
895 {
896 	struct ip_tunnel *tunnel = netdev_priv(dev);
897 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
898 	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
899 
900 	if (new_mtu < 68)
901 		return -EINVAL;
902 
903 	if (new_mtu > max_mtu) {
904 		if (strict)
905 			return -EINVAL;
906 
907 		new_mtu = max_mtu;
908 	}
909 
910 	dev->mtu = new_mtu;
911 	return 0;
912 }
913 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
914 
/* ndo_change_mtu handler: strict variant that rejects out-of-range
 * values rather than clamping them.
 */
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
920 
/* Device destructor: tear down per-tunnel state in the reverse order
 * of ip_tunnel_init() and release the netdevice itself.
 */
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}
930 
931 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
932 {
933 	struct ip_tunnel *tunnel = netdev_priv(dev);
934 	struct ip_tunnel_net *itn;
935 
936 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
937 
938 	if (itn->fb_tunnel_dev != dev) {
939 		ip_tunnel_del(itn, netdev_priv(dev));
940 		unregister_netdevice_queue(dev, head);
941 	}
942 }
943 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
944 
945 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
946 {
947 	struct ip_tunnel *tunnel = netdev_priv(dev);
948 
949 	return tunnel->net;
950 }
951 EXPORT_SYMBOL(ip_tunnel_get_link_net);
952 
953 int ip_tunnel_get_iflink(const struct net_device *dev)
954 {
955 	struct ip_tunnel *tunnel = netdev_priv(dev);
956 
957 	return tunnel->parms.link;
958 }
959 EXPORT_SYMBOL(ip_tunnel_get_iflink);
960 
/* Per-netns init for a tunnel type: set up the hash table and, when
 * @ops is given, create the per-netns fallback device named @devname.
 * Returns 0 or a negative errno from fallback device creation.
 */
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	/* No rtnl ops means this tunnel type has no fallback device. */
	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
995 
/* Queue every device of this tunnel type for unregistration: all
 * matching devices in the fallback device's netns, plus tunnels hashed
 * here whose device lives in a different netns (created cross-netns).
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
1020 
/* Per-netns exit for a tunnel type: collect and unregister all of its
 * devices in one batch under RTNL.
 */
void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1031 
/* rtnl newlink backend: reject duplicates (only one collect_md tunnel
 * per netns; otherwise no tunnel with identical parameters), register
 * the device, bind it to pick an MTU (unless IFLA_MTU was given) and
 * hash it in.  Returns 0 or a negative errno.
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);
out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1070 
/* rtnl changelink backend: validate that @p does not collide with a
 * different tunnel and that the device's point-to-point/broadcast
 * nature is preserved, then apply the new parameters.  The fallback
 * device cannot be reconfigured this way.
 */
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		/* @p already belongs to another tunnel: refuse. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* The broadcast/pointopoint flavor is fixed at
			 * creation time and may not change here.
			 */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1108 
/* ndo_init handler shared by tunnel drivers: allocate per-CPU stats,
 * the dst cache and GRO cells (unwinding in reverse order on failure),
 * then seed the tunnel's IP header template.  Returns 0 or -errno.
 */
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	/* collect_md tunnels are netns-local and must keep their dst. */
	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
1146 
/* ndo_uninit handler: unhash the tunnel (except the fallback device,
 * which is unregistered at netns exit) and drop any cached route.
 */
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregisted in net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1161 
/* Do least required initialization, rest of init is done in tunnel_init call.
 * Records the pernet-generic id so later calls can find this tunnel
 * type's per-netns state.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1169 
1170 MODULE_LICENSE("GPL");
1171