/* xref: /linux/net/ipv4/ip_tunnel.c (revision 634ec1fc7982efeeeeed4a7688b0004827b43a21) */
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/netdev_lock.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>
#include <net/inet_dscp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
		       IP_TNL_HASH_BITS);
}
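/*
 * Note that only the key and the remote address feed the hash, never the
 * local address.  This lets ip_tunnel_lookup() probe the same table again
 * with a zeroed remote for the wildcard passes described below.
 */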

static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p,
				const unsigned long *flags, __be32 key)
{
	if (!test_bit(IP_TUNNEL_KEY_BIT, flags))
		return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags);

	return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key;
}
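/*
 * @flags are the flags parsed out of the incoming packet (e.g. tpi->flags
 * from the GRE header), while p->i_flags is what the tunnel was configured
 * to expect; both sides must agree on the presence of a key.
 */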

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
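/*
 * The lookup below makes up to four passes over the hash table, from most
 * to least specific: (1) exact saddr+daddr match, (2) daddr only,
 * (3) saddr only or multicast daddr, (4) fully wildcarded.  Within each
 * pass an entry bound to the ingress link is returned immediately; the
 * first looser match is remembered as a candidate.
 */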
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, const unsigned long *flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

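	/* Pass 1: exact match on both the local and the remote address. */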
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		cand = t;
	}

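	/* Pass 2: match on the remote address only (wildcard source). */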
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

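	/* Pass 3: in the remote == 0 bucket, match tunnels bound only to our
	 * local address, or multicast destinations.
	 */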
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

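	/* Pass 4: fully wildcarded tunnels; only the key has to agree. */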
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) &&
		     t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm_kern *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) &&
	    test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
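/*
 * Add/del only publish or unpublish the entry; RCU readers in
 * ip_tunnel_lookup() may still walk an unlinked entry until a grace
 * period elapses, which the netdevice teardown path is responsible for
 * waiting out before the tunnel memory is freed.
 */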

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm_kern *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	IP_TUNNEL_DECLARE_FLAGS(flags);
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	ip_tunnel_flags_copy(flags, parms->i_flags);

	hlist_for_each_entry_rcu(t, head, hash_node, lockdep_rtnl_is_held()) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == READ_ONCE(t->parms.link) &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm_kern *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strscpy(name, parms->name);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strscpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    iph->tos & INET_DSCP_MASK, tunnel->net,
				    tunnel->parms.link, tunnel->fwmark, 0, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
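/*
 * Worked example (illustrative): a plain keyless GRE tunnel has
 * tunnel->hlen == 4, so t_hlen == 4 + 20 == 24.  Over an underlay device
 * with a 1500-byte MTU this yields the classic GRE tunnel MTU of
 * 1500 - 24 == 1476.
 */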

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm_kern *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	if (dev->type == ARPHRD_ETHER)
		dev->max_mtu -= dev->hard_header_len;

	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
{
	const struct iphdr *iph = ip_hdr(skb);
	const struct udphdr *udph;

	if (iph->protocol != IPPROTO_UDP)
		return;

	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
	info->encap.sport = udph->source;
	info->encap.dport = udph->dest;
}
EXPORT_SYMBOL(ip_tunnel_md_udp_encap);

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int nh, err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		DEV_STATS_INC(tunnel->dev, multicast);
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
	    test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}

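	/* If in-order delivery is required, drop stale packets.  The signed
	 * 32-bit subtraction makes the comparison robust against sequence
	 * number wraparound (serial number arithmetic).
	 */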
	if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
		if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	/* Save offset of outer header relative to skb->head,
	 * because we are going to reset the network header to the inner header
	 * and might change skb->head.
	 */
	nh = skb_network_header(skb) - skb->head;

	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);

	if (!pskb_inet_may_pull(skb)) {
		DEV_STATS_INC(tunnel->dev, rx_length_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}
	iph = (struct iphdr *)(skb->head + nh);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &iph->saddr, iph->tos);
		if (err > 1) {
			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

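	/* Lock-free registration: the cmpxchg() claims slot @num only if it
	 * is still empty, so two concurrent callers cannot both win.
	 */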
	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

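	/* Wait out an RCU grace period so receive paths that may still be
	 * using the old ops have finished before the caller frees them.
	 */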
	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
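/*
 * Example (illustrative): with FOU encapsulation ip_encap_hlen() returns
 * the 8-byte UDP header, so a GRE tunnel (tun_hlen == 4) ends up with
 * t->hlen == 12, and every transmitted packet reserves that much extra
 * headroom on top of the outer IPv4 header.
 */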

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			   struct rtable *rt, __be16 df,
			   const struct iphdr *inner_iph,
			   int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
		    mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
		    mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
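/*
 * Example (illustrative): a 1500-byte DF-flagged IPv4 packet entering a
 * GRE tunnel (tunnel_hlen == 4) whose underlay route has a 1500-byte MTU
 * sees mtu == 1500 - (20 + 4) == 1476 < pkt_size, so the packet is
 * rejected with an ICMP "fragmentation needed" advertising 1476.
 */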

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
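	/* A TOS of 1 is the "inherit" sentinel: copy the DSCP/ECN field
	 * from the inner IPv4 or IPv6 header instead.
	 */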
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id),
			    tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark,
			    skb_get_hash(skb), key->flow_flags);

	if (!tunnel_hlen)
		tunnel_hlen = ip_encap_hlen(&tun_info->encap);

	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (skb_cow_head(skb, headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}

	ip_tunnel_adj_headroom(dev, headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)), 0);
	return;
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	goto kfree;
tx_dropped:
	DEV_STATS_INC(dev, tx_dropped);
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;	/* Route to the other host */
	__be16 payload_protocol;
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);
	payload_protocol = skb_protocol(skb, true);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel: no destination is configured, so it has to
		 * be recovered per packet, from tunnel metadata or from the
		 * inner packet's route.
		 */

		if (!skb_dst(skb)) {
			DEV_STATS_INC(dev, tx_fifo_errors);
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (payload_protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

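	/* The low bit of the configured TOS means "inherit": take the
	 * DSCP/ECN field from the inner packet, and disable the per-tunnel
	 * route cache since the field can differ per packet.
	 */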
	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (payload_protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (payload_protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, tos & INET_DSCP_MASK,
			    tunnel->net, READ_ONCE(tunnel->parms.link),
			    tunnel->fwmark, skb_get_hash(skb), 0);

	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						   &fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else {
			tunnel->err_count = 0;
		}
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (payload_protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
		       + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);

	if (skb_cow_head(skb, max_headroom)) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, tx_dropped);
		kfree_skb(skb);
		return;
	}

	ip_tunnel_adj_headroom(dev, max_headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)), 0);
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm_kern *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		__dev_addr_set(dev, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		WRITE_ONCE(t->parms.link, p->link);
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			WRITE_ONCE(dev->mtu, mtu);
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
		  int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) {
			if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags))
				p->i_key = 0;
			if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags ^ nflags) &
				    (IFF_POINTOPOINT | IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
			      const void __user *data)
{
	struct ip_tunnel_parm p;

	if (copy_from_user(&p, data, sizeof(p)))
		return false;

	strscpy(kp->name, p.name);
	kp->link = p.link;
	ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags);
	ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags);
	kp->i_key = p.i_key;
	kp->o_key = p.o_key;
	memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph)));

	return true;
}
EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user);

bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp)
{
	struct ip_tunnel_parm p;

	if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) ||
	    !ip_tunnel_flags_is_be16_compat(kp->o_flags))
		return false;

	memset(&p, 0, sizeof(p));

	strscpy(p.name, kp->name);
	p.link = kp->link;
	p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags);
	p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags);
	p.i_key = kp->i_key;
	p.o_key = kp->o_key;
	memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph)));

	return !copy_to_user(data, &p, sizeof(p));
}
EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user);

int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
			     void __user *data, int cmd)
{
	struct ip_tunnel_parm_kern p;
	int err;

	if (!ip_tunnel_parm_from_user(&p, data))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && !ip_tunnel_parm_to_user(data, &p))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (dev->type == ARPHRD_ETHER)
		max_mtu -= dev->hard_header_len;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	WRITE_ONCE(dev->mtu, new_mtu);
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return READ_ONCE(tunnel->net);
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	const struct ip_tunnel *tunnel = netdev_priv(dev);

	return READ_ONCE(tunnel->parms.link);
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm_kern parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strscpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* The FB netdevice is special: there is one, and only one, per
	 * netns.  Allowing it to be moved to another netns would clearly
	 * be unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->netns_immutable = true;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

void ip_tunnel_delete_net(struct net *net, unsigned int id,
			  struct rtnl_link_ops *ops,
			  struct list_head *head)
{
	struct ip_tunnel_net *itn = net_generic(net, id);
	struct net_device *dev, *aux;
	int h;

	ASSERT_RTNL_NET(net);

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net *net, struct net_device *dev,
		      struct nlattr *tb[], struct ip_tunnel_parm_kern *p,
		      __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		if (dev->type == ARPHRD_ETHER)
			max -= dev->hard_header_len;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm_kern *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err)
		return err;

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		return err;
	}

	tunnel->dev = dev;
	strscpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	netdev_lockdep_set_classes(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the minimum required initialization here; the rest is done in the
 * tunnel_init callback.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_DESCRIPTION("IPv4 tunnel implementation library");
MODULE_LICENSE("GPL");