// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/ipv4/ip_tunnel.c
 *
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/netdev_lock.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>
#include <net/inet_dscp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

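/* Hash a tunnel into one of the IP_TNL_HASH_SIZE buckets, keyed on the
 * (key, remote address) pair so that receive lookups only need to scan
 * tunnels that could plausibly match the incoming packet.
 */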
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

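/* Match the key of a parsed packet against a tunnel's configuration: a
 * packet carrying a key matches only a tunnel configured with that same
 * key, and a keyless packet matches only a keyless tunnel.
 */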
static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p,
				const unsigned long *flags, __be32 key)
{
	if (!test_bit(IP_TUNNEL_KEY_BIT, flags))
		return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags);

	return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key;
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against configured keyless tunnels,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, const unsigned long *flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) &&
		     t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

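/* Map a tunnel's configuration to its hash bucket.  A multicast or unset
 * destination hashes with remote == 0, and a VTI tunnel without the KEY
 * flag hashes with i_key == 0, mirroring how such packets are looked up.
 */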
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm_kern *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) &&
	    test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

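/* Insertion into and removal from the per-netns hash table happen under
 * RTNL, while lookups walk the chains under RCU, hence the _rcu list
 * primitives.  A collect_md tunnel is additionally published through the
 * itn->collect_md_tun shortcut used by ip_tunnel_lookup().
 */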
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

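/* Configuration-time lookup: find an existing tunnel whose addresses,
 * key, link and device type exactly match @parms, or NULL if none does.
 */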
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm_kern *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	IP_TUNNEL_DECLARE_FLAGS(flags);
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	ip_tunnel_flags_copy(flags, parms->i_flags);

	hlist_for_each_entry_rcu(t, head, hash_node, lockdep_rtnl_is_held()) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == READ_ONCE(t->parms.link) &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm_kern *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strscpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

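/* Guess the underlying output device from the tunnel endpoints, size
 * needed_headroom so the outer headers fit in front of that device's own
 * headers, and return the MTU the tunnel device should use (never less
 * than IPV4_MIN_MTU).
 */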
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    iph->tos & INET_DSCP_MASK, tunnel->net,
				    tunnel->parms.link, tunnel->fwmark, 0, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm_kern *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	if (dev->type == ARPHRD_ETHER)
		dev->max_mtu -= dev->hard_header_len;

	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

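/* If the outer IPv4 header of a metadata tunnel packet carries UDP
 * encapsulation, record the outer source and destination ports in the
 * tunnel metadata.
 */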
void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
{
	const struct iphdr *iph = ip_hdr(skb);
	const struct udphdr *udph;

	if (iph->protocol != IPPROTO_UDP)
		return;

	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
	info->encap.sport = udph->source;
	info->encap.dport = udph->dest;
}
EXPORT_SYMBOL(ip_tunnel_md_udp_encap);

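/* Receive path shared by the IPv4 tunnel drivers: check the checksum and
 * sequence-number expectations against the tunnel's flags, reset the
 * network header to the inner packet, decapsulate ECN, update the stats
 * and hand the skb to GRO.  Consumes the skb (and @tun_dst on the drop
 * path) and always returns 0.
 */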
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int nh, err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		DEV_STATS_INC(tunnel->dev, multicast);
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
	    test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}

	if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
		if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	/* Save offset of outer header relative to skb->head,
	 * because we are going to reset the network header to the inner header
	 * and might change skb->head.
	 */
	nh = skb_network_header(skb) - skb->head;

	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);

	if (!pskb_inet_may_pull(skb)) {
		DEV_STATS_INC(tunnel->dev, rx_length_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}
	iph = (struct iphdr *)(skb->head + nh);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

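/* Register or unregister a tunnel encapsulation handler at a fixed slot
 * of iptun_encaps[].  The cmpxchg() makes registration race-free without
 * a lock: adding succeeds only if the slot is currently empty, removing
 * only if it still holds @ops.  A minimal sketch of a caller (hypothetical
 * module; the ops table is assumed to be defined elsewhere):
 *
 *	static const struct ip_tunnel_encap_ops my_encap_ops = { ... };
 *
 *	err = ip_tunnel_encap_add_ops(&my_encap_ops, TUNNEL_ENCAP_FOU);
 *	...
 *	ip_tunnel_encap_del_ops(&my_encap_ops, TUNNEL_ENCAP_FOU);
 */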
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

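/* Recompute the path MTU seen by this packet once the outer headers are
 * added, propagate it to the inner route and, for non-GSO packets that do
 * not fit, send ICMP "fragmentation needed" (IPv4) or "packet too big"
 * (IPv6) back to the sender and return -E2BIG.
 */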
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
{
	/* We must cap headroom to some upper limit, otherwise pskb_expand_head()
	 * will overflow the header offsets in skb_headers_offset_update().
	 */
	static const unsigned int max_allowed = 512;

	if (headroom > max_allowed)
		headroom = max_allowed;

	if (headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, headroom);
}

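/* Transmit path for metadata-based (collect_md) tunnels: every parameter
 * of the outer header (endpoints, key, TOS, TTL, DF) comes from the
 * per-skb tunnel metadata rather than from the device configuration.
 */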
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id),
			    tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark,
			    skb_get_hash(skb), key->flow_flags);

	if (!tunnel_hlen)
		tunnel_hlen = ip_encap_hlen(&tun_info->encap);

	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (skb_cow_head(skb, headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}

	ip_tunnel_adj_headroom(dev, headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	goto kfree;
tx_dropped:
	DEV_STATS_INC(dev, tx_dropped);
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

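/* Transmit path for classically configured tunnels.  The outer header is
 * built from @tnl_params (normally the device's parms.iph); an all-zero
 * destination selects NBMA mode, where the endpoint is recovered from the
 * tunnel metadata, the inner route or the neighbour entry.  The per-tunnel
 * dst cache is used only while the flow is "connected", i.e. fully
 * determined by the device configuration.
 */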
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;		/* Route to the other host */
	__be16 payload_protocol;
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);
	payload_protocol = skb_protocol(skb, true);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			DEV_STATS_INC(dev, tx_fifo_errors);
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (payload_protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0) {
				do_tx_error_icmp = true;
			} else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (payload_protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (payload_protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, tos & INET_DSCP_MASK,
			    tunnel->net, READ_ONCE(tunnel->parms.link),
			    tunnel->fwmark, skb_get_hash(skb), 0);

	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						&fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else {
			tunnel->err_count = 0;
		}
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (payload_protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);

	if (skb_cow_head(skb, max_headroom)) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, tx_dropped);
		kfree_skb(skb);
		return;
	}

	ip_tunnel_adj_headroom(dev, max_headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

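/* Apply a new configuration to an existing tunnel: re-hash it under the
 * new parameters, refresh the device addresses and, if the underlay link
 * or fwmark changed, re-bind the device and optionally reset its MTU.
 */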
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm_kern *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		__dev_addr_set(dev, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		WRITE_ONCE(t->parms.link, p->link);
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			WRITE_ONCE(dev->mtu, mtu);
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

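/* Back end for the SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/SIOCDELTUNNEL
 * ioctls.  Add, change and delete require CAP_NET_ADMIN in the tunnel's
 * user namespace; requests on the fallback device are redirected to the
 * tunnel matching @p, and the fallback device itself cannot be deleted.
 */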
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
		  int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) {
			if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags))
				p->i_key = 0;
			if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags ^ nflags) &
				    (IFF_POINTOPOINT | IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

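/* Translate the legacy UAPI struct ip_tunnel_parm, as used by the
 * SIOC*TUNNEL ioctls, to and from the kernel-internal struct
 * ip_tunnel_parm_kern, whose tunnel flags are a bitmap wider than the
 * 16-bit UAPI flags.  A rough sketch of the userspace side of this ABI
 * (hypothetical caller, error handling omitted), which ends up in
 * ip_tunnel_siocdevprivate() below:
 *
 *	struct ip_tunnel_parm p = { 0 };
 *	struct ifreq ifr;
 *
 *	p.iph.version = 4;
 *	p.iph.ihl = 5;
 *	p.iph.protocol = IPPROTO_GRE;
 *	strncpy(ifr.ifr_name, "gre0", IFNAMSIZ);
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	ioctl(sockfd, SIOCADDTUNNEL, &ifr);
 */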
bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
			      const void __user *data)
{
	struct ip_tunnel_parm p;

	if (copy_from_user(&p, data, sizeof(p)))
		return false;

	strscpy(kp->name, p.name);
	kp->link = p.link;
	ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags);
	ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags);
	kp->i_key = p.i_key;
	kp->o_key = p.o_key;
	memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph)));

	return true;
}
EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user);

bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp)
{
	struct ip_tunnel_parm p;

	if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) ||
	    !ip_tunnel_flags_is_be16_compat(kp->o_flags))
		return false;

	memset(&p, 0, sizeof(p));

	strscpy(p.name, kp->name);
	p.link = kp->link;
	p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags);
	p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags);
	p.i_key = kp->i_key;
	p.o_key = kp->o_key;
	memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph)));

	return !copy_to_user(data, &p, sizeof(p));
}
EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user);

int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
			     void __user *data, int cmd)
{
	struct ip_tunnel_parm_kern p;
	int err;

	if (!ip_tunnel_parm_from_user(&p, data))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && !ip_tunnel_parm_to_user(data, &p))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);

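/* Change the device MTU, bounding it by what still fits in IP_MAX_MTU
 * once the tunnel headers are added.  With @strict false, an oversized
 * request is clamped to the maximum instead of rejected with -EINVAL.
 */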
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (dev->type == ARPHRD_ETHER)
		max_mtu -= dev->hard_header_len;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	WRITE_ONCE(dev->mtu, new_mtu);
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return READ_ONCE(tunnel->net);
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	const struct ip_tunnel *tunnel = netdev_priv(dev);

	return READ_ONCE(tunnel->parms.link);
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

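/* Per-netns initialization shared by the tunnel drivers: set up the hash
 * table and, unless fallback tunnels are disabled for this namespace,
 * create the fallback device (e.g. "gre0" or "tunl0", per @devname).
 */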
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm_kern parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strscpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* The FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to move to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->netns_immutable = true;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops,
			   struct list_head *dev_to_kill)
{
	struct ip_tunnel_net *itn;
	struct net *net;

	ASSERT_RTNL();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, dev_to_kill, ops);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

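/* Netlink (RTM_NEWLINK) tunnel creation.  Rejects a duplicate of an
 * existing tunnel (or a second collect_md tunnel in the netns), registers
 * the device, binds it to the underlay and sets the MTU, honouring a
 * supplied IFLA_MTU attribute within the valid range.
 */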
int ip_tunnel_newlink(struct net *net, struct net_device *dev,
		      struct nlattr *tb[], struct ip_tunnel_parm_kern *p,
		      __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		if (dev->type == ARPHRD_ETHER)
			max -= dev->hard_header_len;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm_kern *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

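/* ndo_init-time initialization common to the tunnel drivers: per-CPU
 * stats, the dst cache and GRO cells, plus the version/IHL template of
 * the outer IPv4 header.
 */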
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err)
		return err;

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		return err;
	}

	tunnel->dev = dev;
	strscpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	netdev_lockdep_set_classes(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization here; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_DESCRIPTION("IPv4 tunnel implementation library");
MODULE_LICENSE("GPL");