xref: /linux/net/ipv4/ip_tunnel.c (revision 2697b79a469b68e3ad3640f55284359c1396278d)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2013 Nicira, Inc.
4  */
5 
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
16 #include <linux/in.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
30 
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
36 #include <net/arp.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
40 #include <net/xfrm.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
44 #include <net/udp.h>
45 #include <net/dst_metadata.h>
46 
47 #if IS_ENABLED(CONFIG_IPV6)
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52 
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
54 {
55 	return hash_32((__force u32)key ^ (__force u32)remote,
56 			 IP_TNL_HASH_BITS);
57 }
58 
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p,
60 				const unsigned long *flags, __be32 key)
61 {
62 	if (!test_bit(IP_TUNNEL_KEY_BIT, flags))
63 		return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags);
64 
65 	return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key;
66 }
67 
68 /* Fallback tunnel: no source, no destination, no key, no options
69 
70    Tunnel hash table:
71    We require exact key match i.e. if a key is present in packet
72    it will match only tunnel with the same key; if it is not present,
73    it will match only keyless tunnel.
74 
75    All keyless packets, if not matched by configured keyless tunnels,
76    will match the fallback tunnel.
77    Given src, dst and key, find the appropriate tunnel for input.
78 */
/* Find the tunnel that should handle a packet described by @link, @flags,
 * @remote, @local and @key within the per-netns table @itn.
 *
 * Four passes are made over the hash chains, from the most specific match
 * to the least specific one.  In every pass a tunnel whose parms.link equals
 * @link is returned immediately; a tunnel that matches everything except the
 * link is remembered in "cand".  If no exact-link match is found, "cand" is
 * returned.  Failing that, the collect_md tunnel and then the fallback
 * device are tried, each only if it is IFF_UP.  Returns NULL when nothing
 * matches.
 *
 * The chains are walked with the _rcu list helpers, so this presumably runs
 * inside an RCU read-side critical section -- confirm against callers.
 */
79 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
80 				   int link, const unsigned long *flags,
81 				   __be32 remote, __be32 local,
82 				   __be32 key)
83 {
84 	struct ip_tunnel *t, *cand = NULL;
85 	struct hlist_head *head;
86 	struct net_device *ndev;
87 	unsigned int hash;
88 
89 	hash = ip_tunnel_hash(key, remote);
90 	head = &itn->tunnels[hash];
91 
	/* Pass 1: both the local and the remote address match exactly. */
92 	hlist_for_each_entry_rcu(t, head, hash_node) {
93 		if (local != t->parms.iph.saddr ||
94 		    remote != t->parms.iph.daddr ||
95 		    !(t->dev->flags & IFF_UP))
96 			continue;
97 
98 		if (!ip_tunnel_key_match(&t->parms, flags, key))
99 			continue;
100 
101 		if (READ_ONCE(t->parms.link) == link)
102 			return t;
		/* Unlike the later passes, this one overwrites any earlier
		 * candidate, so the last match on another link wins here.
		 */
103 		cand = t;
104 	}
105 
	/* Pass 2: remote address matches, tunnel has no source address. */
106 	hlist_for_each_entry_rcu(t, head, hash_node) {
107 		if (remote != t->parms.iph.daddr ||
108 		    t->parms.iph.saddr != 0 ||
109 		    !(t->dev->flags & IFF_UP))
110 			continue;
111 
112 		if (!ip_tunnel_key_match(&t->parms, flags, key))
113 			continue;
114 
115 		if (READ_ONCE(t->parms.link) == link)
116 			return t;
117 		if (!cand)
118 			cand = t;
119 	}
120 
	/* The remaining passes use the chain hashed with a zero remote. */
121 	hash = ip_tunnel_hash(key, 0);
122 	head = &itn->tunnels[hash];
123 
	/* Pass 3: either the tunnel source equals @local and it has no
	 * destination, or the tunnel destination equals @local and @local
	 * is a multicast address.
	 */
124 	hlist_for_each_entry_rcu(t, head, hash_node) {
125 		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
126 		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
127 			continue;
128 
129 		if (!(t->dev->flags & IFF_UP))
130 			continue;
131 
132 		if (!ip_tunnel_key_match(&t->parms, flags, key))
133 			continue;
134 
135 		if (READ_ONCE(t->parms.link) == link)
136 			return t;
137 		if (!cand)
138 			cand = t;
139 	}
140 
	/* Pass 4: tunnels with neither source nor destination.  The key is
	 * compared directly, and only when IP_TUNNEL_NO_KEY_BIT is clear in
	 * @flags.
	 */
141 	hlist_for_each_entry_rcu(t, head, hash_node) {
142 		if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) &&
143 		     t->parms.i_key != key) ||
144 		    t->parms.iph.saddr != 0 ||
145 		    t->parms.iph.daddr != 0 ||
146 		    !(t->dev->flags & IFF_UP))
147 			continue;
148 
149 		if (READ_ONCE(t->parms.link) == link)
150 			return t;
151 		if (!cand)
152 			cand = t;
153 	}
154 
	/* No tunnel on the requested link: settle for a link mismatch. */
155 	if (cand)
156 		return cand;
157 
	/* Next choice: the collect_md tunnel, if one is set and up. */
158 	t = rcu_dereference(itn->collect_md_tun);
159 	if (t && t->dev->flags & IFF_UP)
160 		return t;
161 
	/* Last resort: the fallback device, if it exists and is up. */
162 	ndev = READ_ONCE(itn->fb_tunnel_dev);
163 	if (ndev && ndev->flags & IFF_UP)
164 		return netdev_priv(ndev);
165 
166 	return NULL;
167 }
168 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
169 
170 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
171 				    struct ip_tunnel_parm_kern *parms)
172 {
173 	unsigned int h;
174 	__be32 remote;
175 	__be32 i_key = parms->i_key;
176 
177 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
178 		remote = parms->iph.daddr;
179 	else
180 		remote = 0;
181 
182 	if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) &&
183 	    test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags))
184 		i_key = 0;
185 
186 	h = ip_tunnel_hash(i_key, remote);
187 	return &itn->tunnels[h];
188 }
189 
190 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
191 {
192 	struct hlist_head *head = ip_bucket(itn, &t->parms);
193 
194 	if (t->collect_md)
195 		rcu_assign_pointer(itn->collect_md_tun, t);
196 	hlist_add_head_rcu(&t->hash_node, head);
197 }
198 
199 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
200 {
201 	if (t->collect_md)
202 		rcu_assign_pointer(itn->collect_md_tun, NULL);
203 	hlist_del_init_rcu(&t->hash_node);
204 }
205 
206 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
207 					struct ip_tunnel_parm_kern *parms,
208 					int type)
209 {
210 	__be32 remote = parms->iph.daddr;
211 	__be32 local = parms->iph.saddr;
212 	IP_TUNNEL_DECLARE_FLAGS(flags);
213 	__be32 key = parms->i_key;
214 	int link = parms->link;
215 	struct ip_tunnel *t = NULL;
216 	struct hlist_head *head = ip_bucket(itn, parms);
217 
218 	ip_tunnel_flags_copy(flags, parms->i_flags);
219 
220 	hlist_for_each_entry_rcu(t, head, hash_node) {
221 		if (local == t->parms.iph.saddr &&
222 		    remote == t->parms.iph.daddr &&
223 		    link == READ_ONCE(t->parms.link) &&
224 		    type == t->dev->type &&
225 		    ip_tunnel_key_match(&t->parms, flags, key))
226 			break;
227 	}
228 	return t;
229 }
230 
231 static struct net_device *__ip_tunnel_create(struct net *net,
232 					     const struct rtnl_link_ops *ops,
233 					     struct ip_tunnel_parm_kern *parms)
234 {
235 	int err;
236 	struct ip_tunnel *tunnel;
237 	struct net_device *dev;
238 	char name[IFNAMSIZ];
239 
240 	err = -E2BIG;
241 	if (parms->name[0]) {
242 		if (!dev_valid_name(parms->name))
243 			goto failed;
244 		strscpy(name, parms->name, IFNAMSIZ);
245 	} else {
246 		if (strlen(ops->kind) > (IFNAMSIZ - 3))
247 			goto failed;
248 		strcpy(name, ops->kind);
249 		strcat(name, "%d");
250 	}
251 
252 	ASSERT_RTNL();
253 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
254 	if (!dev) {
255 		err = -ENOMEM;
256 		goto failed;
257 	}
258 	dev_net_set(dev, net);
259 
260 	dev->rtnl_link_ops = ops;
261 
262 	tunnel = netdev_priv(dev);
263 	tunnel->parms = *parms;
264 	tunnel->net = net;
265 
266 	err = register_netdevice(dev);
267 	if (err)
268 		goto failed_free;
269 
270 	return dev;
271 
272 failed_free:
273 	free_netdev(dev);
274 failed:
275 	return ERR_PTR(err);
276 }
277 
278 static int ip_tunnel_bind_dev(struct net_device *dev)
279 {
280 	struct net_device *tdev = NULL;
281 	struct ip_tunnel *tunnel = netdev_priv(dev);
282 	const struct iphdr *iph;
283 	int hlen = LL_MAX_HEADER;
284 	int mtu = ETH_DATA_LEN;
285 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
286 
287 	iph = &tunnel->parms.iph;
288 
289 	/* Guess output device to choose reasonable mtu and needed_headroom */
290 	if (iph->daddr) {
291 		struct flowi4 fl4;
292 		struct rtable *rt;
293 
294 		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
295 				    iph->saddr, tunnel->parms.o_key,
296 				    RT_TOS(iph->tos), dev_net(dev),
297 				    tunnel->parms.link, tunnel->fwmark, 0, 0);
298 		rt = ip_route_output_key(tunnel->net, &fl4);
299 
300 		if (!IS_ERR(rt)) {
301 			tdev = rt->dst.dev;
302 			ip_rt_put(rt);
303 		}
304 		if (dev->type != ARPHRD_ETHER)
305 			dev->flags |= IFF_POINTOPOINT;
306 
307 		dst_cache_reset(&tunnel->dst_cache);
308 	}
309 
310 	if (!tdev && tunnel->parms.link)
311 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
312 
313 	if (tdev) {
314 		hlen = tdev->hard_header_len + tdev->needed_headroom;
315 		mtu = min(tdev->mtu, IP_MAX_MTU);
316 	}
317 
318 	dev->needed_headroom = t_hlen + hlen;
319 	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
320 
321 	if (mtu < IPV4_MIN_MTU)
322 		mtu = IPV4_MIN_MTU;
323 
324 	return mtu;
325 }
326 
327 static struct ip_tunnel *ip_tunnel_create(struct net *net,
328 					  struct ip_tunnel_net *itn,
329 					  struct ip_tunnel_parm_kern *parms)
330 {
331 	struct ip_tunnel *nt;
332 	struct net_device *dev;
333 	int t_hlen;
334 	int mtu;
335 	int err;
336 
337 	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
338 	if (IS_ERR(dev))
339 		return ERR_CAST(dev);
340 
341 	mtu = ip_tunnel_bind_dev(dev);
342 	err = dev_set_mtu(dev, mtu);
343 	if (err)
344 		goto err_dev_set_mtu;
345 
346 	nt = netdev_priv(dev);
347 	t_hlen = nt->hlen + sizeof(struct iphdr);
348 	dev->min_mtu = ETH_MIN_MTU;
349 	dev->max_mtu = IP_MAX_MTU - t_hlen;
350 	if (dev->type == ARPHRD_ETHER)
351 		dev->max_mtu -= dev->hard_header_len;
352 
353 	ip_tunnel_add(itn, nt);
354 	return nt;
355 
356 err_dev_set_mtu:
357 	unregister_netdevice(dev);
358 	return ERR_PTR(err);
359 }
360 
361 void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
362 {
363 	const struct iphdr *iph = ip_hdr(skb);
364 	const struct udphdr *udph;
365 
366 	if (iph->protocol != IPPROTO_UDP)
367 		return;
368 
369 	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
370 	info->encap.sport = udph->source;
371 	info->encap.dport = udph->dest;
372 }
373 EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
374 
/* Common receive path for a packet that belongs to @tunnel.
 *
 * @tunnel:        tunnel the packet was matched to
 * @skb:           received packet; consumed on every path
 * @tpi:           parsed tunnel header info (flags and sequence number)
 * @tun_dst:       optional metadata dst; attached to the skb on success,
 *                 released on drop
 * @log_ecn_error: log a rate-limited message when ECN decapsulation
 *                 reports a problem
 *
 * Checks the checksum flag and sequence number against the tunnel
 * configuration, moves the network header to the inner packet, performs ECN
 * decapsulation, updates statistics and hands the skb to the tunnel's GRO
 * cells.  Always returns 0; drops are reflected only in the device counters.
 */
375 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
376 		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
377 		  bool log_ecn_error)
378 {
379 	const struct iphdr *iph = ip_hdr(skb);
380 	int nh, err;
381 
382 #ifdef CONFIG_NET_IPGRE_BROADCAST
383 	if (ipv4_is_multicast(iph->daddr)) {
384 		DEV_STATS_INC(tunnel->dev, multicast);
385 		skb->pkt_type = PACKET_BROADCAST;
386 	}
387 #endif
388 
	/* The checksum flag of the packet must agree with the tunnel's. */
389 	if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
390 	    test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
391 		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
392 		DEV_STATS_INC(tunnel->dev, rx_errors);
393 		goto drop;
394 	}
395 
	/* With sequencing enabled, drop packets that carry no sequence
	 * number or one that lies behind the expected value (signed
	 * 32-bit difference, so wrap-around is tolerated).
	 */
396 	if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
397 		if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
398 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
399 			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
400 			DEV_STATS_INC(tunnel->dev, rx_errors);
401 			goto drop;
402 		}
403 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
404 	}
405 
406 	/* Save offset of outer header relative to skb->head,
407 	 * because we are going to reset the network header to the inner header
408 	 * and might change skb->head.
409 	 */
410 	nh = skb_network_header(skb) - skb->head;
411 
	/* On Ethernet-type tunnels the inner network header follows the
	 * inner Ethernet header.
	 */
412 	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);
413 
414 	if (!pskb_inet_may_pull(skb)) {
415 		DEV_STATS_INC(tunnel->dev, rx_length_errors);
416 		DEV_STATS_INC(tunnel->dev, rx_errors);
417 		goto drop;
418 	}
	/* Re-derive the outer header pointer from the saved offset. */
419 	iph = (struct iphdr *)(skb->head + nh);
420 
421 	err = IP_ECN_decapsulate(iph, skb);
422 	if (unlikely(err)) {
423 		if (log_ecn_error)
424 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
425 					&iph->saddr, iph->tos);
		/* Only results greater than 1 cause the packet to be dropped. */
426 		if (err > 1) {
427 			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
428 			DEV_STATS_INC(tunnel->dev, rx_errors);
429 			goto drop;
430 		}
431 	}
432 
433 	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	/* Second argument is true when the tunnel's net differs from the
	 * device's net.
	 */
434 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
435 
436 	if (tunnel->dev->type == ARPHRD_ETHER) {
437 		skb->protocol = eth_type_trans(skb, tunnel->dev);
438 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
439 	} else {
440 		skb->dev = tunnel->dev;
441 	}
442 
443 	if (tun_dst)
444 		skb_dst_set(skb, (struct dst_entry *)tun_dst);
445 
446 	gro_cells_receive(&tunnel->gro_cells, skb);
447 	return 0;
448 
449 drop:
	/* The metadata dst was never attached to the skb: release it here. */
450 	if (tun_dst)
451 		dst_release((struct dst_entry *)tun_dst);
452 	kfree_skb(skb);
453 	return 0;
454 }
455 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
456 
457 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
458 			    unsigned int num)
459 {
460 	if (num >= MAX_IPTUN_ENCAP_OPS)
461 		return -ERANGE;
462 
463 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
464 			&iptun_encaps[num],
465 			NULL, ops) ? 0 : -1;
466 }
467 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
468 
469 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
470 			    unsigned int num)
471 {
472 	int ret;
473 
474 	if (num >= MAX_IPTUN_ENCAP_OPS)
475 		return -ERANGE;
476 
477 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
478 		       &iptun_encaps[num],
479 		       ops, NULL) == ops) ? 0 : -1;
480 
481 	synchronize_net();
482 
483 	return ret;
484 }
485 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
486 
487 int ip_tunnel_encap_setup(struct ip_tunnel *t,
488 			  struct ip_tunnel_encap *ipencap)
489 {
490 	int hlen;
491 
492 	memset(&t->encap, 0, sizeof(t->encap));
493 
494 	hlen = ip_encap_hlen(ipencap);
495 	if (hlen < 0)
496 		return hlen;
497 
498 	t->encap.type = ipencap->type;
499 	t->encap.sport = ipencap->sport;
500 	t->encap.dport = ipencap->dport;
501 	t->encap.flags = ipencap->flags;
502 
503 	t->encap_hlen = hlen;
504 	t->hlen = t->encap_hlen + t->tun_hlen;
505 
506 	return 0;
507 }
508 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
509 
/* Work out the MTU available to the inner packet, propagate it to the
 * skb's dst and signal "too big" to the sender when needed.
 *
 * @dev:         tunnel device
 * @skb:         packet about to be encapsulated
 * @rt:          route to the tunnel endpoint
 * @df:          outer DF setting; non-zero selects the route MTU as basis
 * @inner_iph:   inner IPv4 header (only read when skb->protocol is IPv4)
 * @tunnel_hlen: tunnel header length, used only when @md is true
 * @dst:         outer destination, used only when @md is true
 * @md:          true for metadata-based transmit; otherwise tunnel->hlen and
 *               tunnel->parms.iph.daddr are used instead
 *
 * Returns 0 if the packet may be sent, or -E2BIG after an ICMP/ICMPv6
 * error has been generated for an oversized packet.
 */
510 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
511 			    struct rtable *rt, __be16 df,
512 			    const struct iphdr *inner_iph,
513 			    int tunnel_hlen, __be32 dst, bool md)
514 {
515 	struct ip_tunnel *tunnel = netdev_priv(dev);
516 	int pkt_size;
517 	int mtu;
518 
519 	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	/* Size of the inner packet without tunnel (and, for Ethernet-type
	 * devices, link-layer) headers.
	 */
520 	pkt_size = skb->len - tunnel_hlen;
521 	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
522 
523 	if (df) {
		/* Route MTU minus outer IP and tunnel header overhead. */
524 		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
525 		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
526 	} else {
527 		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
528 	}
529 
530 	if (skb_valid_dst(skb))
531 		skb_dst_update_pmtu_no_confirm(skb, mtu);
532 
533 	if (skb->protocol == htons(ETH_P_IP)) {
		/* Non-GSO IPv4 packet with DF set that does not fit. */
534 		if (!skb_is_gso(skb) &&
535 		    (inner_iph->frag_off & htons(IP_DF)) &&
536 		    mtu < pkt_size) {
537 			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
538 			return -E2BIG;
539 		}
540 	}
541 #if IS_ENABLED(CONFIG_IPV6)
542 	else if (skb->protocol == htons(ETH_P_IPV6)) {
543 		struct rt6_info *rt6;
544 		__be32 daddr;
545 
546 		rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) :
547 					   NULL;
548 		daddr = md ? dst : tunnel->parms.iph.daddr;
549 
		/* Lower the MTU metric of the IPv6 route when the tunnel MTU
		 * is smaller, but only for a unicast tunnel destination or a
		 * /128 route.
		 */
550 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
551 			   mtu >= IPV6_MIN_MTU) {
552 			if ((daddr && !ipv4_is_multicast(daddr)) ||
553 			    rt6->rt6i_dst.plen == 128) {
554 				rt6->rt6i_flags |= RTF_MODIFIED;
555 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
556 			}
557 		}
558 
559 		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
560 					mtu < pkt_size) {
561 			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
562 			return -E2BIG;
563 		}
564 	}
565 #endif
566 	return 0;
567 }
568 
569 static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
570 {
571 	/* we must cap headroom to some upperlimit, else pskb_expand_head
572 	 * will overflow header offsets in skb_headers_offset_update().
573 	 */
574 	static const unsigned int max_allowed = 512;
575 
576 	if (headroom > max_allowed)
577 		headroom = max_allowed;
578 
579 	if (headroom > READ_ONCE(dev->needed_headroom))
580 		WRITE_ONCE(dev->needed_headroom, headroom);
581 }
582 
/* Transmit @skb through tunnel device @dev using the per-packet tunnel
 * metadata attached to the skb instead of the device's own parameters.
 *
 * @skb:         packet to send; consumed on every path
 * @dev:         tunnel device
 * @proto:       outer IP protocol number (may be changed by
 *               ip_tunnel_encap())
 * @tunnel_hlen: tunnel header length; when 0 it is derived from the
 *               metadata's encap settings via ip_encap_hlen()
 *
 * Errors are reported only through the device statistics.
 */
583 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
584 		       u8 proto, int tunnel_hlen)
585 {
586 	struct ip_tunnel *tunnel = netdev_priv(dev);
587 	u32 headroom = sizeof(struct iphdr);
588 	struct ip_tunnel_info *tun_info;
589 	const struct ip_tunnel_key *key;
590 	const struct iphdr *inner_iph;
591 	struct rtable *rt = NULL;
592 	struct flowi4 fl4;
593 	__be16 df = 0;
594 	u8 tos, ttl;
595 	bool use_cache;
596 
	/* TX-mode IPv4 tunnel metadata is mandatory on this path. */
597 	tun_info = skb_tunnel_info(skb);
598 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
599 		     ip_tunnel_info_af(tun_info) != AF_INET))
600 		goto tx_error;
601 	key = &tun_info->key;
602 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
603 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
604 	tos = key->tos;
	/* A key TOS of exactly 1 takes the TOS from the inner header. */
605 	if (tos == 1) {
606 		if (skb->protocol == htons(ETH_P_IP))
607 			tos = inner_iph->tos;
608 		else if (skb->protocol == htons(ETH_P_IPV6))
609 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
610 	}
611 	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
612 			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
613 			    dev_net(dev), 0, skb->mark, skb_get_hash(skb),
614 			    key->flow_flags);
615 
616 	if (!tunnel_hlen)
617 		tunnel_hlen = ip_encap_hlen(&tun_info->encap);
618 
619 	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
620 		goto tx_error;
621 
	/* Try the metadata's dst cache before doing a full route lookup;
	 * a freshly looked-up route is stored back when the cache is usable.
	 */
622 	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
623 	if (use_cache)
624 		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
625 	if (!rt) {
626 		rt = ip_route_output_key(tunnel->net, &fl4);
627 		if (IS_ERR(rt)) {
628 			DEV_STATS_INC(dev, tx_carrier_errors);
629 			goto tx_error;
630 		}
631 		if (use_cache)
632 			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
633 					  fl4.saddr);
634 	}
	/* The route leads back to this very device: count a collision. */
635 	if (rt->dst.dev == dev) {
636 		ip_rt_put(rt);
637 		DEV_STATS_INC(dev, collisions);
638 		goto tx_error;
639 	}
640 
641 	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
642 		df = htons(IP_DF);
643 	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
644 			    key->u.ipv4.dst, true)) {
645 		ip_rt_put(rt);
646 		goto tx_error;
647 	}
648 
649 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
650 	ttl = key->ttl;
	/* A zero TTL in the key means: take it from the inner header, or
	 * use the route's hop limit for other protocols.
	 */
651 	if (ttl == 0) {
652 		if (skb->protocol == htons(ETH_P_IP))
653 			ttl = inner_iph->ttl;
654 		else if (skb->protocol == htons(ETH_P_IPV6))
655 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
656 		else
657 			ttl = ip4_dst_hoplimit(&rt->dst);
658 	}
659 
	/* Make sure there is writable room for the outer headers. */
660 	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
661 	if (skb_cow_head(skb, headroom)) {
662 		ip_rt_put(rt);
663 		goto tx_dropped;
664 	}
665 
666 	ip_tunnel_adj_headroom(dev, headroom);
667 
668 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
669 		      df, !net_eq(tunnel->net, dev_net(dev)));
670 	return;
	/* tx_error counts tx_errors, tx_dropped counts tx_dropped; both end
	 * by freeing the skb.
	 */
671 tx_error:
672 	DEV_STATS_INC(dev, tx_errors);
673 	goto kfree;
674 tx_dropped:
675 	DEV_STATS_INC(dev, tx_dropped);
676 kfree:
677 	kfree_skb(skb);
678 }
679 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
680 
/* Transmit @skb through tunnel device @dev using the outer header template
 * @tnl_params.
 *
 * @skb:        packet to send; consumed on every path
 * @dev:        tunnel device
 * @tnl_params: outer IPv4 header template (addresses, TOS, TTL, frag_off)
 * @protocol:   outer IP protocol number (may be changed by
 *              ip_tunnel_encap())
 *
 * Resolves the outer destination (including the NBMA case where the template
 * has no destination), selects TOS and TTL, finds a route (using a dst cache
 * when possible), applies PMTU handling, ensures headroom and finally calls
 * iptunnel_xmit().  Errors are reported only through device statistics.
 */
681 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
682 		    const struct iphdr *tnl_params, u8 protocol)
683 {
684 	struct ip_tunnel *tunnel = netdev_priv(dev);
685 	struct ip_tunnel_info *tun_info = NULL;
686 	const struct iphdr *inner_iph;
687 	unsigned int max_headroom;	/* The extra header space needed */
688 	struct rtable *rt = NULL;		/* Route to the other host */
689 	__be16 payload_protocol;
690 	bool use_cache = false;
691 	struct flowi4 fl4;
692 	bool md = false;
693 	bool connected;
694 	u8 tos, ttl;
695 	__be32 dst;
696 	__be16 df;
697 
698 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	/* "connected" gates use of a dst cache further down; it starts out
	 * true when the tunnel has a configured destination.
	 */
699 	connected = (tunnel->parms.iph.daddr != 0);
700 	payload_protocol = skb_protocol(skb, true);
701 
702 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
703 
704 	dst = tnl_params->daddr;
705 	if (dst == 0) {
706 		/* NBMA tunnel */
707 
708 		if (!skb_dst(skb)) {
709 			DEV_STATS_INC(dev, tx_fifo_errors);
710 			goto tx_error;
711 		}
712 
		/* Destination sources, in order of preference: TX tunnel
		 * metadata on the skb, the next hop of the inner IPv4 route,
		 * or an IPv4-compatible IPv6 neighbour/destination address.
		 */
713 		tun_info = skb_tunnel_info(skb);
714 		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
715 		    ip_tunnel_info_af(tun_info) == AF_INET &&
716 		    tun_info->key.u.ipv4.dst) {
717 			dst = tun_info->key.u.ipv4.dst;
718 			md = true;
719 			connected = true;
720 		} else if (payload_protocol == htons(ETH_P_IP)) {
721 			rt = skb_rtable(skb);
722 			dst = rt_nexthop(rt, inner_iph->daddr);
723 		}
724 #if IS_ENABLED(CONFIG_IPV6)
725 		else if (payload_protocol == htons(ETH_P_IPV6)) {
726 			const struct in6_addr *addr6;
727 			struct neighbour *neigh;
728 			bool do_tx_error_icmp;
729 			int addr_type;
730 
731 			neigh = dst_neigh_lookup(skb_dst(skb),
732 						 &ipv6_hdr(skb)->daddr);
733 			if (!neigh)
734 				goto tx_error;
735 
736 			addr6 = (const struct in6_addr *)&neigh->primary_key;
737 			addr_type = ipv6_addr_type(addr6);
738 
			/* Unspecified neighbour key: use the packet's own
			 * destination address instead.
			 */
739 			if (addr_type == IPV6_ADDR_ANY) {
740 				addr6 = &ipv6_hdr(skb)->daddr;
741 				addr_type = ipv6_addr_type(addr6);
742 			}
743 
			/* Only IPv4-compatible addresses yield an outer
			 * destination (their last 32 bits).
			 */
744 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
745 				do_tx_error_icmp = true;
746 			else {
747 				do_tx_error_icmp = false;
748 				dst = addr6->s6_addr32[3];
749 			}
			/* Drop the neighbour reference before any error exit. */
750 			neigh_release(neigh);
751 			if (do_tx_error_icmp)
752 				goto tx_error_icmp;
753 		}
754 #endif
755 		else
756 			goto tx_error;
757 
758 		if (!md)
759 			connected = false;
760 	}
761 
762 	tos = tnl_params->tos;
	/* Low TOS bit set: clear it and inherit the TOS from the inner IPv4
	 * or IPv6 header; the tunnel is then no longer "connected".
	 */
763 	if (tos & 0x1) {
764 		tos &= ~0x1;
765 		if (payload_protocol == htons(ETH_P_IP)) {
766 			tos = inner_iph->tos;
767 			connected = false;
768 		} else if (payload_protocol == htons(ETH_P_IPV6)) {
769 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
770 			connected = false;
771 		}
772 	}
773 
774 	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
775 			    tunnel->parms.o_key, RT_TOS(tos),
776 			    dev_net(dev), READ_ONCE(tunnel->parms.link),
777 			    tunnel->fwmark, skb_get_hash(skb), 0);
778 
779 	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
780 		goto tx_error;
781 
	/* Metadata-driven packets use the metadata's dst cache (if usable);
	 * other connected tunnels use the tunnel's own cache; everything
	 * else always does a fresh route lookup.
	 */
782 	if (connected && md) {
783 		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
784 		if (use_cache)
785 			rt = dst_cache_get_ip4(&tun_info->dst_cache,
786 					       &fl4.saddr);
787 	} else {
788 		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
789 						&fl4.saddr) : NULL;
790 	}
791 
792 	if (!rt) {
793 		rt = ip_route_output_key(tunnel->net, &fl4);
794 
795 		if (IS_ERR(rt)) {
796 			DEV_STATS_INC(dev, tx_carrier_errors);
797 			goto tx_error;
798 		}
		/* Store the new route in whichever cache was consulted. */
799 		if (use_cache)
800 			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
801 					  fl4.saddr);
802 		else if (!md && connected)
803 			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
804 					  fl4.saddr);
805 	}
806 
	/* The route leads back to this very device: count a collision. */
807 	if (rt->dst.dev == dev) {
808 		ip_rt_put(rt);
809 		DEV_STATS_INC(dev, collisions);
810 		goto tx_error;
811 	}
812 
	/* Outer DF: template value, plus the inner IPv4 DF bit unless the
	 * tunnel is set to ignore it.
	 */
813 	df = tnl_params->frag_off;
814 	if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
815 		df |= (inner_iph->frag_off & htons(IP_DF));
816 
817 	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
818 		ip_rt_put(rt);
819 		goto tx_error;
820 	}
821 
	/* While errors are pending and still within IPTUNNEL_ERR_TIMEO of
	 * err_time, consume one count and report a link failure for this
	 * skb (transmission continues); after the timeout the count resets.
	 */
822 	if (tunnel->err_count > 0) {
823 		if (time_before(jiffies,
824 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
825 			tunnel->err_count--;
826 
827 			dst_link_failure(skb);
828 		} else
829 			tunnel->err_count = 0;
830 	}
831 
832 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
833 	ttl = tnl_params->ttl;
	/* A zero template TTL means: inherit from the inner header, or use
	 * the route's hop limit for other protocols.
	 */
834 	if (ttl == 0) {
835 		if (payload_protocol == htons(ETH_P_IP))
836 			ttl = inner_iph->ttl;
837 #if IS_ENABLED(CONFIG_IPV6)
838 		else if (payload_protocol == htons(ETH_P_IPV6))
839 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
840 #endif
841 		else
842 			ttl = ip4_dst_hoplimit(&rt->dst);
843 	}
844 
845 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
846 			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
847 
848 	if (skb_cow_head(skb, max_headroom)) {
849 		ip_rt_put(rt);
850 		DEV_STATS_INC(dev, tx_dropped);
851 		kfree_skb(skb);
852 		return;
853 	}
854 
855 	ip_tunnel_adj_headroom(dev, max_headroom);
856 
857 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
858 		      df, !net_eq(tunnel->net, dev_net(dev)));
859 	return;
860 
861 #if IS_ENABLED(CONFIG_IPV6)
862 tx_error_icmp:
863 	dst_link_failure(skb);
864 #endif
865 tx_error:
866 	DEV_STATS_INC(dev, tx_errors);
867 	kfree_skb(skb);
868 }
869 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
870 
871 static void ip_tunnel_update(struct ip_tunnel_net *itn,
872 			     struct ip_tunnel *t,
873 			     struct net_device *dev,
874 			     struct ip_tunnel_parm_kern *p,
875 			     bool set_mtu,
876 			     __u32 fwmark)
877 {
878 	ip_tunnel_del(itn, t);
879 	t->parms.iph.saddr = p->iph.saddr;
880 	t->parms.iph.daddr = p->iph.daddr;
881 	t->parms.i_key = p->i_key;
882 	t->parms.o_key = p->o_key;
883 	if (dev->type != ARPHRD_ETHER) {
884 		__dev_addr_set(dev, &p->iph.saddr, 4);
885 		memcpy(dev->broadcast, &p->iph.daddr, 4);
886 	}
887 	ip_tunnel_add(itn, t);
888 
889 	t->parms.iph.ttl = p->iph.ttl;
890 	t->parms.iph.tos = p->iph.tos;
891 	t->parms.iph.frag_off = p->iph.frag_off;
892 
893 	if (t->parms.link != p->link || t->fwmark != fwmark) {
894 		int mtu;
895 
896 		WRITE_ONCE(t->parms.link, p->link);
897 		t->fwmark = fwmark;
898 		mtu = ip_tunnel_bind_dev(dev);
899 		if (set_mtu)
900 			WRITE_ONCE(dev->mtu, mtu);
901 	}
902 	dst_cache_reset(&t->dst_cache);
903 	netdev_state_change(dev);
904 }
905 
/* Handle the tunnel ioctl commands SIOCGETTUNNEL, SIOCADDTUNNEL,
 * SIOCCHGTUNNEL and SIOCDELTUNNEL for device @dev.
 *
 * @dev: device the request was issued on
 * @p:   tunnel parameters; input for all commands, overwritten with the
 *       tunnel's parameters by SIOCGETTUNNEL
 * @cmd: one of the commands above
 *
 * Returns 0 on success or a negative errno: -EPERM without CAP_NET_ADMIN in
 * the tunnel's user namespace (or when trying to delete the fallback tunnel
 * itself), -EEXIST, -ENOENT or -EINVAL as set below, or the error from
 * ip_tunnel_create().
 */
906 int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
907 		  int cmd)
908 {
909 	int err = 0;
910 	struct ip_tunnel *t = netdev_priv(dev);
911 	struct net *net = t->net;
912 	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
913 
914 	switch (cmd) {
915 	case SIOCGETTUNNEL:
		/* On the fallback device, @p selects which tunnel to report;
		 * if none matches, the fallback tunnel itself is reported.
		 */
916 		if (dev == itn->fb_tunnel_dev) {
917 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
918 			if (!t)
919 				t = netdev_priv(dev);
920 		}
921 		memcpy(p, &t->parms, sizeof(*p));
922 		break;
923 
924 	case SIOCADDTUNNEL:
925 	case SIOCCHGTUNNEL:
926 		err = -EPERM;
927 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
928 			goto done;
		/* A non-zero TTL sets DF in the requested frag_off. */
929 		if (p->iph.ttl)
930 			p->iph.frag_off |= htons(IP_DF);
		/* Except for VTI, keys are zeroed unless the KEY flag is set. */
931 		if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) {
932 			if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags))
933 				p->i_key = 0;
934 			if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags))
935 				p->o_key = 0;
936 		}
937 
938 		t = ip_tunnel_find(itn, p, itn->type);
939 
940 		if (cmd == SIOCADDTUNNEL) {
941 			if (!t) {
942 				t = ip_tunnel_create(net, itn, p);
943 				err = PTR_ERR_OR_ZERO(t);
944 				break;
945 			}
946 
			/* A tunnel with these parameters already exists. */
947 			err = -EEXIST;
948 			break;
949 		}
950 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
951 			if (t) {
				/* The new parameters collide with another
				 * device's tunnel.
				 */
952 				if (t->dev != dev) {
953 					err = -EEXIST;
954 					break;
955 				}
956 			} else {
957 				unsigned int nflags = 0;
958 
959 				if (ipv4_is_multicast(p->iph.daddr))
960 					nflags = IFF_BROADCAST;
961 				else if (p->iph.daddr)
962 					nflags = IFF_POINTOPOINT;
963 
				/* Reject a change that would alter the
				 * device's POINTOPOINT/BROADCAST flags.
				 */
964 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
965 					err = -EINVAL;
966 					break;
967 				}
968 
969 				t = netdev_priv(dev);
970 			}
971 		}
972 
973 		if (t) {
974 			err = 0;
975 			ip_tunnel_update(itn, t, dev, p, true, 0);
976 		} else {
977 			err = -ENOENT;
978 		}
979 		break;
980 
981 	case SIOCDELTUNNEL:
982 		err = -EPERM;
983 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
984 			goto done;
985 
		/* On the fallback device, @p names the tunnel to delete; the
		 * fallback tunnel itself cannot be deleted this way.
		 */
986 		if (dev == itn->fb_tunnel_dev) {
987 			err = -ENOENT;
988 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
989 			if (!t)
990 				goto done;
991 			err = -EPERM;
992 			if (t == netdev_priv(itn->fb_tunnel_dev))
993 				goto done;
994 			dev = t->dev;
995 		}
996 		unregister_netdevice(dev);
997 		err = 0;
998 		break;
999 
1000 	default:
1001 		err = -EINVAL;
1002 	}
1003 
1004 done:
1005 	return err;
1006 }
1007 EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
1008 
1009 bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
1010 			      const void __user *data)
1011 {
1012 	struct ip_tunnel_parm p;
1013 
1014 	if (copy_from_user(&p, data, sizeof(p)))
1015 		return false;
1016 
1017 	strscpy(kp->name, p.name);
1018 	kp->link = p.link;
1019 	ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags);
1020 	ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags);
1021 	kp->i_key = p.i_key;
1022 	kp->o_key = p.o_key;
1023 	memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph)));
1024 
1025 	return true;
1026 }
1027 EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user);
1028 
1029 bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp)
1030 {
1031 	struct ip_tunnel_parm p;
1032 
1033 	if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) ||
1034 	    !ip_tunnel_flags_is_be16_compat(kp->o_flags))
1035 		return false;
1036 
1037 	memset(&p, 0, sizeof(p));
1038 
1039 	strscpy(p.name, kp->name);
1040 	p.link = kp->link;
1041 	p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags);
1042 	p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags);
1043 	p.i_key = kp->i_key;
1044 	p.o_key = kp->o_key;
1045 	memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph)));
1046 
1047 	return !copy_to_user(data, &p, sizeof(p));
1048 }
1049 EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user);
1050 
1051 int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
1052 			     void __user *data, int cmd)
1053 {
1054 	struct ip_tunnel_parm_kern p;
1055 	int err;
1056 
1057 	if (!ip_tunnel_parm_from_user(&p, data))
1058 		return -EFAULT;
1059 	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
1060 	if (!err && !ip_tunnel_parm_to_user(data, &p))
1061 		return -EFAULT;
1062 	return err;
1063 }
1064 EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
1065 
1066 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
1067 {
1068 	struct ip_tunnel *tunnel = netdev_priv(dev);
1069 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
1070 	int max_mtu = IP_MAX_MTU - t_hlen;
1071 
1072 	if (dev->type == ARPHRD_ETHER)
1073 		max_mtu -= dev->hard_header_len;
1074 
1075 	if (new_mtu < ETH_MIN_MTU)
1076 		return -EINVAL;
1077 
1078 	if (new_mtu > max_mtu) {
1079 		if (strict)
1080 			return -EINVAL;
1081 
1082 		new_mtu = max_mtu;
1083 	}
1084 
1085 	WRITE_ONCE(dev->mtu, new_mtu);
1086 	return 0;
1087 }
1088 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
1089 
1090 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1091 {
1092 	return __ip_tunnel_change_mtu(dev, new_mtu, true);
1093 }
1094 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
1095 
1096 static void ip_tunnel_dev_free(struct net_device *dev)
1097 {
1098 	struct ip_tunnel *tunnel = netdev_priv(dev);
1099 
1100 	gro_cells_destroy(&tunnel->gro_cells);
1101 	dst_cache_destroy(&tunnel->dst_cache);
1102 	free_percpu(dev->tstats);
1103 }
1104 
1105 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1106 {
1107 	struct ip_tunnel *tunnel = netdev_priv(dev);
1108 	struct ip_tunnel_net *itn;
1109 
1110 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1111 
1112 	if (itn->fb_tunnel_dev != dev) {
1113 		ip_tunnel_del(itn, netdev_priv(dev));
1114 		unregister_netdevice_queue(dev, head);
1115 	}
1116 }
1117 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1118 
1119 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1120 {
1121 	struct ip_tunnel *tunnel = netdev_priv(dev);
1122 
1123 	return READ_ONCE(tunnel->net);
1124 }
1125 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1126 
1127 int ip_tunnel_get_iflink(const struct net_device *dev)
1128 {
1129 	const struct ip_tunnel *tunnel = netdev_priv(dev);
1130 
1131 	return READ_ONCE(tunnel->parms.link);
1132 }
1133 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1134 
1135 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1136 				  struct rtnl_link_ops *ops, char *devname)
1137 {
1138 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1139 	struct ip_tunnel_parm_kern parms;
1140 	unsigned int i;
1141 
1142 	itn->rtnl_link_ops = ops;
1143 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1144 		INIT_HLIST_HEAD(&itn->tunnels[i]);
1145 
1146 	if (!ops || !net_has_fallback_tunnels(net)) {
1147 		struct ip_tunnel_net *it_init_net;
1148 
1149 		it_init_net = net_generic(&init_net, ip_tnl_net_id);
1150 		itn->type = it_init_net->type;
1151 		itn->fb_tunnel_dev = NULL;
1152 		return 0;
1153 	}
1154 
1155 	memset(&parms, 0, sizeof(parms));
1156 	if (devname)
1157 		strscpy(parms.name, devname, IFNAMSIZ);
1158 
1159 	rtnl_lock();
1160 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1161 	/* FB netdevice is special: we have one, and only one per netns.
1162 	 * Allowing to move it to another netns is clearly unsafe.
1163 	 */
1164 	if (!IS_ERR(itn->fb_tunnel_dev)) {
1165 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1166 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1167 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1168 		itn->type = itn->fb_tunnel_dev->type;
1169 	}
1170 	rtnl_unlock();
1171 
1172 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1173 }
1174 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1175 
1176 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1177 			      struct list_head *head,
1178 			      struct rtnl_link_ops *ops)
1179 {
1180 	struct net_device *dev, *aux;
1181 	int h;
1182 
1183 	for_each_netdev_safe(net, dev, aux)
1184 		if (dev->rtnl_link_ops == ops)
1185 			unregister_netdevice_queue(dev, head);
1186 
1187 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1188 		struct ip_tunnel *t;
1189 		struct hlist_node *n;
1190 		struct hlist_head *thead = &itn->tunnels[h];
1191 
1192 		hlist_for_each_entry_safe(t, n, thead, hash_node)
1193 			/* If dev is in the same netns, it has already
1194 			 * been added to the list by the previous loop.
1195 			 */
1196 			if (!net_eq(dev_net(t->dev), net))
1197 				unregister_netdevice_queue(t->dev, head);
1198 	}
1199 }
1200 
1201 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1202 			   struct rtnl_link_ops *ops,
1203 			   struct list_head *dev_to_kill)
1204 {
1205 	struct ip_tunnel_net *itn;
1206 	struct net *net;
1207 
1208 	ASSERT_RTNL();
1209 	list_for_each_entry(net, net_list, exit_list) {
1210 		itn = net_generic(net, id);
1211 		ip_tunnel_destroy(net, itn, dev_to_kill, ops);
1212 	}
1213 }
1214 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1215 
1216 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1217 		      struct ip_tunnel_parm_kern *p, __u32 fwmark)
1218 {
1219 	struct ip_tunnel *nt;
1220 	struct net *net = dev_net(dev);
1221 	struct ip_tunnel_net *itn;
1222 	int mtu;
1223 	int err;
1224 
1225 	nt = netdev_priv(dev);
1226 	itn = net_generic(net, nt->ip_tnl_net_id);
1227 
1228 	if (nt->collect_md) {
1229 		if (rtnl_dereference(itn->collect_md_tun))
1230 			return -EEXIST;
1231 	} else {
1232 		if (ip_tunnel_find(itn, p, dev->type))
1233 			return -EEXIST;
1234 	}
1235 
1236 	nt->net = net;
1237 	nt->parms = *p;
1238 	nt->fwmark = fwmark;
1239 	err = register_netdevice(dev);
1240 	if (err)
1241 		goto err_register_netdevice;
1242 
1243 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1244 		eth_hw_addr_random(dev);
1245 
1246 	mtu = ip_tunnel_bind_dev(dev);
1247 	if (tb[IFLA_MTU]) {
1248 		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
1249 
1250 		if (dev->type == ARPHRD_ETHER)
1251 			max -= dev->hard_header_len;
1252 
1253 		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
1254 	}
1255 
1256 	err = dev_set_mtu(dev, mtu);
1257 	if (err)
1258 		goto err_dev_set_mtu;
1259 
1260 	ip_tunnel_add(itn, nt);
1261 	return 0;
1262 
1263 err_dev_set_mtu:
1264 	unregister_netdevice(dev);
1265 err_register_netdevice:
1266 	return err;
1267 }
1268 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1269 
1270 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1271 			 struct ip_tunnel_parm_kern *p, __u32 fwmark)
1272 {
1273 	struct ip_tunnel *t;
1274 	struct ip_tunnel *tunnel = netdev_priv(dev);
1275 	struct net *net = tunnel->net;
1276 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1277 
1278 	if (dev == itn->fb_tunnel_dev)
1279 		return -EINVAL;
1280 
1281 	t = ip_tunnel_find(itn, p, dev->type);
1282 
1283 	if (t) {
1284 		if (t->dev != dev)
1285 			return -EEXIST;
1286 	} else {
1287 		t = tunnel;
1288 
1289 		if (dev->type != ARPHRD_ETHER) {
1290 			unsigned int nflags = 0;
1291 
1292 			if (ipv4_is_multicast(p->iph.daddr))
1293 				nflags = IFF_BROADCAST;
1294 			else if (p->iph.daddr)
1295 				nflags = IFF_POINTOPOINT;
1296 
1297 			if ((dev->flags ^ nflags) &
1298 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1299 				return -EINVAL;
1300 		}
1301 	}
1302 
1303 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1304 	return 0;
1305 }
1306 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1307 
1308 int ip_tunnel_init(struct net_device *dev)
1309 {
1310 	struct ip_tunnel *tunnel = netdev_priv(dev);
1311 	struct iphdr *iph = &tunnel->parms.iph;
1312 	int err;
1313 
1314 	dev->needs_free_netdev = true;
1315 	dev->priv_destructor = ip_tunnel_dev_free;
1316 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1317 	if (!dev->tstats)
1318 		return -ENOMEM;
1319 
1320 	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1321 	if (err) {
1322 		free_percpu(dev->tstats);
1323 		return err;
1324 	}
1325 
1326 	err = gro_cells_init(&tunnel->gro_cells, dev);
1327 	if (err) {
1328 		dst_cache_destroy(&tunnel->dst_cache);
1329 		free_percpu(dev->tstats);
1330 		return err;
1331 	}
1332 
1333 	tunnel->dev = dev;
1334 	tunnel->net = dev_net(dev);
1335 	strcpy(tunnel->parms.name, dev->name);
1336 	iph->version		= 4;
1337 	iph->ihl		= 5;
1338 
1339 	if (tunnel->collect_md)
1340 		netif_keep_dst(dev);
1341 	netdev_lockdep_set_classes(dev);
1342 	return 0;
1343 }
1344 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1345 
1346 void ip_tunnel_uninit(struct net_device *dev)
1347 {
1348 	struct ip_tunnel *tunnel = netdev_priv(dev);
1349 	struct net *net = tunnel->net;
1350 	struct ip_tunnel_net *itn;
1351 
1352 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1353 	ip_tunnel_del(itn, netdev_priv(dev));
1354 	if (itn->fb_tunnel_dev == dev)
1355 		WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1356 
1357 	dst_cache_reset(&tunnel->dst_cache);
1358 }
1359 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1360 
1361 /* Do least required initialization, rest of init is done in tunnel_init call */
1362 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1363 {
1364 	struct ip_tunnel *tunnel = netdev_priv(dev);
1365 	tunnel->ip_tnl_net_id = net_id;
1366 }
1367 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1368 
1369 MODULE_DESCRIPTION("IPv4 tunnel implementation library");
1370 MODULE_LICENSE("GPL");
1371