xref: /linux/net/ipv4/ip_tunnel.c (revision 6af91e3d2cfc8bb579b1aa2d22cd91f8c34acdf6)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2013 Nicira, Inc.
4  */
5 
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
16 #include <linux/in.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
30 
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
36 #include <net/arp.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
40 #include <net/xfrm.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
44 #include <net/udp.h>
45 #include <net/dst_metadata.h>
46 
47 #if IS_ENABLED(CONFIG_IPV6)
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52 
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
54 {
55 	return hash_32((__force u32)key ^ (__force u32)remote,
56 			 IP_TNL_HASH_BITS);
57 }
58 
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p,
60 				const unsigned long *flags, __be32 key)
61 {
62 	if (!test_bit(IP_TUNNEL_KEY_BIT, flags))
63 		return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags);
64 
65 	return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key;
66 }
67 
/* Fallback tunnel: no source, no destination, no key, no options
 *
 * Tunnel hash table:
 * We require an exact key match, i.e. if a key is present in the packet
 * it will match only a tunnel with the same key; if it is not present,
 * it will match only a keyless tunnel.
 *
 * All keyless packets that do not match a configured keyless tunnel
 * will match the fallback tunnel.
 *
 * Given src, dst and key, find the appropriate tunnel for input.
 *
 * The search runs in four passes, from the most to the least specific
 * address match.  Only tunnels whose device is IFF_UP are considered.
 * In every pass a tunnel bound to @link is returned immediately; a
 * tunnel that matches on everything but the link is only remembered as
 * a candidate.  If no pass yields a link match, the remembered candidate
 * is returned, then the collect_md tunnel (if up), then the fallback
 * device's tunnel (if up), and finally NULL.
 *
 * The hash lists are walked with hlist_for_each_entry_rcu() and the
 * collect_md pointer is read with rcu_dereference(), so the caller is
 * presumably expected to be inside an RCU read-side critical section.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, const unsigned long *flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: both local and remote addresses match exactly. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		/* Unlike the later passes, the last match here wins. */
		cand = t;
	}

	/* Pass 2: remote matches and the tunnel has a wildcard (zero) source. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	/* The remaining passes use the bucket for a zero remote address. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: either the tunnel's source is @local and its destination is
	 * a wildcard, or @local is a multicast address equal to the tunnel's
	 * destination.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	/* Pass 4: tunnels with both addresses wildcarded.  The key is compared
	 * directly here (not via ip_tunnel_key_match()), and a key mismatch
	 * is tolerated when IP_TUNNEL_NO_KEY_BIT is set in @flags.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) &&
		     t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	/* No link-exact match: settle for the best candidate seen. */
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
169 
170 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
171 				    struct ip_tunnel_parm_kern *parms)
172 {
173 	unsigned int h;
174 	__be32 remote;
175 	__be32 i_key = parms->i_key;
176 
177 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
178 		remote = parms->iph.daddr;
179 	else
180 		remote = 0;
181 
182 	if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) &&
183 	    test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags))
184 		i_key = 0;
185 
186 	h = ip_tunnel_hash(i_key, remote);
187 	return &itn->tunnels[h];
188 }
189 
190 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
191 {
192 	struct hlist_head *head = ip_bucket(itn, &t->parms);
193 
194 	if (t->collect_md)
195 		rcu_assign_pointer(itn->collect_md_tun, t);
196 	hlist_add_head_rcu(&t->hash_node, head);
197 }
198 
199 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
200 {
201 	if (t->collect_md)
202 		rcu_assign_pointer(itn->collect_md_tun, NULL);
203 	hlist_del_init_rcu(&t->hash_node);
204 }
205 
206 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
207 					struct ip_tunnel_parm_kern *parms,
208 					int type)
209 {
210 	__be32 remote = parms->iph.daddr;
211 	__be32 local = parms->iph.saddr;
212 	IP_TUNNEL_DECLARE_FLAGS(flags);
213 	__be32 key = parms->i_key;
214 	int link = parms->link;
215 	struct ip_tunnel *t = NULL;
216 	struct hlist_head *head = ip_bucket(itn, parms);
217 
218 	ip_tunnel_flags_copy(flags, parms->i_flags);
219 
220 	hlist_for_each_entry_rcu(t, head, hash_node) {
221 		if (local == t->parms.iph.saddr &&
222 		    remote == t->parms.iph.daddr &&
223 		    link == READ_ONCE(t->parms.link) &&
224 		    type == t->dev->type &&
225 		    ip_tunnel_key_match(&t->parms, flags, key))
226 			break;
227 	}
228 	return t;
229 }
230 
231 static struct net_device *__ip_tunnel_create(struct net *net,
232 					     const struct rtnl_link_ops *ops,
233 					     struct ip_tunnel_parm_kern *parms)
234 {
235 	int err;
236 	struct ip_tunnel *tunnel;
237 	struct net_device *dev;
238 	char name[IFNAMSIZ];
239 
240 	err = -E2BIG;
241 	if (parms->name[0]) {
242 		if (!dev_valid_name(parms->name))
243 			goto failed;
244 		strscpy(name, parms->name, IFNAMSIZ);
245 	} else {
246 		if (strlen(ops->kind) > (IFNAMSIZ - 3))
247 			goto failed;
248 		strcpy(name, ops->kind);
249 		strcat(name, "%d");
250 	}
251 
252 	ASSERT_RTNL();
253 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
254 	if (!dev) {
255 		err = -ENOMEM;
256 		goto failed;
257 	}
258 	dev_net_set(dev, net);
259 
260 	dev->rtnl_link_ops = ops;
261 
262 	tunnel = netdev_priv(dev);
263 	tunnel->parms = *parms;
264 	tunnel->net = net;
265 
266 	err = register_netdevice(dev);
267 	if (err)
268 		goto failed_free;
269 
270 	return dev;
271 
272 failed_free:
273 	free_netdev(dev);
274 failed:
275 	return ERR_PTR(err);
276 }
277 
278 static int ip_tunnel_bind_dev(struct net_device *dev)
279 {
280 	struct net_device *tdev = NULL;
281 	struct ip_tunnel *tunnel = netdev_priv(dev);
282 	const struct iphdr *iph;
283 	int hlen = LL_MAX_HEADER;
284 	int mtu = ETH_DATA_LEN;
285 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
286 
287 	iph = &tunnel->parms.iph;
288 
289 	/* Guess output device to choose reasonable mtu and needed_headroom */
290 	if (iph->daddr) {
291 		struct flowi4 fl4;
292 		struct rtable *rt;
293 
294 		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
295 				    iph->saddr, tunnel->parms.o_key,
296 				    RT_TOS(iph->tos), dev_net(dev),
297 				    tunnel->parms.link, tunnel->fwmark, 0, 0);
298 		rt = ip_route_output_key(tunnel->net, &fl4);
299 
300 		if (!IS_ERR(rt)) {
301 			tdev = rt->dst.dev;
302 			ip_rt_put(rt);
303 		}
304 		if (dev->type != ARPHRD_ETHER)
305 			dev->flags |= IFF_POINTOPOINT;
306 
307 		dst_cache_reset(&tunnel->dst_cache);
308 	}
309 
310 	if (!tdev && tunnel->parms.link)
311 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
312 
313 	if (tdev) {
314 		hlen = tdev->hard_header_len + tdev->needed_headroom;
315 		mtu = min(tdev->mtu, IP_MAX_MTU);
316 	}
317 
318 	dev->needed_headroom = t_hlen + hlen;
319 	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
320 
321 	if (mtu < IPV4_MIN_MTU)
322 		mtu = IPV4_MIN_MTU;
323 
324 	return mtu;
325 }
326 
327 static struct ip_tunnel *ip_tunnel_create(struct net *net,
328 					  struct ip_tunnel_net *itn,
329 					  struct ip_tunnel_parm_kern *parms)
330 {
331 	struct ip_tunnel *nt;
332 	struct net_device *dev;
333 	int t_hlen;
334 	int mtu;
335 	int err;
336 
337 	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
338 	if (IS_ERR(dev))
339 		return ERR_CAST(dev);
340 
341 	mtu = ip_tunnel_bind_dev(dev);
342 	err = dev_set_mtu(dev, mtu);
343 	if (err)
344 		goto err_dev_set_mtu;
345 
346 	nt = netdev_priv(dev);
347 	t_hlen = nt->hlen + sizeof(struct iphdr);
348 	dev->min_mtu = ETH_MIN_MTU;
349 	dev->max_mtu = IP_MAX_MTU - t_hlen;
350 	if (dev->type == ARPHRD_ETHER)
351 		dev->max_mtu -= dev->hard_header_len;
352 
353 	ip_tunnel_add(itn, nt);
354 	return nt;
355 
356 err_dev_set_mtu:
357 	unregister_netdevice(dev);
358 	return ERR_PTR(err);
359 }
360 
361 void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
362 {
363 	const struct iphdr *iph = ip_hdr(skb);
364 	const struct udphdr *udph;
365 
366 	if (iph->protocol != IPPROTO_UDP)
367 		return;
368 
369 	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
370 	info->encap.sport = udph->source;
371 	info->encap.dport = udph->dest;
372 }
373 EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
374 
/* Common receive path for a decapsulated tunnel packet.
 *
 * @tunnel:        tunnel the packet was matched to
 * @skb:           packet; consumed on the drop path via kfree_skb(),
 *                 otherwise handed to gro_cells_receive()
 * @tpi:           flags and sequence number parsed from the tunnel header
 * @tun_dst:       optional metadata dst; attached to @skb on success and
 *                 released on the drop path
 * @log_ecn_error: log (rate limited) when ECN decapsulation reports an
 *                 error
 *
 * Validates the checksum and sequence flags against the tunnel
 * configuration, repoints the network header at the inner header, applies
 * ECN decapsulation, updates the device statistics and passes the packet
 * up through the tunnel's GRO cells.
 *
 * Always returns 0, including when the packet is dropped.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int nh, err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		DEV_STATS_INC(tunnel->dev, multicast);
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* The packet's checksum flag must agree with the tunnel's setting. */
	if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
	    test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}

	/* When sequencing is enabled, drop packets without a sequence number
	 * and, once an expected number is recorded, packets that are behind
	 * it (compared as a signed 32-bit difference).
	 */
	if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
		if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	/* Save offset of outer header relative to skb->head,
	 * because we are going to reset the network header to the inner header
	 * and might change skb->head.
	 */
	nh = skb_network_header(skb) - skb->head;

	/* For Ethernet tunnels the inner network header follows the inner
	 * Ethernet header.
	 */
	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);

	if (!pskb_inet_may_pull(skb)) {
		DEV_STATS_INC(tunnel->dev, rx_length_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}
	/* Re-derive the outer header pointer from the saved offset. */
	iph = (struct iphdr *)(skb->head + nh);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		/* Only results above 1 are treated as fatal for the packet. */
		if (err > 1) {
			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	/* Request a cross-netns scrub when the tunnel's link netns differs
	 * from the netns of the tunnel device.
	 */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
456 
457 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
458 			    unsigned int num)
459 {
460 	if (num >= MAX_IPTUN_ENCAP_OPS)
461 		return -ERANGE;
462 
463 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
464 			&iptun_encaps[num],
465 			NULL, ops) ? 0 : -1;
466 }
467 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
468 
469 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
470 			    unsigned int num)
471 {
472 	int ret;
473 
474 	if (num >= MAX_IPTUN_ENCAP_OPS)
475 		return -ERANGE;
476 
477 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
478 		       &iptun_encaps[num],
479 		       ops, NULL) == ops) ? 0 : -1;
480 
481 	synchronize_net();
482 
483 	return ret;
484 }
485 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
486 
487 int ip_tunnel_encap_setup(struct ip_tunnel *t,
488 			  struct ip_tunnel_encap *ipencap)
489 {
490 	int hlen;
491 
492 	memset(&t->encap, 0, sizeof(t->encap));
493 
494 	hlen = ip_encap_hlen(ipencap);
495 	if (hlen < 0)
496 		return hlen;
497 
498 	t->encap.type = ipencap->type;
499 	t->encap.sport = ipencap->sport;
500 	t->encap.dport = ipencap->dport;
501 	t->encap.flags = ipencap->flags;
502 
503 	t->encap_hlen = hlen;
504 	t->hlen = t->encap_hlen + t->tun_hlen;
505 
506 	return 0;
507 }
508 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
509 
/* Apply path-MTU handling for a packet about to be sent through a tunnel.
 *
 * @dev:         tunnel device
 * @skb:         packet being transmitted
 * @rt:          route selected for the outer packet
 * @df:          DF setting of the outer header; non-zero means the
 *               outer packet must not be fragmented
 * @inner_iph:   inner IPv4 header (only read for IPv4 payloads)
 * @tunnel_hlen: tunnel header length; only honoured when @md is true,
 *               otherwise tunnel->hlen is used
 * @dst:         outer destination; only used when @md is true
 * @md:          true when called from the metadata (collect_md) path
 *
 * Computes the MTU available to the inner packet, propagates it to the
 * packet's dst when that dst is valid, and sends an ICMP / ICMPv6 "too big"
 * error when a non-GSO packet does not fit.
 *
 * Returns 0 if transmission may continue, or -E2BIG after an error has
 * been sent to the originator.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	/* Size of the inner packet: strip the tunnel header and, for Ethernet
	 * tunnel devices, the link-layer header.
	 */
	pkt_size = skb->len - tunnel_hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		/* Outer packet cannot be fragmented: the inner MTU is the
		 * route MTU minus all encapsulation overhead.
		 */
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		/* Inner IPv4 packet with DF set that does not fit. */
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		/* Lower the MTU metric of the inner IPv6 route, but only for a
		 * unicast tunnel destination or a host (/128) route.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
568 
569 static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
570 {
571 	/* we must cap headroom to some upperlimit, else pskb_expand_head
572 	 * will overflow header offsets in skb_headers_offset_update().
573 	 */
574 	static const unsigned int max_allowed = 512;
575 
576 	if (headroom > max_allowed)
577 		headroom = max_allowed;
578 
579 	if (headroom > READ_ONCE(dev->needed_headroom))
580 		WRITE_ONCE(dev->needed_headroom, headroom);
581 }
582 
/* Transmit @skb through tunnel device @dev using the per-packet tunnel
 * metadata attached to the skb (collect_md mode).
 *
 * @skb:         packet to send; freed with kfree_skb() on every error path
 *               and otherwise passed to iptunnel_xmit()
 * @dev:         tunnel device
 * @proto:       outer IP protocol number; may be rewritten by
 *               ip_tunnel_encap()
 * @tunnel_hlen: tunnel header length; when 0 it is derived from the
 *               encapsulation settings in the tunnel metadata
 *
 * Errors are accounted in the device statistics: tx_errors (plus
 * tx_carrier_errors for a failed route lookup and collisions when the
 * route points back at @dev), or tx_dropped when headroom cannot be
 * made available.
 */
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	/* Usable IPv4 transmit metadata is mandatory on this path. */
	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	/* A TOS of 1 means "inherit from the inner header". */
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    dev_net(dev), 0, skb->mark, skb_get_hash(skb),
			    key->flow_flags);

	if (!tunnel_hlen)
		tunnel_hlen = ip_encap_hlen(&tun_info->encap);

	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
		goto tx_error;

	/* Try the per-metadata dst cache before doing a full route lookup. */
	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	/* The outer route must not lead back into this tunnel device. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	/* A TTL of 0 means "inherit": from the inner header for IP payloads,
	 * otherwise from the route's hop limit.
	 */
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (skb_cow_head(skb, headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}

	ip_tunnel_adj_headroom(dev, headroom);

	/* NOTE(review): iptunnel_xmit() presumably takes over both the skb and
	 * the route reference; neither is released here afterwards.
	 */
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	goto kfree;
tx_dropped:
	DEV_STATS_INC(dev, tx_dropped);
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
680 
/* Transmit @skb through tunnel device @dev using the tunnel's configured
 * outer header template.
 *
 * @skb:        packet to send; freed with kfree_skb() on every error path
 *              and otherwise passed to iptunnel_xmit()
 * @dev:        tunnel device
 * @tnl_params: outer IPv4 header template (addresses, TOS, TTL, frag_off)
 * @protocol:   outer IP protocol number; may be rewritten by
 *              ip_tunnel_encap()
 *
 * A zero destination in @tnl_params selects NBMA mode, where the outer
 * destination is taken from tunnel metadata on the skb, from the next hop
 * of the inner IPv4 route, or from an IPv4-compatible IPv6 neighbour
 * address.
 *
 * "connected" tracks whether the outer route is determined by the tunnel
 * configuration (or metadata) alone, and therefore whether a dst cache
 * may be used for it.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;		/* Route to the other host */
	__be16 payload_protocol;
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);
	payload_protocol = skb_protocol(skb, true);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			DEV_STATS_INC(dev, tx_fifo_errors);
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			/* Destination supplied by tunnel metadata. */
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (payload_protocol == htons(ETH_P_IP)) {
			/* Use the next hop of the inner packet's route. */
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			/* Fall back to the packet's own destination when the
			 * neighbour key is the unspecified address.
			 */
			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only an IPv4-compatible address yields an outer
			 * destination: its last 32 bits.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			/* Drop the neighbour reference before any jump. */
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	/* Bit 0 of the configured TOS requests inheriting the TOS from the
	 * inner header; the route then depends on the packet, so the tunnel
	 * is no longer treated as connected.
	 */
	if (tos & 0x1) {
		tos &= ~0x1;
		if (payload_protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (payload_protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos),
			    dev_net(dev), READ_ONCE(tunnel->parms.link),
			    tunnel->fwmark, skb_get_hash(skb), 0);

	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
		goto tx_error;

	/* Pick the dst cache: the metadata's cache in md mode, the tunnel's
	 * own cache for a connected tunnel, none otherwise.  This also
	 * discards any rt taken from the inner packet in the NBMA branch,
	 * because connected is false there.
	 */
	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						&fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	/* The outer route must not lead back into this tunnel device. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	/* Outer DF: the configured value, plus the inner packet's DF bit for
	 * IPv4 payloads unless the tunnel ignores it.
	 */
	df = tnl_params->frag_off;
	if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* While recorded tunnel errors are still recent, report a link
	 * failure for this packet and consume one error count; once they
	 * are older than IPTUNNEL_ERR_TIMEO, forget them.
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	/* A TTL of 0 means "inherit": from the inner header for IP payloads,
	 * otherwise from the route's hop limit.
	 */
	if (ttl == 0) {
		if (payload_protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);

	if (skb_cow_head(skb, max_headroom)) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, tx_dropped);
		kfree_skb(skb);
		return;
	}

	ip_tunnel_adj_headroom(dev, max_headroom);

	/* NOTE(review): iptunnel_xmit() presumably takes over both the skb and
	 * the route reference; neither is released here afterwards.
	 */
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
870 
/* Apply the new parameters @p to the existing tunnel @t on device @dev.
 *
 * @itn:     per-netns tunnel state holding the hash table
 * @t:       tunnel to update
 * @dev:     net_device of the tunnel
 * @p:       new parameters
 * @set_mtu: when the link or fwmark changes, also store the re-evaluated
 *           MTU in dev->mtu
 * @fwmark:  new firewall mark for the tunnel
 *
 * The tunnel's dst cache is always reset and a netdev state change is
 * always signalled.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm_kern *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	/* Addresses and input key determine the hash bucket, so the tunnel is
	 * unhashed before they change and re-hashed afterwards.
	 */
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	/* Non-Ethernet tunnel devices use the tunnel endpoints as their
	 * hardware and broadcast addresses.
	 */
	if (dev->type != ARPHRD_ETHER) {
		__dev_addr_set(dev, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	/* A different link or fwmark can change the underlying device, so the
	 * headroom and MTU are re-evaluated.
	 */
	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		WRITE_ONCE(t->parms.link, p->link);
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			WRITE_ONCE(dev->mtu, mtu);
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
905 
/* Handle the tunnel configuration ioctl commands for @dev.
 *
 * @dev: device the ioctl was issued on
 * @p:   tunnel parameters; input for all commands, overwritten with the
 *       selected tunnel's parameters for SIOCGETTUNNEL.  SIOCADDTUNNEL and
 *       SIOCCHGTUNNEL may normalise it in place (DF bit, unused keys).
 * @cmd: SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL or SIOCDELTUNNEL
 *
 * Returns 0 on success or a negative errno:
 *   -EPERM   caller lacks CAP_NET_ADMIN in the tunnel's user namespace
 *            (add/change/delete), or tried to delete the fallback tunnel
 *   -EEXIST  add: a matching tunnel already exists; change: the new
 *            parameters match a tunnel on a different device
 *   -EINVAL  unknown command, or a change that would alter the
 *            point-to-point / broadcast nature of the device
 *   -ENOENT  no tunnel matched (change via the fallback device, delete)
 *   other    error from ip_tunnel_create()
 */
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
		  int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device, @p selects which tunnel to report;
		 * without a match the fallback tunnel itself is reported.
		 */
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A fixed TTL forces DF on the outer header. */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* For non-VTI tunnels, keys are zeroed unless the key flag is set. */
		if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) {
			if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags))
				p->i_key = 0;
			if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		/* Only SIOCCHGTUNNEL reaches this point. */
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				/* The new parameters already belong to another
				 * device's tunnel.
				 */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				/* Derive the flags the new destination implies
				 * and refuse a change that would flip them.
				 */
				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				/* No conflict: update this device's own tunnel. */
				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			/* Note: the fwmark is reset to 0 on this path. */
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		/* On the fallback device, @p selects the tunnel to delete; the
		 * fallback tunnel itself cannot be deleted.
		 */
		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
1008 
1009 bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
1010 			      const void __user *data)
1011 {
1012 	struct ip_tunnel_parm p;
1013 
1014 	if (copy_from_user(&p, data, sizeof(p)))
1015 		return false;
1016 
1017 	strscpy(kp->name, p.name);
1018 	kp->link = p.link;
1019 	ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags);
1020 	ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags);
1021 	kp->i_key = p.i_key;
1022 	kp->o_key = p.o_key;
1023 	memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph)));
1024 
1025 	return true;
1026 }
1027 EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user);
1028 
1029 bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp)
1030 {
1031 	struct ip_tunnel_parm p;
1032 
1033 	if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) ||
1034 	    !ip_tunnel_flags_is_be16_compat(kp->o_flags))
1035 		return false;
1036 
1037 	memset(&p, 0, sizeof(p));
1038 
1039 	strscpy(p.name, kp->name);
1040 	p.link = kp->link;
1041 	p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags);
1042 	p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags);
1043 	p.i_key = kp->i_key;
1044 	p.o_key = kp->o_key;
1045 	memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph)));
1046 
1047 	return !copy_to_user(data, &p, sizeof(p));
1048 }
1049 EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user);
1050 
1051 int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
1052 			     void __user *data, int cmd)
1053 {
1054 	struct ip_tunnel_parm_kern p;
1055 	int err;
1056 
1057 	if (!ip_tunnel_parm_from_user(&p, data))
1058 		return -EFAULT;
1059 	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
1060 	if (!err && !ip_tunnel_parm_to_user(data, &p))
1061 		return -EFAULT;
1062 	return err;
1063 }
1064 EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
1065 
1066 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
1067 {
1068 	struct ip_tunnel *tunnel = netdev_priv(dev);
1069 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
1070 	int max_mtu = IP_MAX_MTU - t_hlen;
1071 
1072 	if (dev->type == ARPHRD_ETHER)
1073 		max_mtu -= dev->hard_header_len;
1074 
1075 	if (new_mtu < ETH_MIN_MTU)
1076 		return -EINVAL;
1077 
1078 	if (new_mtu > max_mtu) {
1079 		if (strict)
1080 			return -EINVAL;
1081 
1082 		new_mtu = max_mtu;
1083 	}
1084 
1085 	WRITE_ONCE(dev->mtu, new_mtu);
1086 	return 0;
1087 }
1088 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
1089 
1090 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1091 {
1092 	return __ip_tunnel_change_mtu(dev, new_mtu, true);
1093 }
1094 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
1095 
1096 static void ip_tunnel_dev_free(struct net_device *dev)
1097 {
1098 	struct ip_tunnel *tunnel = netdev_priv(dev);
1099 
1100 	gro_cells_destroy(&tunnel->gro_cells);
1101 	dst_cache_destroy(&tunnel->dst_cache);
1102 }
1103 
1104 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1105 {
1106 	struct ip_tunnel *tunnel = netdev_priv(dev);
1107 	struct ip_tunnel_net *itn;
1108 
1109 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1110 
1111 	if (itn->fb_tunnel_dev != dev) {
1112 		ip_tunnel_del(itn, netdev_priv(dev));
1113 		unregister_netdevice_queue(dev, head);
1114 	}
1115 }
1116 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1117 
1118 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1119 {
1120 	struct ip_tunnel *tunnel = netdev_priv(dev);
1121 
1122 	return READ_ONCE(tunnel->net);
1123 }
1124 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1125 
1126 int ip_tunnel_get_iflink(const struct net_device *dev)
1127 {
1128 	const struct ip_tunnel *tunnel = netdev_priv(dev);
1129 
1130 	return READ_ONCE(tunnel->parms.link);
1131 }
1132 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1133 
1134 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1135 				  struct rtnl_link_ops *ops, char *devname)
1136 {
1137 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1138 	struct ip_tunnel_parm_kern parms;
1139 	unsigned int i;
1140 
1141 	itn->rtnl_link_ops = ops;
1142 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1143 		INIT_HLIST_HEAD(&itn->tunnels[i]);
1144 
1145 	if (!ops || !net_has_fallback_tunnels(net)) {
1146 		struct ip_tunnel_net *it_init_net;
1147 
1148 		it_init_net = net_generic(&init_net, ip_tnl_net_id);
1149 		itn->type = it_init_net->type;
1150 		itn->fb_tunnel_dev = NULL;
1151 		return 0;
1152 	}
1153 
1154 	memset(&parms, 0, sizeof(parms));
1155 	if (devname)
1156 		strscpy(parms.name, devname, IFNAMSIZ);
1157 
1158 	rtnl_lock();
1159 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1160 	/* FB netdevice is special: we have one, and only one per netns.
1161 	 * Allowing to move it to another netns is clearly unsafe.
1162 	 */
1163 	if (!IS_ERR(itn->fb_tunnel_dev)) {
1164 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1165 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1166 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1167 		itn->type = itn->fb_tunnel_dev->type;
1168 	}
1169 	rtnl_unlock();
1170 
1171 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1172 }
1173 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1174 
1175 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1176 			      struct list_head *head,
1177 			      struct rtnl_link_ops *ops)
1178 {
1179 	struct net_device *dev, *aux;
1180 	int h;
1181 
1182 	for_each_netdev_safe(net, dev, aux)
1183 		if (dev->rtnl_link_ops == ops)
1184 			unregister_netdevice_queue(dev, head);
1185 
1186 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1187 		struct ip_tunnel *t;
1188 		struct hlist_node *n;
1189 		struct hlist_head *thead = &itn->tunnels[h];
1190 
1191 		hlist_for_each_entry_safe(t, n, thead, hash_node)
1192 			/* If dev is in the same netns, it has already
1193 			 * been added to the list by the previous loop.
1194 			 */
1195 			if (!net_eq(dev_net(t->dev), net))
1196 				unregister_netdevice_queue(t->dev, head);
1197 	}
1198 }
1199 
1200 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1201 			   struct rtnl_link_ops *ops,
1202 			   struct list_head *dev_to_kill)
1203 {
1204 	struct ip_tunnel_net *itn;
1205 	struct net *net;
1206 
1207 	ASSERT_RTNL();
1208 	list_for_each_entry(net, net_list, exit_list) {
1209 		itn = net_generic(net, id);
1210 		ip_tunnel_destroy(net, itn, dev_to_kill, ops);
1211 	}
1212 }
1213 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1214 
1215 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1216 		      struct ip_tunnel_parm_kern *p, __u32 fwmark)
1217 {
1218 	struct ip_tunnel *nt;
1219 	struct net *net = dev_net(dev);
1220 	struct ip_tunnel_net *itn;
1221 	int mtu;
1222 	int err;
1223 
1224 	nt = netdev_priv(dev);
1225 	itn = net_generic(net, nt->ip_tnl_net_id);
1226 
1227 	if (nt->collect_md) {
1228 		if (rtnl_dereference(itn->collect_md_tun))
1229 			return -EEXIST;
1230 	} else {
1231 		if (ip_tunnel_find(itn, p, dev->type))
1232 			return -EEXIST;
1233 	}
1234 
1235 	nt->net = net;
1236 	nt->parms = *p;
1237 	nt->fwmark = fwmark;
1238 	err = register_netdevice(dev);
1239 	if (err)
1240 		goto err_register_netdevice;
1241 
1242 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1243 		eth_hw_addr_random(dev);
1244 
1245 	mtu = ip_tunnel_bind_dev(dev);
1246 	if (tb[IFLA_MTU]) {
1247 		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
1248 
1249 		if (dev->type == ARPHRD_ETHER)
1250 			max -= dev->hard_header_len;
1251 
1252 		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
1253 	}
1254 
1255 	err = dev_set_mtu(dev, mtu);
1256 	if (err)
1257 		goto err_dev_set_mtu;
1258 
1259 	ip_tunnel_add(itn, nt);
1260 	return 0;
1261 
1262 err_dev_set_mtu:
1263 	unregister_netdevice(dev);
1264 err_register_netdevice:
1265 	return err;
1266 }
1267 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1268 
1269 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1270 			 struct ip_tunnel_parm_kern *p, __u32 fwmark)
1271 {
1272 	struct ip_tunnel *t;
1273 	struct ip_tunnel *tunnel = netdev_priv(dev);
1274 	struct net *net = tunnel->net;
1275 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1276 
1277 	if (dev == itn->fb_tunnel_dev)
1278 		return -EINVAL;
1279 
1280 	t = ip_tunnel_find(itn, p, dev->type);
1281 
1282 	if (t) {
1283 		if (t->dev != dev)
1284 			return -EEXIST;
1285 	} else {
1286 		t = tunnel;
1287 
1288 		if (dev->type != ARPHRD_ETHER) {
1289 			unsigned int nflags = 0;
1290 
1291 			if (ipv4_is_multicast(p->iph.daddr))
1292 				nflags = IFF_BROADCAST;
1293 			else if (p->iph.daddr)
1294 				nflags = IFF_POINTOPOINT;
1295 
1296 			if ((dev->flags ^ nflags) &
1297 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1298 				return -EINVAL;
1299 		}
1300 	}
1301 
1302 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1303 	return 0;
1304 }
1305 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1306 
1307 int ip_tunnel_init(struct net_device *dev)
1308 {
1309 	struct ip_tunnel *tunnel = netdev_priv(dev);
1310 	struct iphdr *iph = &tunnel->parms.iph;
1311 	int err;
1312 
1313 	dev->needs_free_netdev = true;
1314 	dev->priv_destructor = ip_tunnel_dev_free;
1315 	dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
1316 
1317 	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1318 	if (err)
1319 		return err;
1320 
1321 	err = gro_cells_init(&tunnel->gro_cells, dev);
1322 	if (err) {
1323 		dst_cache_destroy(&tunnel->dst_cache);
1324 		return err;
1325 	}
1326 
1327 	tunnel->dev = dev;
1328 	tunnel->net = dev_net(dev);
1329 	strscpy(tunnel->parms.name, dev->name);
1330 	iph->version		= 4;
1331 	iph->ihl		= 5;
1332 
1333 	if (tunnel->collect_md)
1334 		netif_keep_dst(dev);
1335 	netdev_lockdep_set_classes(dev);
1336 	return 0;
1337 }
1338 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1339 
1340 void ip_tunnel_uninit(struct net_device *dev)
1341 {
1342 	struct ip_tunnel *tunnel = netdev_priv(dev);
1343 	struct net *net = tunnel->net;
1344 	struct ip_tunnel_net *itn;
1345 
1346 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1347 	ip_tunnel_del(itn, netdev_priv(dev));
1348 	if (itn->fb_tunnel_dev == dev)
1349 		WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1350 
1351 	dst_cache_reset(&tunnel->dst_cache);
1352 }
1353 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1354 
1355 /* Do least required initialization, rest of init is done in tunnel_init call */
1356 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1357 {
1358 	struct ip_tunnel *tunnel = netdev_priv(dev);
1359 	tunnel->ip_tnl_net_id = net_id;
1360 }
1361 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1362 
1363 MODULE_DESCRIPTION("IPv4 tunnel implementation library");
1364 MODULE_LICENSE("GPL");
1365