xref: /linux/net/ipv6/ip6_output.c (revision ebf68996de0ab250c5d520eb2291ab65643e9a1e)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

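/* Final transmit step: resolve (or create) the neighbour entry for the
 * route's nexthop and hand the skb to the link layer. Also loops multicast
 * back to local listeners and honours lwtunnel output redirection.
 */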
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

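/* Entry point from dst_output(): run the NF_INET_POST_ROUTING hook unless
 * netfilter has already rerouted this packet, then finish output,
 * fragmenting if necessary.
 */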
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

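/* Effective automatic flow label setting for this socket: the per-socket
 * value when one was set via IPV6_AUTOFLOWLABEL, otherwise the
 * per-namespace default.
 */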
bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

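/* Typical use (sketch only): a transport such as TCP builds its header in
 * @skb, fills in a flowi6 and calls
 *
 *	res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
 *
 * where @opt is the socket's cached ipv6_txoptions; see
 * tcp_v6_send_synack() for a real caller.
 */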
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the skb may
 * still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

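/* Deliver a packet carrying a Router Alert option to every raw socket that
 * registered for this alert value via IPV6_ROUTER_ALERT. Returns 1 if the
 * skb was consumed by at least one socket, 0 otherwise.
 */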
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Unicast neighbour discovery messages destined
			 * to the proxied address must be passed to the
			 * input function so that the proxy can react.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

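/* Decide whether a forwarded packet exceeds the path MTU, honouring the
 * conntrack defrag hint (frag_max_size), ignore_df and GSO.
 */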
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on router alert packets;
	 *	we push them to user level AS IS, without any warranty
	 *	that the application will be able to interpret them.
	 *	The reason is that we cannot do anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything either. Defragmentation
	 *	would also be a mistake: RA packets cannot be fragmented,
	 *	because there is no warranty that different fragments
	 *	will travel along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

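/* Split an oversized packet into fragments and hand each one to @output.
 * The fast path reuses an existing frag_list whose geometry already fits
 * the fragment size; otherwise the slow path allocates new skbs and copies
 * the payload into them.
 */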
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len, nexthdr_offset;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down.
			 */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb_mark_not_on_list(skb);
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0)	{
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		 * then align the next start on an eight byte boundary
		 */
		if (len < left)	{
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

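/* Nonzero when the cached route can no longer be trusted for @fl_addr:
 * the route key is not a matching host route and the address also differs
 * from the destination last cached on the socket (@addr_cache).
 */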
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected case is not
	 * very simple. Take into account that we do not support routing
	 * by source, TOS, and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route, check that
	 *    the cached destination is current. If it is a network route,
	 *    we still may check its validity using the saved pointer to
	 *    the last used address: daddr_cache. We do not want to save
	 *    the whole address now (because the main consumer of this
	 *    service is TCP, which does not have this problem), so this
	 *    last trick works only on connected sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

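/* Common tail of the dst lookup helpers: resolve a source address when the
 * flow does not have one yet, perform the actual route lookup, and fall
 * back to the default router when the chosen source address is still
 * undergoing optimistic DAD.
 */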
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour entry that is
	 * in the INCOMPLETE state and the source address from the flow is
	 * marked as OPTIMISTIC, we release the found dst entry and replace
	 * it with the dst entry of the nexthop router.
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
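/* Typical call pattern (sketch only): the result encodes errors, so
 * callers test with IS_ERR() rather than for NULL:
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */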

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

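/* Extension headers encode their length in 8-octet units, not counting the
 * first 8 octets; hence the (hdrlen + 1) * 8 byte duplications below.
 */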
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first one; the header
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above --miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

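/* Workhorse behind ip6_append_data() and ip6_make_skb(): append @length
 * bytes obtained through @getfrag to @queue, splitting the payload across
 * fragment-sized skbs as needed, and taking care of zerocopy, timestamping
 * and socket memory accounting.
 */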
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * in the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = !skb;	/* only extra ref if !MSG_MORE */
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment; the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

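/**
 *	ip6_append_data - append data to the pending send queue of a socket
 *	@sk: socket to send on
 *	@getfrag: callback that copies len bytes of payload into the skb
 *	@from: opaque cookie passed through to @getfrag
 *	@length: number of payload bytes to append
 *	@transhdrlen: length of the transport header (first call of a cork only)
 *	@ipc6: ancillary data (hop limit, traffic class, options, ...)
 *	@fl6: flow to send on
 *	@rt: route to use (only consulted when a new cork is started)
 *	@flags: MSG_* flags (MSG_MORE, MSG_DONTWAIT, MSG_PROBE, ...)
 *
 *	Sets up the cork on the first call and delegates the real work to
 *	__ip6_append_data(); a later ip6_push_pending_frames() collapses the
 *	queue into a single packet and sends it.
 */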
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

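/* Collapse the queued skbs into a single packet (head skb plus frag_list),
 * push the extension headers and the IPv6 header from the cork, attach the
 * cached route and release the cork.
 */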
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

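/* One-shot variant of the append/push cycle: cork onto a private queue,
 * append the payload and return the finished skb (or an ERR_PTR). Used by
 * datagram paths such as UDP when no corking across calls is needed.
 */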
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}