xref: /linux/net/ipv6/ip6_output.c (revision f412eed9dfdeeb6becd7de2ffe8b5d0a8b3f81ca)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61 
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64 	struct dst_entry *dst = skb_dst(skb);
65 	struct net_device *dev = dst->dev;
66 	struct neighbour *neigh;
67 	struct in6_addr *nexthop;
68 	int ret;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_is_socket(net, skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					net, sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(net, idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97 
98 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99 		    IPV6_ADDR_SCOPE_NODELOCAL &&
100 		    !(dev->flags & IFF_LOOPBACK)) {
101 			kfree_skb(skb);
102 			return 0;
103 		}
104 	}
105 
106 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107 		int res = lwtunnel_xmit(skb);
108 
109 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110 			return res;
111 	}
112 
113 	rcu_read_lock_bh();
114 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116 	if (unlikely(!neigh))
117 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118 	if (!IS_ERR(neigh)) {
119 		sock_confirm_neigh(skb, neigh);
120 		ret = neigh_output(neigh, skb);
121 		rcu_read_unlock_bh();
122 		return ret;
123 	}
124 	rcu_read_unlock_bh();
125 
126 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127 	kfree_skb(skb);
128 	return -EINVAL;
129 }
130 
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133 	int ret;
134 
135 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136 	if (ret) {
137 		kfree_skb(skb);
138 		return ret;
139 	}
140 
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142 	/* Policy lookup after SNAT yielded a new policy */
143 	if (skb_dst(skb)->xfrm) {
144 		IPCB(skb)->flags |= IPSKB_REROUTED;
145 		return dst_output(net, sk, skb);
146 	}
147 #endif
148 
149 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150 	    dst_allfrag(skb_dst(skb)) ||
151 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
153 	else
154 		return ip6_finish_output2(net, sk, skb);
155 }
156 
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159 	struct net_device *dev = skb_dst(skb)->dev;
160 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 
162 	skb->protocol = htons(ETH_P_IPV6);
163 	skb->dev = dev;
164 
165 	if (unlikely(idev->cnf.disable_ipv6)) {
166 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167 		kfree_skb(skb);
168 		return 0;
169 	}
170 
171 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172 			    net, sk, skb, NULL, dev,
173 			    ip6_finish_output,
174 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176 
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179 	if (!np->autoflowlabel_set)
180 		return ip6_default_np_autolabel(net);
181 	else
182 		return np->autoflowlabel;
183 }
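/* Note: ip6_autoflowlabel() is the per-socket override check.  If the
 * application never set IPV6_AUTOFLOWLABEL, the per-namespace default
 * from ip6_default_np_autolabel() applies, normally driven by the
 * net.ipv6.auto_flowlabels sysctl.
 */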
184 
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note: the socket lock is not held for SYNACK packets, but the socket
188  * might still be modified by calls to skb_set_owner_w() and
189  * ipv6_local_error(), which use proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192 	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194 	struct net *net = sock_net(sk);
195 	const struct ipv6_pinfo *np = inet6_sk(sk);
196 	struct in6_addr *first_hop = &fl6->daddr;
197 	struct dst_entry *dst = skb_dst(skb);
198 	struct ipv6hdr *hdr;
199 	u8  proto = fl6->flowi6_proto;
200 	int seg_len = skb->len;
201 	int hlimit = -1;
202 	u32 mtu;
203 
204 	if (opt) {
205 		unsigned int head_room;
206 
207 		/* First: exthdrs may take lots of space (~8K for now);
208 		   MAX_HEADER is not enough.
209 		 */
210 		head_room = opt->opt_nflen + opt->opt_flen;
211 		seg_len += head_room;
212 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
213 
214 		if (skb_headroom(skb) < head_room) {
215 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
216 			if (!skb2) {
217 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
218 					      IPSTATS_MIB_OUTDISCARDS);
219 				kfree_skb(skb);
220 				return -ENOBUFS;
221 			}
222 			consume_skb(skb);
223 			skb = skb2;
224 			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
225 			 * it is safe to call in our context (socket lock not held)
226 			 */
227 			skb_set_owner_w(skb, (struct sock *)sk);
228 		}
229 		if (opt->opt_flen)
230 			ipv6_push_frag_opts(skb, opt, &proto);
231 		if (opt->opt_nflen)
232 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
233 					     &fl6->saddr);
234 	}
235 
236 	skb_push(skb, sizeof(struct ipv6hdr));
237 	skb_reset_network_header(skb);
238 	hdr = ipv6_hdr(skb);
239 
240 	/*
241 	 *	Fill in the IPv6 header
242 	 */
243 	if (np)
244 		hlimit = np->hop_limit;
245 	if (hlimit < 0)
246 		hlimit = ip6_dst_hoplimit(dst);
247 
248 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
249 				ip6_autoflowlabel(net, np), fl6));
250 
251 	hdr->payload_len = htons(seg_len);
252 	hdr->nexthdr = proto;
253 	hdr->hop_limit = hlimit;
254 
255 	hdr->saddr = fl6->saddr;
256 	hdr->daddr = *first_hop;
257 
258 	skb->protocol = htons(ETH_P_IPV6);
259 	skb->priority = sk->sk_priority;
260 	skb->mark = mark;
261 
262 	mtu = dst_mtu(dst);
263 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
264 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
265 			      IPSTATS_MIB_OUT, skb->len);
266 
267 		/* if the egress device is enslaved to an L3 master device,
268 		 * pass the skb to its handler for processing
269 		 */
270 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
271 		if (unlikely(!skb))
272 			return 0;
273 
274 		/* hooks should never assume the socket lock is held;
275 		 * we promote our socket to non-const.
276 		 */
277 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
278 			       net, (struct sock *)sk, skb, NULL, dst->dev,
279 			       dst_output);
280 	}
281 
282 	skb->dev = dst->dev;
283 	/* ipv6_local_error() does not require the socket lock,
284 	 * so we promote our socket to non-const.
285 	 */
286 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
287 
288 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
289 	kfree_skb(skb);
290 	return -EMSGSIZE;
291 }
292 EXPORT_SYMBOL(ip6_xmit);
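/* Example (illustrative sketch, not part of this file): a minimal
 * ip6_xmit() caller in the style of the connection-oriented protocols,
 * assuming fl6 is already filled in and a dst has been attached to the
 * skb with skb_dst_set().  This mirrors how inet6_csk_xmit() drives
 * TCP output:
 *
 *	struct ipv6_pinfo *np = inet6_sk(sk);
 *	int res;
 *
 *	rcu_read_lock();
 *	res = ip6_xmit(sk, skb, &fl6, sk->sk_mark,
 *		       rcu_dereference(np->opt), np->tclass);
 *	rcu_read_unlock();
 */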
293 
294 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
295 {
296 	struct ip6_ra_chain *ra;
297 	struct sock *last = NULL;
298 
299 	read_lock(&ip6_ra_lock);
300 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
301 		struct sock *sk = ra->sk;
302 		if (sk && ra->sel == sel &&
303 		    (!sk->sk_bound_dev_if ||
304 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
305 			if (last) {
306 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307 				if (skb2)
308 					rawv6_rcv(last, skb2);
309 			}
310 			last = sk;
311 		}
312 	}
313 
314 	if (last) {
315 		rawv6_rcv(last, skb);
316 		read_unlock(&ip6_ra_lock);
317 		return 1;
318 	}
319 	read_unlock(&ip6_ra_lock);
320 	return 0;
321 }
322 
323 static int ip6_forward_proxy_check(struct sk_buff *skb)
324 {
325 	struct ipv6hdr *hdr = ipv6_hdr(skb);
326 	u8 nexthdr = hdr->nexthdr;
327 	__be16 frag_off;
328 	int offset;
329 
330 	if (ipv6_ext_hdr(nexthdr)) {
331 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
332 		if (offset < 0)
333 			return 0;
334 	} else
335 		offset = sizeof(struct ipv6hdr);
336 
337 	if (nexthdr == IPPROTO_ICMPV6) {
338 		struct icmp6hdr *icmp6;
339 
340 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
341 					 offset + 1 - skb->data)))
342 			return 0;
343 
344 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
345 
346 		switch (icmp6->icmp6_type) {
347 		case NDISC_ROUTER_SOLICITATION:
348 		case NDISC_ROUTER_ADVERTISEMENT:
349 		case NDISC_NEIGHBOUR_SOLICITATION:
350 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
351 		case NDISC_REDIRECT:
352 			/* For a unicast neighbor discovery message destined
353 			 * to the proxied address, pass it to the input
354 			 * function.
355 			 */
356 			return 1;
357 		default:
358 			break;
359 		}
360 	}
361 
362 	/*
363 	 * The proxying router can't forward traffic sent to a link-local
364 	 * address, so signal the sender and discard the packet. This
365 	 * behavior is clarified by the MIPv6 specification.
366 	 */
367 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
368 		dst_link_failure(skb);
369 		return -1;
370 	}
371 
372 	return 0;
373 }
374 
375 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
376 				     struct sk_buff *skb)
377 {
378 	struct dst_entry *dst = skb_dst(skb);
379 
380 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
381 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
382 
383 	return dst_output(net, sk, skb);
384 }
385 
386 unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
387 {
388 	unsigned int mtu;
389 	struct inet6_dev *idev;
390 
391 	if (dst_metric_locked(dst, RTAX_MTU)) {
392 		mtu = dst_metric_raw(dst, RTAX_MTU);
393 		if (mtu)
394 			return mtu;
395 	}
396 
397 	mtu = IPV6_MIN_MTU;
398 	rcu_read_lock();
399 	idev = __in6_dev_get(dst->dev);
400 	if (idev)
401 		mtu = idev->cnf.mtu6;
402 	rcu_read_unlock();
403 
404 	return mtu;
405 }
406 EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
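/* Note: on the forwarding path a route with a locked RTAX_MTU metric
 * wins outright; otherwise the egress device's mtu6 (kept in sync with
 * the device MTU by addrconf) is used.  ip6_forward() additionally
 * clamps the result to at least IPV6_MIN_MTU (1280) below.
 */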
407 
408 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
409 {
410 	if (skb->len <= mtu)
411 		return false;
412 
413 	/* ipv6 conntrack defrag sets frag_max_size + ignore_df */
414 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
415 		return true;
416 
417 	if (skb->ignore_df)
418 		return false;
419 
420 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
421 		return false;
422 
423 	return true;
424 }
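/* Note on the ordering above: the frag_max_size test must come before
 * the ignore_df test because conntrack defrag sets both -- a
 * reassembled packet still has to be re-fragmented to its original
 * fragment size.  GSO packets pass as long as
 * skb_gso_validate_network_len() confirms the segments will fit the MTU.
 */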
425 
426 int ip6_forward(struct sk_buff *skb)
427 {
428 	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
429 	struct dst_entry *dst = skb_dst(skb);
430 	struct ipv6hdr *hdr = ipv6_hdr(skb);
431 	struct inet6_skb_parm *opt = IP6CB(skb);
432 	struct net *net = dev_net(dst->dev);
433 	u32 mtu;
434 
435 	if (net->ipv6.devconf_all->forwarding == 0)
436 		goto error;
437 
438 	if (skb->pkt_type != PACKET_HOST)
439 		goto drop;
440 
441 	if (unlikely(skb->sk))
442 		goto drop;
443 
444 	if (skb_warn_if_lro(skb))
445 		goto drop;
446 
447 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
448 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
449 		goto drop;
450 	}
451 
452 	skb_forward_csum(skb);
453 
454 	/*
455 	 *	We do NOT do any processing on RA packets;
456 	 *	we push them to user level AS IS, without any
457 	 *	guarantee that an application will be able to
458 	 *	interpret them. The reason is that we cannot do
459 	 *	anything clever here.
460 	 *
461 	 *	We are not an end node, so if a packet contains
462 	 *	AH/ESP we cannot do anything with it.
463 	 *	Defragmentation would also be a mistake; RA packets
464 	 *	must not be fragmented, because there is no guarantee
465 	 *	that different fragments will go along one path. --ANK
466 	 */
467 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
468 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
469 			return 0;
470 	}
471 
472 	/*
473 	 *	check and decrement ttl
474 	 */
475 	if (hdr->hop_limit <= 1) {
476 		/* Force OUTPUT device used as source address */
477 		skb->dev = dst->dev;
478 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
479 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
480 
481 		kfree_skb(skb);
482 		return -ETIMEDOUT;
483 	}
484 
485 	/* XXX: idev->cnf.proxy_ndp? */
486 	if (net->ipv6.devconf_all->proxy_ndp &&
487 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
488 		int proxied = ip6_forward_proxy_check(skb);
489 		if (proxied > 0)
490 			return ip6_input(skb);
491 		else if (proxied < 0) {
492 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
493 			goto drop;
494 		}
495 	}
496 
497 	if (!xfrm6_route_forward(skb)) {
498 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
499 		goto drop;
500 	}
501 	dst = skb_dst(skb);
502 
503 	/* The IPv6 specs say nothing about it, but it is clear that we
504 	   cannot send redirects to source-routed frames.
505 	   We also don't send redirects to frames decapsulated from IPsec.
506 	 */
507 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
508 		struct in6_addr *target = NULL;
509 		struct inet_peer *peer;
510 		struct rt6_info *rt;
511 
512 		/*
513 		 *	incoming and outgoing devices are the same:
514 		 *	send a redirect.
515 		 */
516 
517 		rt = (struct rt6_info *) dst;
518 		if (rt->rt6i_flags & RTF_GATEWAY)
519 			target = &rt->rt6i_gateway;
520 		else
521 			target = &hdr->daddr;
522 
523 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
524 
525 		/* Limit redirects both by destination (here)
526 		   and by source (inside ndisc_send_redirect)
527 		 */
528 		if (inet_peer_xrlim_allow(peer, 1*HZ))
529 			ndisc_send_redirect(skb, target);
530 		if (peer)
531 			inet_putpeer(peer);
532 	} else {
533 		int addrtype = ipv6_addr_type(&hdr->saddr);
534 
535 		/* This check is security critical. */
536 		if (addrtype == IPV6_ADDR_ANY ||
537 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
538 			goto error;
539 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
540 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
541 				    ICMPV6_NOT_NEIGHBOUR, 0);
542 			goto error;
543 		}
544 	}
545 
546 	mtu = ip6_dst_mtu_forward(dst);
547 	if (mtu < IPV6_MIN_MTU)
548 		mtu = IPV6_MIN_MTU;
549 
550 	if (ip6_pkt_too_big(skb, mtu)) {
551 		/* Again, force OUTPUT device used as source address */
552 		skb->dev = dst->dev;
553 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
554 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
555 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
556 				IPSTATS_MIB_FRAGFAILS);
557 		kfree_skb(skb);
558 		return -EMSGSIZE;
559 	}
560 
561 	if (skb_cow(skb, dst->dev->hard_header_len)) {
562 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
563 				IPSTATS_MIB_OUTDISCARDS);
564 		goto drop;
565 	}
566 
567 	hdr = ipv6_hdr(skb);
568 
569 	/* Mangling the hop count is delayed until after the skb COW */
570 
571 	hdr->hop_limit--;
572 
573 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
574 		       net, NULL, skb, skb->dev, dst->dev,
575 		       ip6_forward_finish);
576 
577 error:
578 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
579 drop:
580 	kfree_skb(skb);
581 	return -EINVAL;
582 }
583 
584 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
585 {
586 	to->pkt_type = from->pkt_type;
587 	to->priority = from->priority;
588 	to->protocol = from->protocol;
589 	skb_dst_drop(to);
590 	skb_dst_set(to, dst_clone(skb_dst(from)));
591 	to->dev = from->dev;
592 	to->mark = from->mark;
593 
594 #ifdef CONFIG_NET_SCHED
595 	to->tc_index = from->tc_index;
596 #endif
597 	nf_copy(to, from);
598 	skb_copy_secmark(to, from);
599 }
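/* ip6_fragment() below has two paths.  The fast path reuses an
 * existing frag_list when the geometry already fits (each element gets
 * the unfragmentable part plus a fragment header pushed in front); the
 * slow path allocates a fresh skb per fragment and copies the payload.
 * Illustrative layout of each emitted fragment, assuming an
 * unfragmentable part of hlen bytes:
 *
 *	| unfrag part (hlen) | frag_hdr | piece of fragmentable part |
 *
 * fh->frag_off carries the 8-byte-aligned offset, and IP6_MF is set on
 * every fragment except the last.
 */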
600 
601 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
602 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
603 {
604 	struct sk_buff *frag;
605 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
606 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
607 				inet6_sk(skb->sk) : NULL;
608 	struct ipv6hdr *tmp_hdr;
609 	struct frag_hdr *fh;
610 	unsigned int mtu, hlen, left, len;
611 	int hroom, troom;
612 	__be32 frag_id;
613 	int ptr, offset = 0, err = 0;
614 	u8 *prevhdr, nexthdr = 0;
615 
616 	err = ip6_find_1stfragopt(skb, &prevhdr);
617 	if (err < 0)
618 		goto fail;
619 	hlen = err;
620 	nexthdr = *prevhdr;
621 
622 	mtu = ip6_skb_dst_mtu(skb);
623 
624 	/* We must not fragment if the socket is set to force MTU discovery
625 	 * or if the skb is not generated by a local socket.
626 	 */
627 	if (unlikely(!skb->ignore_df && skb->len > mtu))
628 		goto fail_toobig;
629 
630 	if (IP6CB(skb)->frag_max_size) {
631 		if (IP6CB(skb)->frag_max_size > mtu)
632 			goto fail_toobig;
633 
634 		/* don't send fragments larger than what we received */
635 		mtu = IP6CB(skb)->frag_max_size;
636 		if (mtu < IPV6_MIN_MTU)
637 			mtu = IPV6_MIN_MTU;
638 	}
639 
640 	if (np && np->frag_size < mtu) {
641 		if (np->frag_size)
642 			mtu = np->frag_size;
643 	}
644 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
645 		goto fail_toobig;
646 	mtu -= hlen + sizeof(struct frag_hdr);
647 
648 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
649 				    &ipv6_hdr(skb)->saddr);
650 
651 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
652 	    (err = skb_checksum_help(skb)))
653 		goto fail;
654 
655 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
656 	if (skb_has_frag_list(skb)) {
657 		unsigned int first_len = skb_pagelen(skb);
658 		struct sk_buff *frag2;
659 
660 		if (first_len - hlen > mtu ||
661 		    ((first_len - hlen) & 7) ||
662 		    skb_cloned(skb) ||
663 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
664 			goto slow_path;
665 
666 		skb_walk_frags(skb, frag) {
667 			/* Correct geometry. */
668 			if (frag->len > mtu ||
669 			    ((frag->len & 7) && frag->next) ||
670 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
671 				goto slow_path_clean;
672 
673 			/* Partially cloned skb? */
674 			if (skb_shared(frag))
675 				goto slow_path_clean;
676 
677 			BUG_ON(frag->sk);
678 			if (skb->sk) {
679 				frag->sk = skb->sk;
680 				frag->destructor = sock_wfree;
681 			}
682 			skb->truesize -= frag->truesize;
683 		}
684 
685 		err = 0;
686 		offset = 0;
687 		/* BUILD HEADER */
688 
689 		*prevhdr = NEXTHDR_FRAGMENT;
690 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
691 		if (!tmp_hdr) {
692 			err = -ENOMEM;
693 			goto fail;
694 		}
695 		frag = skb_shinfo(skb)->frag_list;
696 		skb_frag_list_init(skb);
697 
698 		__skb_pull(skb, hlen);
699 		fh = __skb_push(skb, sizeof(struct frag_hdr));
700 		__skb_push(skb, hlen);
701 		skb_reset_network_header(skb);
702 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
703 
704 		fh->nexthdr = nexthdr;
705 		fh->reserved = 0;
706 		fh->frag_off = htons(IP6_MF);
707 		fh->identification = frag_id;
708 
709 		first_len = skb_pagelen(skb);
710 		skb->data_len = first_len - skb_headlen(skb);
711 		skb->len = first_len;
712 		ipv6_hdr(skb)->payload_len = htons(first_len -
713 						   sizeof(struct ipv6hdr));
714 
715 		for (;;) {
716 			/* Prepare the header of the next fragment
717 			 * before the previous one goes down. */
718 			if (frag) {
719 				frag->ip_summed = CHECKSUM_NONE;
720 				skb_reset_transport_header(frag);
721 				fh = __skb_push(frag, sizeof(struct frag_hdr));
722 				__skb_push(frag, hlen);
723 				skb_reset_network_header(frag);
724 				memcpy(skb_network_header(frag), tmp_hdr,
725 				       hlen);
726 				offset += skb->len - hlen - sizeof(struct frag_hdr);
727 				fh->nexthdr = nexthdr;
728 				fh->reserved = 0;
729 				fh->frag_off = htons(offset);
730 				if (frag->next)
731 					fh->frag_off |= htons(IP6_MF);
732 				fh->identification = frag_id;
733 				ipv6_hdr(frag)->payload_len =
734 						htons(frag->len -
735 						      sizeof(struct ipv6hdr));
736 				ip6_copy_metadata(frag, skb);
737 			}
738 
739 			err = output(net, sk, skb);
740 			if (!err)
741 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
742 					      IPSTATS_MIB_FRAGCREATES);
743 
744 			if (err || !frag)
745 				break;
746 
747 			skb = frag;
748 			frag = skb->next;
749 			skb->next = NULL;
750 		}
751 
752 		kfree(tmp_hdr);
753 
754 		if (err == 0) {
755 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
756 				      IPSTATS_MIB_FRAGOKS);
757 			return 0;
758 		}
759 
760 		kfree_skb_list(frag);
761 
762 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
763 			      IPSTATS_MIB_FRAGFAILS);
764 		return err;
765 
766 slow_path_clean:
767 		skb_walk_frags(skb, frag2) {
768 			if (frag2 == frag)
769 				break;
770 			frag2->sk = NULL;
771 			frag2->destructor = NULL;
772 			skb->truesize += frag2->truesize;
773 		}
774 	}
775 
776 slow_path:
777 	left = skb->len - hlen;		/* Space per frame */
778 	ptr = hlen;			/* Where to start from */
779 
780 	/*
781 	 *	Fragment the datagram.
782 	 */
783 
784 	troom = rt->dst.dev->needed_tailroom;
785 
786 	/*
787 	 *	Keep copying data until we run out.
788 	 */
789 	while (left > 0)	{
790 		u8 *fragnexthdr_offset;
791 
792 		len = left;
793 		/* IF: it doesn't fit, use 'mtu' - the data space left */
794 		if (len > mtu)
795 			len = mtu;
796 		/* IF: we are not sending up to and including the packet end,
797 		   then align the next start on an eight-byte boundary */
798 		if (len < left)	{
799 			len &= ~7;
800 		}
801 
802 		/* Allocate buffer */
803 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
804 				 hroom + troom, GFP_ATOMIC);
805 		if (!frag) {
806 			err = -ENOMEM;
807 			goto fail;
808 		}
809 
810 		/*
811 		 *	Set up data on packet
812 		 */
813 
814 		ip6_copy_metadata(frag, skb);
815 		skb_reserve(frag, hroom);
816 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
817 		skb_reset_network_header(frag);
818 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
819 		frag->transport_header = (frag->network_header + hlen +
820 					  sizeof(struct frag_hdr));
821 
822 		/*
823 		 *	Charge the memory for the fragment to any owner
824 		 *	it might possess
825 		 */
826 		if (skb->sk)
827 			skb_set_owner_w(frag, skb->sk);
828 
829 		/*
830 		 *	Copy the packet header into the new buffer.
831 		 */
832 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
833 
834 		fragnexthdr_offset = skb_network_header(frag);
835 		fragnexthdr_offset += prevhdr - skb_network_header(skb);
836 		*fragnexthdr_offset = NEXTHDR_FRAGMENT;
837 
838 		/*
839 		 *	Build fragment header.
840 		 */
841 		fh->nexthdr = nexthdr;
842 		fh->reserved = 0;
843 		fh->identification = frag_id;
844 
845 		/*
846 		 *	Copy a block of the IP datagram.
847 		 */
848 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
849 				     len));
850 		left -= len;
851 
852 		fh->frag_off = htons(offset);
853 		if (left > 0)
854 			fh->frag_off |= htons(IP6_MF);
855 		ipv6_hdr(frag)->payload_len = htons(frag->len -
856 						    sizeof(struct ipv6hdr));
857 
858 		ptr += len;
859 		offset += len;
860 
861 		/*
862 		 *	Put this fragment into the sending queue.
863 		 */
864 		err = output(net, sk, frag);
865 		if (err)
866 			goto fail;
867 
868 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
869 			      IPSTATS_MIB_FRAGCREATES);
870 	}
871 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
872 		      IPSTATS_MIB_FRAGOKS);
873 	consume_skb(skb);
874 	return err;
875 
876 fail_toobig:
877 	if (skb->sk && dst_allfrag(skb_dst(skb)))
878 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
879 
880 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
881 	err = -EMSGSIZE;
882 
883 fail:
884 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
885 		      IPSTATS_MIB_FRAGFAILS);
886 	kfree_skb(skb);
887 	return err;
888 }
889 
890 static inline int ip6_rt_check(const struct rt6key *rt_key,
891 			       const struct in6_addr *fl_addr,
892 			       const struct in6_addr *addr_cache)
893 {
894 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
895 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
896 }
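/* Note: ip6_rt_check() returns non-zero when the cached route can NOT
 * be validated for fl_addr, i.e. it is neither an exact /128 host
 * route for that address nor confirmed by the socket's cached
 * destination (daddr_cache/saddr_cache).
 */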
897 
898 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
899 					  struct dst_entry *dst,
900 					  const struct flowi6 *fl6)
901 {
902 	struct ipv6_pinfo *np = inet6_sk(sk);
903 	struct rt6_info *rt;
904 
905 	if (!dst)
906 		goto out;
907 
908 	if (dst->ops->family != AF_INET6) {
909 		dst_release(dst);
910 		return NULL;
911 	}
912 
913 	rt = (struct rt6_info *)dst;
914 	/* Yes, checking route validity in the not-connected
915 	 * case is not very simple. Take into account that we
916 	 * do not support routing by source, TOS,
917 	 * or MSG_DONTROUTE		--ANK (980726)
918 	 *
919 	 * 1. ip6_rt_check(): If the route was a host route,
920 	 *    check that the cached destination is current.
921 	 *    If it is a network route, we can still
922 	 *    check its validity using a saved pointer
923 	 *    to the last used address: daddr_cache.
924 	 *    We do not want to save the whole address now
925 	 *    (because the main consumer of this service
926 	 *    is TCP, which does not have this problem),
927 	 *    so this trick works only on connected
928 	 *    sockets.
929 	 * 2. oif should also be the same.
930 	 */
931 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
932 #ifdef CONFIG_IPV6_SUBTREES
933 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
934 #endif
935 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
936 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
937 		dst_release(dst);
938 		dst = NULL;
939 	}
940 
941 out:
942 	return dst;
943 }
944 
945 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
946 			       struct dst_entry **dst, struct flowi6 *fl6)
947 {
948 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
949 	struct neighbour *n;
950 	struct rt6_info *rt;
951 #endif
952 	int err;
953 	int flags = 0;
954 
955 	/* The correct way to handle this would be to do
956 	 * ip6_route_get_saddr, and then ip6_route_output; however,
957 	 * the route-specific preferred source forces the
958 	 * ip6_route_output call _before_ ip6_route_get_saddr.
959 	 *
960 	 * In source specific routing (no src=any default route),
961 	 * ip6_route_output will fail given src=any saddr, though, so
962 	 * that's why we try it again later.
963 	 */
964 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
965 		struct fib6_info *from;
966 		struct rt6_info *rt;
967 		bool had_dst = *dst != NULL;
968 
969 		if (!had_dst)
970 			*dst = ip6_route_output(net, sk, fl6);
971 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
972 
973 		rcu_read_lock();
974 		from = rt ? rcu_dereference(rt->from) : NULL;
975 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
976 					  sk ? inet6_sk(sk)->srcprefs : 0,
977 					  &fl6->saddr);
978 		rcu_read_unlock();
979 
980 		if (err)
981 			goto out_err_release;
982 
983 		/* If we had an erroneous initial result, pretend it
984 		 * never existed and let the SA-enabled version take
985 		 * over.
986 		 */
987 		if (!had_dst && (*dst)->error) {
988 			dst_release(*dst);
989 			*dst = NULL;
990 		}
991 
992 		if (fl6->flowi6_oif)
993 			flags |= RT6_LOOKUP_F_IFACE;
994 	}
995 
996 	if (!*dst)
997 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
998 
999 	err = (*dst)->error;
1000 	if (err)
1001 		goto out_err_release;
1002 
1003 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1004 	/*
1005 	 * Here, if the dst entry we've looked up
1006 	 * has a neighbour entry that is in the INCOMPLETE
1007 	 * state and the src address from the flow is
1008 	 * marked as OPTIMISTIC, we release the found
1009 	 * dst entry and replace it with the dst entry
1010 	 * of the nexthop router.
1011 	 */
1012 	rt = (struct rt6_info *) *dst;
1013 	rcu_read_lock_bh();
1014 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1015 				      rt6_nexthop(rt, &fl6->daddr));
1016 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1017 	rcu_read_unlock_bh();
1018 
1019 	if (err) {
1020 		struct inet6_ifaddr *ifp;
1021 		struct flowi6 fl_gw6;
1022 		int redirect;
1023 
1024 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1025 				      (*dst)->dev, 1);
1026 
1027 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1028 		if (ifp)
1029 			in6_ifa_put(ifp);
1030 
1031 		if (redirect) {
1032 			/*
1033 			 * We need to get the dst entry for the
1034 			 * default router instead
1035 			 */
1036 			dst_release(*dst);
1037 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1038 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1039 			*dst = ip6_route_output(net, sk, &fl_gw6);
1040 			err = (*dst)->error;
1041 			if (err)
1042 				goto out_err_release;
1043 		}
1044 	}
1045 #endif
1046 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1047 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1048 		err = -EAFNOSUPPORT;
1049 		goto out_err_release;
1050 	}
1051 
1052 	return 0;
1053 
1054 out_err_release:
1055 	dst_release(*dst);
1056 	*dst = NULL;
1057 
1058 	if (err == -ENETUNREACH)
1059 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1060 	return err;
1061 }
1062 
1063 /**
1064  *	ip6_dst_lookup - perform route lookup on flow
1065  *	@sk: socket which provides route info
1066  *	@dst: pointer to dst_entry * for result
1067  *	@fl6: flow to lookup
1068  *
1069  *	This function performs a route lookup on the given flow.
1070  *
1071  *	It returns zero on success, or a standard errno code on error.
1072  */
1073 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1074 		   struct flowi6 *fl6)
1075 {
1076 	*dst = NULL;
1077 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1078 }
1079 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1080 
1081 /**
1082  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1083  *	@sk: socket which provides route info
1084  *	@fl6: flow to lookup
1085  *	@final_dst: final destination address for ipsec lookup
1086  *
1087  *	This function performs a route lookup on the given flow.
1088  *
1089  *	It returns a valid dst pointer on success, or a pointer encoded
1090  *	error code.
1091  */
1092 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1093 				      const struct in6_addr *final_dst)
1094 {
1095 	struct dst_entry *dst = NULL;
1096 	int err;
1097 
1098 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1099 	if (err)
1100 		return ERR_PTR(err);
1101 	if (final_dst)
1102 		fl6->daddr = *final_dst;
1103 
1104 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1105 }
1106 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
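/* Example (illustrative sketch, not part of this file): a typical
 * connect-time caller, assuming fl6 was zeroed and filled with
 * daddr/saddr/oif and final_p came from fl6_update_dst():
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto failure;
 *	}
 *
 * Note the pointer-encoded error: callers must test with IS_ERR() and
 * PTR_ERR(), never against NULL.
 */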
1107 
1108 /**
1109  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1110  *	@sk: socket which provides the dst cache and route info
1111  *	@fl6: flow to lookup
1112  *	@final_dst: final destination address for ipsec lookup
1113  *	@connected: whether @sk is connected or not
1114  *
1115  *	This function performs a route lookup on the given flow with the
1116  *	possibility of using the cached route in the socket if it is valid.
1117  *	It will take the socket dst lock when operating on the dst cache.
1118  *	As a result, this function can only be used in process context.
1119  *
1120  *	In addition, for a connected socket, cache the dst in the socket
1121  *	if the current cache is not valid.
1122  *
1123  *	It returns a valid dst pointer on success, or a pointer encoded
1124  *	error code.
1125  */
1126 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1127 					 const struct in6_addr *final_dst,
1128 					 bool connected)
1129 {
1130 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1131 
1132 	dst = ip6_sk_dst_check(sk, dst, fl6);
1133 	if (dst)
1134 		return dst;
1135 
1136 	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1137 	if (connected && !IS_ERR(dst))
1138 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1139 
1140 	return dst;
1141 }
1142 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
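/* Note: this is the datagram-side counterpart of ip6_dst_lookup_flow()
 * above; udpv6_sendmsg() is a typical caller.  @connected lets a
 * connected socket cache the freshly looked-up route, while
 * unconnected sends always pay for a full lookup.
 */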
1143 
1144 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1145 					       gfp_t gfp)
1146 {
1147 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1148 }
1149 
1150 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1151 						gfp_t gfp)
1152 {
1153 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1154 }
1155 
1156 static void ip6_append_data_mtu(unsigned int *mtu,
1157 				int *maxfraglen,
1158 				unsigned int fragheaderlen,
1159 				struct sk_buff *skb,
1160 				struct rt6_info *rt,
1161 				unsigned int orig_mtu)
1162 {
1163 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1164 		if (!skb) {
1165 			/* first fragment, reserve header_len */
1166 			*mtu = orig_mtu - rt->dst.header_len;
1167 
1168 		} else {
1169 			/*
1170 			 * this fragment is not the first; the header
1171 			 * space is regarded as data space.
1172 			 */
1173 			*mtu = orig_mtu;
1174 		}
1175 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1176 			      + fragheaderlen - sizeof(struct frag_hdr);
1177 	}
1178 }
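/* Note: outside of an XFRM tunnel dst, only the first fragment must
 * leave rt->dst.header_len of headroom for encapsulation headers;
 * follow-up fragments may use that space for payload, which is why mtu
 * snaps back to orig_mtu once a previous skb exists.
 */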
1179 
1180 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1181 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1182 			  struct rt6_info *rt, struct flowi6 *fl6)
1183 {
1184 	struct ipv6_pinfo *np = inet6_sk(sk);
1185 	unsigned int mtu;
1186 	struct ipv6_txoptions *opt = ipc6->opt;
1187 
1188 	/*
1189 	 * setup for corking
1190 	 */
1191 	if (opt) {
1192 		if (WARN_ON(v6_cork->opt))
1193 			return -EINVAL;
1194 
1195 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1196 		if (unlikely(!v6_cork->opt))
1197 			return -ENOBUFS;
1198 
1199 		v6_cork->opt->tot_len = sizeof(*opt);
1200 		v6_cork->opt->opt_flen = opt->opt_flen;
1201 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1202 
1203 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1204 						    sk->sk_allocation);
1205 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1206 			return -ENOBUFS;
1207 
1208 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1209 						    sk->sk_allocation);
1210 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1211 			return -ENOBUFS;
1212 
1213 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1214 						   sk->sk_allocation);
1215 		if (opt->hopopt && !v6_cork->opt->hopopt)
1216 			return -ENOBUFS;
1217 
1218 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1219 						    sk->sk_allocation);
1220 		if (opt->srcrt && !v6_cork->opt->srcrt)
1221 			return -ENOBUFS;
1222 
1223 		/* need source address above	--miyazawa */
1224 	}
1225 	dst_hold(&rt->dst);
1226 	cork->base.dst = &rt->dst;
1227 	cork->fl.u.ip6 = *fl6;
1228 	v6_cork->hop_limit = ipc6->hlimit;
1229 	v6_cork->tclass = ipc6->tclass;
1230 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1231 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1232 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1233 	else
1234 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1235 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1236 	if (np->frag_size < mtu) {
1237 		if (np->frag_size)
1238 			mtu = np->frag_size;
1239 	}
1240 	if (mtu < IPV6_MIN_MTU)
1241 		return -EINVAL;
1242 	cork->base.fragsize = mtu;
1243 	cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
1244 
1245 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1246 		cork->base.flags |= IPCORK_ALLFRAG;
1247 	cork->base.length = 0;
1248 
1249 	return 0;
1250 }
1251 
1252 static int __ip6_append_data(struct sock *sk,
1253 			     struct flowi6 *fl6,
1254 			     struct sk_buff_head *queue,
1255 			     struct inet_cork *cork,
1256 			     struct inet6_cork *v6_cork,
1257 			     struct page_frag *pfrag,
1258 			     int getfrag(void *from, char *to, int offset,
1259 					 int len, int odd, struct sk_buff *skb),
1260 			     void *from, int length, int transhdrlen,
1261 			     unsigned int flags, struct ipcm6_cookie *ipc6,
1262 			     const struct sockcm_cookie *sockc)
1263 {
1264 	struct sk_buff *skb, *skb_prev = NULL;
1265 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1266 	int exthdrlen = 0;
1267 	int dst_exthdrlen = 0;
1268 	int hh_len;
1269 	int copy;
1270 	int err;
1271 	int offset = 0;
1272 	__u8 tx_flags = 0;
1273 	u32 tskey = 0;
1274 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1275 	struct ipv6_txoptions *opt = v6_cork->opt;
1276 	int csummode = CHECKSUM_NONE;
1277 	unsigned int maxnonfragsize, headersize;
1278 	unsigned int wmem_alloc_delta = 0;
1279 	bool paged;
1280 
1281 	skb = skb_peek_tail(queue);
1282 	if (!skb) {
1283 		exthdrlen = opt ? opt->opt_flen : 0;
1284 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1285 	}
1286 
1287 	paged = !!cork->gso_size;
1288 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1289 	orig_mtu = mtu;
1290 
1291 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1292 
1293 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1294 			(opt ? opt->opt_nflen : 0);
1295 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1296 		     sizeof(struct frag_hdr);
1297 
1298 	headersize = sizeof(struct ipv6hdr) +
1299 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1300 		     (dst_allfrag(&rt->dst) ?
1301 		      sizeof(struct frag_hdr) : 0) +
1302 		     rt->rt6i_nfheader_len;
1303 
1304 	/* As per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1305 	 * within the first fragment
1306 	 */
1307 	if (headersize + transhdrlen > mtu)
1308 		goto emsgsize;
1309 
1310 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1311 	    (sk->sk_protocol == IPPROTO_UDP ||
1312 	     sk->sk_protocol == IPPROTO_RAW)) {
1313 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1314 				sizeof(struct ipv6hdr));
1315 		goto emsgsize;
1316 	}
1317 
1318 	if (ip6_sk_ignore_df(sk))
1319 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1320 	else
1321 		maxnonfragsize = mtu;
1322 
1323 	if (cork->length + length > maxnonfragsize - headersize) {
1324 emsgsize:
1325 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1326 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1327 		return -EMSGSIZE;
1328 	}
1329 
1330 	/* CHECKSUM_PARTIAL only with no extension headers and when
1331 	 * we are not going to fragment
1332 	 */
1333 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1334 	    headersize == sizeof(struct ipv6hdr) &&
1335 	    length <= mtu - headersize &&
1336 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1337 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1338 		csummode = CHECKSUM_PARTIAL;
1339 
1340 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1341 		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1342 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1343 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1344 			tskey = sk->sk_tskey++;
1345 	}
1346 
1347 	/*
1348 	 * Let's try using as much space as possible.
1349 	 * Use MTU if total length of the message fits into the MTU.
1350 	 * Otherwise, we need to reserve fragment header and
1351 	 * fragment alignment (= 8-15 octets, in total).
1352 	 *
1353 	 * Note that we may need to "move" the data from the tail
1354 	 * of the buffer to the new fragment when we split
1355 	 * the message.
1356 	 *
1357 	 * FIXME: It may be fragmented into multiple chunks
1358 	 *        at once if non-fragmentable extension headers
1359 	 *        are too large.
1360 	 * --yoshfuji
1361 	 */
1362 
1363 	cork->length += length;
1364 	if (!skb)
1365 		goto alloc_new_skb;
1366 
1367 	while (length > 0) {
1368 		/* Check if the remaining data fits into current packet. */
1369 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1370 		if (copy < length)
1371 			copy = maxfraglen - skb->len;
1372 
1373 		if (copy <= 0) {
1374 			char *data;
1375 			unsigned int datalen;
1376 			unsigned int fraglen;
1377 			unsigned int fraggap;
1378 			unsigned int alloclen;
1379 			unsigned int pagedlen = 0;
1380 alloc_new_skb:
1381 			/* There's no room in the current skb */
1382 			if (skb)
1383 				fraggap = skb->len - maxfraglen;
1384 			else
1385 				fraggap = 0;
1386 			/* update mtu and maxfraglen if necessary */
1387 			if (!skb || !skb_prev)
1388 				ip6_append_data_mtu(&mtu, &maxfraglen,
1389 						    fragheaderlen, skb, rt,
1390 						    orig_mtu);
1391 
1392 			skb_prev = skb;
1393 
1394 			/*
1395 			 * If remaining data exceeds the mtu,
1396 			 * we know we need more fragment(s).
1397 			 */
1398 			datalen = length + fraggap;
1399 
1400 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1401 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1402 			fraglen = datalen + fragheaderlen;
1403 
1404 			if ((flags & MSG_MORE) &&
1405 			    !(rt->dst.dev->features&NETIF_F_SG))
1406 				alloclen = mtu;
1407 			else if (!paged)
1408 				alloclen = fraglen;
1409 			else {
1410 				alloclen = min_t(int, fraglen, MAX_HEADER);
1411 				pagedlen = fraglen - alloclen;
1412 			}
1413 
1414 			alloclen += dst_exthdrlen;
1415 
1416 			if (datalen != length + fraggap) {
1417 				/*
1418 				 * this is not the last fragment; the trailer
1419 				 * space is regarded as data space.
1420 				 */
1421 				datalen += rt->dst.trailer_len;
1422 			}
1423 
1424 			alloclen += rt->dst.trailer_len;
1425 			fraglen = datalen + fragheaderlen;
1426 
1427 			/*
1428 			 * We just reserve space for fragment header.
1429 			 * Note: this may be overallocation if the message
1430 			 * (without MSG_MORE) fits into the MTU.
1431 			 */
1432 			alloclen += sizeof(struct frag_hdr);
1433 
1434 			copy = datalen - transhdrlen - fraggap - pagedlen;
1435 			if (copy < 0) {
1436 				err = -EINVAL;
1437 				goto error;
1438 			}
1439 			if (transhdrlen) {
1440 				skb = sock_alloc_send_skb(sk,
1441 						alloclen + hh_len,
1442 						(flags & MSG_DONTWAIT), &err);
1443 			} else {
1444 				skb = NULL;
1445 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1446 				    2 * sk->sk_sndbuf)
1447 					skb = alloc_skb(alloclen + hh_len,
1448 							sk->sk_allocation);
1449 				if (unlikely(!skb))
1450 					err = -ENOBUFS;
1451 			}
1452 			if (!skb)
1453 				goto error;
1454 			/*
1455 			 *	Fill in the control structures
1456 			 */
1457 			skb->protocol = htons(ETH_P_IPV6);
1458 			skb->ip_summed = csummode;
1459 			skb->csum = 0;
1460 			/* reserve for fragmentation and ipsec header */
1461 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1462 				    dst_exthdrlen);
1463 
1464 			/* Only the initial fragment is time stamped */
1465 			skb_shinfo(skb)->tx_flags = tx_flags;
1466 			tx_flags = 0;
1467 			skb_shinfo(skb)->tskey = tskey;
1468 			tskey = 0;
1469 
1470 			/*
1471 			 *	Find where to start putting bytes
1472 			 */
1473 			data = skb_put(skb, fraglen - pagedlen);
1474 			skb_set_network_header(skb, exthdrlen);
1475 			data += fragheaderlen;
1476 			skb->transport_header = (skb->network_header +
1477 						 fragheaderlen);
1478 			if (fraggap) {
1479 				skb->csum = skb_copy_and_csum_bits(
1480 					skb_prev, maxfraglen,
1481 					data + transhdrlen, fraggap, 0);
1482 				skb_prev->csum = csum_sub(skb_prev->csum,
1483 							  skb->csum);
1484 				data += fraggap;
1485 				pskb_trim_unique(skb_prev, maxfraglen);
1486 			}
1487 			if (copy > 0 &&
1488 			    getfrag(from, data + transhdrlen, offset,
1489 				    copy, fraggap, skb) < 0) {
1490 				err = -EFAULT;
1491 				kfree_skb(skb);
1492 				goto error;
1493 			}
1494 
1495 			offset += copy;
1496 			length -= copy + transhdrlen;
1497 			transhdrlen = 0;
1498 			exthdrlen = 0;
1499 			dst_exthdrlen = 0;
1500 
1501 			if ((flags & MSG_CONFIRM) && !skb_prev)
1502 				skb_set_dst_pending_confirm(skb, 1);
1503 
1504 			/*
1505 			 * Put the packet on the pending queue
1506 			 */
1507 			if (!skb->destructor) {
1508 				skb->destructor = sock_wfree;
1509 				skb->sk = sk;
1510 				wmem_alloc_delta += skb->truesize;
1511 			}
1512 			__skb_queue_tail(queue, skb);
1513 			continue;
1514 		}
1515 
1516 		if (copy > length)
1517 			copy = length;
1518 
1519 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1520 			unsigned int off;
1521 
1522 			off = skb->len;
1523 			if (getfrag(from, skb_put(skb, copy),
1524 						offset, copy, off, skb) < 0) {
1525 				__skb_trim(skb, off);
1526 				err = -EFAULT;
1527 				goto error;
1528 			}
1529 		} else {
1530 			int i = skb_shinfo(skb)->nr_frags;
1531 
1532 			err = -ENOMEM;
1533 			if (!sk_page_frag_refill(sk, pfrag))
1534 				goto error;
1535 
1536 			if (!skb_can_coalesce(skb, i, pfrag->page,
1537 					      pfrag->offset)) {
1538 				err = -EMSGSIZE;
1539 				if (i == MAX_SKB_FRAGS)
1540 					goto error;
1541 
1542 				__skb_fill_page_desc(skb, i, pfrag->page,
1543 						     pfrag->offset, 0);
1544 				skb_shinfo(skb)->nr_frags = ++i;
1545 				get_page(pfrag->page);
1546 			}
1547 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1548 			if (getfrag(from,
1549 				    page_address(pfrag->page) + pfrag->offset,
1550 				    offset, copy, skb->len, skb) < 0)
1551 				goto error_efault;
1552 
1553 			pfrag->offset += copy;
1554 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1555 			skb->len += copy;
1556 			skb->data_len += copy;
1557 			skb->truesize += copy;
1558 			wmem_alloc_delta += copy;
1559 		}
1560 		offset += copy;
1561 		length -= copy;
1562 	}
1563 
1564 	if (wmem_alloc_delta)
1565 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1566 	return 0;
1567 
1568 error_efault:
1569 	err = -EFAULT;
1570 error:
1571 	cork->length -= length;
1572 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1573 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1574 	return err;
1575 }
1576 
1577 int ip6_append_data(struct sock *sk,
1578 		    int getfrag(void *from, char *to, int offset, int len,
1579 				int odd, struct sk_buff *skb),
1580 		    void *from, int length, int transhdrlen,
1581 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1582 		    struct rt6_info *rt, unsigned int flags,
1583 		    const struct sockcm_cookie *sockc)
1584 {
1585 	struct inet_sock *inet = inet_sk(sk);
1586 	struct ipv6_pinfo *np = inet6_sk(sk);
1587 	int exthdrlen;
1588 	int err;
1589 
1590 	if (flags&MSG_PROBE)
1591 		return 0;
1592 	if (skb_queue_empty(&sk->sk_write_queue)) {
1593 		/*
1594 		 * setup for corking
1595 		 */
1596 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1597 				     ipc6, rt, fl6);
1598 		if (err)
1599 			return err;
1600 
1601 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1602 		length += exthdrlen;
1603 		transhdrlen += exthdrlen;
1604 	} else {
1605 		fl6 = &inet->cork.fl.u.ip6;
1606 		transhdrlen = 0;
1607 	}
1608 
1609 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1610 				 &np->cork, sk_page_frag(sk), getfrag,
1611 				 from, length, transhdrlen, flags, ipc6, sockc);
1612 }
1613 EXPORT_SYMBOL_GPL(ip6_append_data);
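/* Example (illustrative sketch, not part of this file): the corked
 * datagram pattern built on ip6_append_data(), in the style of the raw
 * socket sender.  Here getfrag is ip_generic_getfrag(), rt comes from
 * the route lookup, and the surrounding names are placeholders:
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
 *			      transhdrlen, &ipc6, &fl6, rt,
 *			      msg->msg_flags, &sockc);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 */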
1614 
1615 static void ip6_cork_release(struct inet_cork_full *cork,
1616 			     struct inet6_cork *v6_cork)
1617 {
1618 	if (v6_cork->opt) {
1619 		kfree(v6_cork->opt->dst0opt);
1620 		kfree(v6_cork->opt->dst1opt);
1621 		kfree(v6_cork->opt->hopopt);
1622 		kfree(v6_cork->opt->srcrt);
1623 		kfree(v6_cork->opt);
1624 		v6_cork->opt = NULL;
1625 	}
1626 
1627 	if (cork->base.dst) {
1628 		dst_release(cork->base.dst);
1629 		cork->base.dst = NULL;
1630 		cork->base.flags &= ~IPCORK_ALLFRAG;
1631 	}
1632 	memset(&cork->fl, 0, sizeof(cork->fl));
1633 }
1634 
1635 struct sk_buff *__ip6_make_skb(struct sock *sk,
1636 			       struct sk_buff_head *queue,
1637 			       struct inet_cork_full *cork,
1638 			       struct inet6_cork *v6_cork)
1639 {
1640 	struct sk_buff *skb, *tmp_skb;
1641 	struct sk_buff **tail_skb;
1642 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1643 	struct ipv6_pinfo *np = inet6_sk(sk);
1644 	struct net *net = sock_net(sk);
1645 	struct ipv6hdr *hdr;
1646 	struct ipv6_txoptions *opt = v6_cork->opt;
1647 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1648 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1649 	unsigned char proto = fl6->flowi6_proto;
1650 
1651 	skb = __skb_dequeue(queue);
1652 	if (!skb)
1653 		goto out;
1654 	tail_skb = &(skb_shinfo(skb)->frag_list);
1655 
1656 	/* move skb->data to ip header from ext header */
1657 	if (skb->data < skb_network_header(skb))
1658 		__skb_pull(skb, skb_network_offset(skb));
1659 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1660 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1661 		*tail_skb = tmp_skb;
1662 		tail_skb = &(tmp_skb->next);
1663 		skb->len += tmp_skb->len;
1664 		skb->data_len += tmp_skb->len;
1665 		skb->truesize += tmp_skb->truesize;
1666 		tmp_skb->destructor = NULL;
1667 		tmp_skb->sk = NULL;
1668 	}
1669 
1670 	/* Allow local fragmentation. */
1671 	skb->ignore_df = ip6_sk_ignore_df(sk);
1672 
1673 	*final_dst = fl6->daddr;
1674 	__skb_pull(skb, skb_network_header_len(skb));
1675 	if (opt && opt->opt_flen)
1676 		ipv6_push_frag_opts(skb, opt, &proto);
1677 	if (opt && opt->opt_nflen)
1678 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1679 
1680 	skb_push(skb, sizeof(struct ipv6hdr));
1681 	skb_reset_network_header(skb);
1682 	hdr = ipv6_hdr(skb);
1683 
1684 	ip6_flow_hdr(hdr, v6_cork->tclass,
1685 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1686 					ip6_autoflowlabel(net, np), fl6));
1687 	hdr->hop_limit = v6_cork->hop_limit;
1688 	hdr->nexthdr = proto;
1689 	hdr->saddr = fl6->saddr;
1690 	hdr->daddr = *final_dst;
1691 
1692 	skb->priority = sk->sk_priority;
1693 	skb->mark = sk->sk_mark;
1694 
1695 	skb_dst_set(skb, dst_clone(&rt->dst));
1696 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1697 	if (proto == IPPROTO_ICMPV6) {
1698 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1699 
1700 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1701 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1702 	}
1703 
1704 	ip6_cork_release(cork, v6_cork);
1705 out:
1706 	return skb;
1707 }
1708 
1709 int ip6_send_skb(struct sk_buff *skb)
1710 {
1711 	struct net *net = sock_net(skb->sk);
1712 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1713 	int err;
1714 
1715 	err = ip6_local_out(net, skb->sk, skb);
1716 	if (err) {
1717 		if (err > 0)
1718 			err = net_xmit_errno(err);
1719 		if (err)
1720 			IP6_INC_STATS(net, rt->rt6i_idev,
1721 				      IPSTATS_MIB_OUTDISCARDS);
1722 	}
1723 
1724 	return err;
1725 }
1726 
1727 int ip6_push_pending_frames(struct sock *sk)
1728 {
1729 	struct sk_buff *skb;
1730 
1731 	skb = ip6_finish_skb(sk);
1732 	if (!skb)
1733 		return 0;
1734 
1735 	return ip6_send_skb(skb);
1736 }
1737 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1738 
1739 static void __ip6_flush_pending_frames(struct sock *sk,
1740 				       struct sk_buff_head *queue,
1741 				       struct inet_cork_full *cork,
1742 				       struct inet6_cork *v6_cork)
1743 {
1744 	struct sk_buff *skb;
1745 
1746 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1747 		if (skb_dst(skb))
1748 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1749 				      IPSTATS_MIB_OUTDISCARDS);
1750 		kfree_skb(skb);
1751 	}
1752 
1753 	ip6_cork_release(cork, v6_cork);
1754 }
1755 
1756 void ip6_flush_pending_frames(struct sock *sk)
1757 {
1758 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1759 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1760 }
1761 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1762 
1763 struct sk_buff *ip6_make_skb(struct sock *sk,
1764 			     int getfrag(void *from, char *to, int offset,
1765 					 int len, int odd, struct sk_buff *skb),
1766 			     void *from, int length, int transhdrlen,
1767 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1768 			     struct rt6_info *rt, unsigned int flags,
1769 			     struct inet_cork_full *cork,
1770 			     const struct sockcm_cookie *sockc)
1771 {
1772 	struct inet6_cork v6_cork;
1773 	struct sk_buff_head queue;
1774 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1775 	int err;
1776 
1777 	if (flags & MSG_PROBE)
1778 		return NULL;
1779 
1780 	__skb_queue_head_init(&queue);
1781 
1782 	cork->base.flags = 0;
1783 	cork->base.addr = 0;
1784 	cork->base.opt = NULL;
1785 	cork->base.dst = NULL;
1786 	v6_cork.opt = NULL;
1787 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1788 	if (err) {
1789 		ip6_cork_release(cork, &v6_cork);
1790 		return ERR_PTR(err);
1791 	}
1792 	if (ipc6->dontfrag < 0)
1793 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1794 
1795 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1796 				&current->task_frag, getfrag, from,
1797 				length + exthdrlen, transhdrlen + exthdrlen,
1798 				flags, ipc6, sockc);
1799 	if (err) {
1800 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1801 		return ERR_PTR(err);
1802 	}
1803 
1804 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1805 }
1806