xref: /linux/net/ipv6/ip6_output.c (revision 2aceb896ee18ae35b21b14c978d8c2ef8c7b439d)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		skb = skb_expand_head(skb, hh_len);
74 		if (!skb) {
75 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76 			return -ENOMEM;
77 		}
78 	}
79 
80 	hdr = ipv6_hdr(skb);
81 	daddr = &hdr->daddr;
82 	if (ipv6_addr_is_multicast(daddr)) {
83 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84 		    ((mroute6_is_socket(net, skb) &&
85 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 
89 			/* Do not check for IFF_ALLMULTI; multicast routing
90 			 * is not supported in any case.
91 			 */
92 			if (newskb)
93 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94 					net, sk, newskb, NULL, newskb->dev,
95 					dev_loopback_xmit);
96 
97 			if (hdr->hop_limit == 0) {
98 				IP6_INC_STATS(net, idev,
99 					      IPSTATS_MIB_OUTDISCARDS);
100 				kfree_skb(skb);
101 				return 0;
102 			}
103 		}
104 
105 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107 		    !(dev->flags & IFF_LOOPBACK)) {
108 			kfree_skb(skb);
109 			return 0;
110 		}
111 	}
112 
113 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114 		int res = lwtunnel_xmit(skb);
115 
116 		if (res != LWTUNNEL_XMIT_CONTINUE)
117 			return res;
118 	}
119 
120 	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
121 
122 	rcu_read_lock();
123 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
124 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
125 
126 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
127 		if (unlikely(!neigh))
128 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
129 		if (IS_ERR(neigh)) {
130 			rcu_read_unlock();
131 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
132 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
133 			return -EINVAL;
134 		}
135 	}
136 	sock_confirm_neigh(skb, neigh);
137 	ret = neigh_output(neigh, skb, false);
138 	rcu_read_unlock();
139 	return ret;
140 }
141 
142 static int
143 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
144 				    struct sk_buff *skb, unsigned int mtu)
145 {
146 	struct sk_buff *segs, *nskb;
147 	netdev_features_t features;
148 	int ret = 0;
149 
150 	/* Please see corresponding comment in ip_finish_output_gso
151 	 * describing the cases where GSO segment length exceeds the
152 	 * egress MTU.
153 	 */
154 	features = netif_skb_features(skb);
155 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
156 	if (IS_ERR_OR_NULL(segs)) {
157 		kfree_skb(skb);
158 		return -ENOMEM;
159 	}
160 
161 	consume_skb(skb);
162 
163 	skb_list_walk_safe(segs, segs, nskb) {
164 		int err;
165 
166 		skb_mark_not_on_list(segs);
167 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
168 		if (err && ret == 0)
169 			ret = err;
170 	}
171 
172 	return ret;
173 }
174 
175 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
176 {
177 	unsigned int mtu;
178 
179 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
180 	/* Policy lookup after SNAT yielded a new policy */
181 	if (skb_dst(skb)->xfrm) {
182 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
183 		return dst_output(net, sk, skb);
184 	}
185 #endif
186 
187 	mtu = ip6_skb_dst_mtu(skb);
188 	if (skb_is_gso(skb) &&
189 	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
190 	    !skb_gso_validate_network_len(skb, mtu))
191 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
192 
193 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
194 	    dst_allfrag(skb_dst(skb)) ||
195 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
196 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
197 	else
198 		return ip6_finish_output2(net, sk, skb);
199 }
200 
201 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
202 {
203 	int ret;
204 
205 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
206 	switch (ret) {
207 	case NET_XMIT_SUCCESS:
208 	case NET_XMIT_CN:
209 		return __ip6_finish_output(net, sk, skb) ? : ret;
210 	default:
211 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
212 		return ret;
213 	}
214 }
215 
216 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
217 {
218 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
219 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
220 
221 	skb->protocol = htons(ETH_P_IPV6);
222 	skb->dev = dev;
223 
224 	if (unlikely(idev->cnf.disable_ipv6)) {
225 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
226 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
227 		return 0;
228 	}
229 
230 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
231 			    net, sk, skb, indev, dev,
232 			    ip6_finish_output,
233 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
234 }
235 EXPORT_SYMBOL(ip6_output);
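
/* A rough map of the local output path implemented above, with the
 * intermediate steps named as they appear in this file:
 *
 *	ip6_local_out()/dst_output()
 *	  -> ip6_output()               (this function)
 *	  -> NF_INET_POST_ROUTING hook  (skipped when IP6SKB_REROUTED is set)
 *	  -> ip6_finish_output()        (cgroup BPF egress check)
 *	  -> __ip6_finish_output()      (xfrm reroute, GSO and MTU checks)
 *	  -> ip6_fragment() and/or ip6_finish_output2()
 *	  -> neigh_output()             (neighbour resolution, queue to device)
 *
 * A minimal caller sketch, assuming the skb already has a routed dst set;
 * the surrounding function is hypothetical:
 *
 *	static int xmit_routed_skb(struct net *net, struct sock *sk,
 *				   struct sk_buff *skb)
 *	{
 *		return ip6_local_out(net, sk, skb);
 *	}
 */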
236 
237 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
238 {
239 	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
240 		return ip6_default_np_autolabel(net);
241 	return inet6_test_bit(AUTOFLOWLABEL, sk);
242 }
243 
244 /*
245  * xmit an sk_buff (used by TCP, SCTP and DCCP)
246  * Note: the socket lock is not held for SYNACK packets, but the skb might
247  * still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
248  * which use proper atomic operations or spinlocks.
249  */
250 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
251 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
252 {
253 	struct net *net = sock_net(sk);
254 	const struct ipv6_pinfo *np = inet6_sk(sk);
255 	struct in6_addr *first_hop = &fl6->daddr;
256 	struct dst_entry *dst = skb_dst(skb);
257 	struct net_device *dev = dst->dev;
258 	struct inet6_dev *idev = ip6_dst_idev(dst);
259 	struct hop_jumbo_hdr *hop_jumbo;
260 	int hoplen = sizeof(*hop_jumbo);
261 	unsigned int head_room;
262 	struct ipv6hdr *hdr;
263 	u8  proto = fl6->flowi6_proto;
264 	int seg_len = skb->len;
265 	int hlimit = -1;
266 	u32 mtu;
267 
268 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
269 	if (opt)
270 		head_room += opt->opt_nflen + opt->opt_flen;
271 
272 	if (unlikely(head_room > skb_headroom(skb))) {
273 		skb = skb_expand_head(skb, head_room);
274 		if (!skb) {
275 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
276 			return -ENOBUFS;
277 		}
278 	}
279 
280 	if (opt) {
281 		seg_len += opt->opt_nflen + opt->opt_flen;
282 
283 		if (opt->opt_flen)
284 			ipv6_push_frag_opts(skb, opt, &proto);
285 
286 		if (opt->opt_nflen)
287 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
288 					     &fl6->saddr);
289 	}
290 
291 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
292 		hop_jumbo = skb_push(skb, hoplen);
293 
294 		hop_jumbo->nexthdr = proto;
295 		hop_jumbo->hdrlen = 0;
296 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
297 		hop_jumbo->tlv_len = 4;
298 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
299 
300 		proto = IPPROTO_HOPOPTS;
301 		seg_len = 0;
302 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
303 	}
304 
305 	skb_push(skb, sizeof(struct ipv6hdr));
306 	skb_reset_network_header(skb);
307 	hdr = ipv6_hdr(skb);
308 
309 	/*
310 	 *	Fill in the IPv6 header
311 	 */
312 	if (np)
313 		hlimit = READ_ONCE(np->hop_limit);
314 	if (hlimit < 0)
315 		hlimit = ip6_dst_hoplimit(dst);
316 
317 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
318 				ip6_autoflowlabel(net, sk), fl6));
319 
320 	hdr->payload_len = htons(seg_len);
321 	hdr->nexthdr = proto;
322 	hdr->hop_limit = hlimit;
323 
324 	hdr->saddr = fl6->saddr;
325 	hdr->daddr = *first_hop;
326 
327 	skb->protocol = htons(ETH_P_IPV6);
328 	skb->priority = priority;
329 	skb->mark = mark;
330 
331 	mtu = dst_mtu(dst);
332 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
333 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
334 
335 		/* if egress device is enslaved to an L3 master device pass the
336 		 * skb to its handler for processing
337 		 */
338 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
339 		if (unlikely(!skb))
340 			return 0;
341 
342 		/* hooks should never assume socket lock is held.
343 		 * we promote our socket to non const
344 		 */
345 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
346 			       net, (struct sock *)sk, skb, NULL, dev,
347 			       dst_output);
348 	}
349 
350 	skb->dev = dev;
351 	/* ipv6_local_error() does not require socket lock,
352 	 * we promote our socket to non const
353 	 */
354 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
355 
356 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
357 	kfree_skb(skb);
358 	return -EMSGSIZE;
359 }
360 EXPORT_SYMBOL(ip6_xmit);
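
/* A minimal sketch of an ip6_xmit() caller, loosely modelled on a
 * connection-oriented protocol: the flow and dst are assumed to have been
 * set up already (e.g. via ip6_dst_lookup_flow()) and the skb to already
 * carry the transport header.  The exact mark/tclass/priority choices are
 * illustrative only:
 *
 *	const struct ipv6_pinfo *np = inet6_sk(sk);
 *
 *	skb_dst_set(skb, dst_clone(dst));
 *	rcu_read_lock();
 *	err = ip6_xmit(sk, skb, &fl6, READ_ONCE(sk->sk_mark),
 *		       rcu_dereference(np->opt), np->tclass,
 *		       READ_ONCE(sk->sk_priority));
 *	rcu_read_unlock();
 *
 * ip6_xmit() pushes any extension headers from @opt, inserts a fake
 * hop-by-hop jumbo option when the payload would exceed IPV6_MAXPLEN,
 * fills in the IPv6 header and hands the packet to the NF_INET_LOCAL_OUT
 * hook followed by dst_output().
 */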
361 
362 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
363 {
364 	struct ip6_ra_chain *ra;
365 	struct sock *last = NULL;
366 
367 	read_lock(&ip6_ra_lock);
368 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
369 		struct sock *sk = ra->sk;
370 		if (sk && ra->sel == sel &&
371 		    (!sk->sk_bound_dev_if ||
372 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
373 
374 			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
375 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
376 				continue;
377 			}
378 			if (last) {
379 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
380 				if (skb2)
381 					rawv6_rcv(last, skb2);
382 			}
383 			last = sk;
384 		}
385 	}
386 
387 	if (last) {
388 		rawv6_rcv(last, skb);
389 		read_unlock(&ip6_ra_lock);
390 		return 1;
391 	}
392 	read_unlock(&ip6_ra_lock);
393 	return 0;
394 }
395 
396 static int ip6_forward_proxy_check(struct sk_buff *skb)
397 {
398 	struct ipv6hdr *hdr = ipv6_hdr(skb);
399 	u8 nexthdr = hdr->nexthdr;
400 	__be16 frag_off;
401 	int offset;
402 
403 	if (ipv6_ext_hdr(nexthdr)) {
404 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
405 		if (offset < 0)
406 			return 0;
407 	} else
408 		offset = sizeof(struct ipv6hdr);
409 
410 	if (nexthdr == IPPROTO_ICMPV6) {
411 		struct icmp6hdr *icmp6;
412 
413 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
414 					 offset + 1 - skb->data)))
415 			return 0;
416 
417 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
418 
419 		switch (icmp6->icmp6_type) {
420 		case NDISC_ROUTER_SOLICITATION:
421 		case NDISC_ROUTER_ADVERTISEMENT:
422 		case NDISC_NEIGHBOUR_SOLICITATION:
423 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
424 		case NDISC_REDIRECT:
425 			/* A unicast neighbour discovery message destined to
426 			 * the proxied address is passed to the input
427 			 * function.
428 			 */
429 			return 1;
430 		default:
431 			break;
432 		}
433 	}
434 
435 	/*
436 	 * The proxying router can't forward traffic sent to a link-local
437 	 * address, so signal the sender and discard the packet. This
438 	 * behavior is clarified by the MIPv6 specification.
439 	 */
440 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
441 		dst_link_failure(skb);
442 		return -1;
443 	}
444 
445 	return 0;
446 }
447 
448 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
449 				     struct sk_buff *skb)
450 {
451 #ifdef CONFIG_NET_SWITCHDEV
452 	if (skb->offload_l3_fwd_mark) {
453 		consume_skb(skb);
454 		return 0;
455 	}
456 #endif
457 
458 	skb_clear_tstamp(skb);
459 	return dst_output(net, sk, skb);
460 }
461 
462 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
463 {
464 	if (skb->len <= mtu)
465 		return false;
466 
467 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
468 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
469 		return true;
470 
471 	if (skb->ignore_df)
472 		return false;
473 
474 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
475 		return false;
476 
477 	return true;
478 }
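
/* A few illustrative cases for ip6_pkt_too_big() on a 1500-byte egress mtu
 * (hypothetical numbers; all but the first assume skb->len > mtu):
 *
 *	- skb->len == 1400				-> false (it fits)
 *	- frag_max_size == 1600 (conntrack defrag)	-> true, even with
 *							   ignore_df set
 *	- ignore_df set, no frag_max_size		-> false
 *	- GSO skb whose segments each fit the mtu	-> false
 *	- plain 1600-byte skb				-> true (the caller then
 *							   sends ICMPV6_PKT_TOOBIG)
 */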
479 
480 int ip6_forward(struct sk_buff *skb)
481 {
482 	struct dst_entry *dst = skb_dst(skb);
483 	struct ipv6hdr *hdr = ipv6_hdr(skb);
484 	struct inet6_skb_parm *opt = IP6CB(skb);
485 	struct net *net = dev_net(dst->dev);
486 	struct inet6_dev *idev;
487 	SKB_DR(reason);
488 	u32 mtu;
489 
490 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
491 	if (net->ipv6.devconf_all->forwarding == 0)
492 		goto error;
493 
494 	if (skb->pkt_type != PACKET_HOST)
495 		goto drop;
496 
497 	if (unlikely(skb->sk))
498 		goto drop;
499 
500 	if (skb_warn_if_lro(skb))
501 		goto drop;
502 
503 	if (!net->ipv6.devconf_all->disable_policy &&
504 	    (!idev || !idev->cnf.disable_policy) &&
505 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
506 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
507 		goto drop;
508 	}
509 
510 	skb_forward_csum(skb);
511 
512 	/*
513 	 *	We DO NOT do any processing on
514 	 *	RA packets; we push them to user level AS IS
515 	 *	without any warranty that the application will be able
516 	 *	to interpret them. The reason is that we
517 	 *	cannot do anything clever here.
518 	 *
519 	 *	We are not an end node, so if the packet contains
520 	 *	AH/ESP we cannot do anything.
521 	 *	Defragmentation would also be a mistake; RA packets
522 	 *	cannot be fragmented, because there is no guarantee
523 	 *	that different fragments will travel along one path. --ANK
524 	 */
525 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
526 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
527 			return 0;
528 	}
529 
530 	/*
531 	 *	check and decrement ttl
532 	 */
533 	if (hdr->hop_limit <= 1) {
534 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
535 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
536 
537 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
538 		return -ETIMEDOUT;
539 	}
540 
541 	/* XXX: idev->cnf.proxy_ndp? */
542 	if (net->ipv6.devconf_all->proxy_ndp &&
543 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
544 		int proxied = ip6_forward_proxy_check(skb);
545 		if (proxied > 0) {
546 			/* It's tempting to decrease the hop limit
547 			 * here by 1, as we do at the end of the
548 			 * function too.
549 			 *
550 			 * But that would be incorrect, as proxying is
551 			 * not forwarding.  The ip6_input function
552 			 * will handle this packet locally, and it
553 			 * depends on the hop limit being unchanged.
554 			 *
555 			 * One example is the NDP hop limit, which
556 			 * always has to stay 255; another would be
557 			 * similar checks around RA packets, where the
558 			 * user can even change the desired limit.
559 			 */
560 			return ip6_input(skb);
561 		} else if (proxied < 0) {
562 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
563 			goto drop;
564 		}
565 	}
566 
567 	if (!xfrm6_route_forward(skb)) {
568 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
569 		SKB_DR_SET(reason, XFRM_POLICY);
570 		goto drop;
571 	}
572 	dst = skb_dst(skb);
573 
574 	/* IPv6 specs say nothing about it, but it is clear that we cannot
575 	 * send redirects to source routed frames.
576 	 * We don't send redirects to frames decapsulated from IPsec.
577 	 */
578 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
579 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
580 		struct in6_addr *target = NULL;
581 		struct inet_peer *peer;
582 		struct rt6_info *rt;
583 
584 		/*
585 		 *	incoming and outgoing devices are the same,
586 		 *	so send a redirect.
587 		 */
588 
589 		rt = (struct rt6_info *) dst;
590 		if (rt->rt6i_flags & RTF_GATEWAY)
591 			target = &rt->rt6i_gateway;
592 		else
593 			target = &hdr->daddr;
594 
595 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
596 
597 		/* Limit redirects both by destination (here)
598 		   and by source (inside ndisc_send_redirect)
599 		 */
600 		if (inet_peer_xrlim_allow(peer, 1*HZ))
601 			ndisc_send_redirect(skb, target);
602 		if (peer)
603 			inet_putpeer(peer);
604 	} else {
605 		int addrtype = ipv6_addr_type(&hdr->saddr);
606 
607 		/* This check is security critical. */
608 		if (addrtype == IPV6_ADDR_ANY ||
609 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
610 			goto error;
611 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
612 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
613 				    ICMPV6_NOT_NEIGHBOUR, 0);
614 			goto error;
615 		}
616 	}
617 
618 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
619 
620 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
621 	if (mtu < IPV6_MIN_MTU)
622 		mtu = IPV6_MIN_MTU;
623 
624 	if (ip6_pkt_too_big(skb, mtu)) {
625 		/* Again, force the OUTPUT device to be used for the source address */
626 		skb->dev = dst->dev;
627 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
628 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
629 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
630 				IPSTATS_MIB_FRAGFAILS);
631 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
632 		return -EMSGSIZE;
633 	}
634 
635 	if (skb_cow(skb, dst->dev->hard_header_len)) {
636 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
637 				IPSTATS_MIB_OUTDISCARDS);
638 		goto drop;
639 	}
640 
641 	hdr = ipv6_hdr(skb);
642 
643 	/* Decrementing the hop limit is delayed until after the skb COW */
644 
645 	hdr->hop_limit--;
646 
647 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
648 		       net, NULL, skb, skb->dev, dst->dev,
649 		       ip6_forward_finish);
650 
651 error:
652 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
653 	SKB_DR_SET(reason, IP_INADDRERRORS);
654 drop:
655 	kfree_skb_reason(skb, reason);
656 	return -EINVAL;
657 }
658 
659 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
660 {
661 	to->pkt_type = from->pkt_type;
662 	to->priority = from->priority;
663 	to->protocol = from->protocol;
664 	skb_dst_drop(to);
665 	skb_dst_set(to, dst_clone(skb_dst(from)));
666 	to->dev = from->dev;
667 	to->mark = from->mark;
668 
669 	skb_copy_hash(to, from);
670 
671 #ifdef CONFIG_NET_SCHED
672 	to->tc_index = from->tc_index;
673 #endif
674 	nf_copy(to, from);
675 	skb_ext_copy(to, from);
676 	skb_copy_secmark(to, from);
677 }
678 
679 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
680 		      u8 nexthdr, __be32 frag_id,
681 		      struct ip6_fraglist_iter *iter)
682 {
683 	unsigned int first_len;
684 	struct frag_hdr *fh;
685 
686 	/* BUILD HEADER */
687 	*prevhdr = NEXTHDR_FRAGMENT;
688 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
689 	if (!iter->tmp_hdr)
690 		return -ENOMEM;
691 
692 	iter->frag = skb_shinfo(skb)->frag_list;
693 	skb_frag_list_init(skb);
694 
695 	iter->offset = 0;
696 	iter->hlen = hlen;
697 	iter->frag_id = frag_id;
698 	iter->nexthdr = nexthdr;
699 
700 	__skb_pull(skb, hlen);
701 	fh = __skb_push(skb, sizeof(struct frag_hdr));
702 	__skb_push(skb, hlen);
703 	skb_reset_network_header(skb);
704 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
705 
706 	fh->nexthdr = nexthdr;
707 	fh->reserved = 0;
708 	fh->frag_off = htons(IP6_MF);
709 	fh->identification = frag_id;
710 
711 	first_len = skb_pagelen(skb);
712 	skb->data_len = first_len - skb_headlen(skb);
713 	skb->len = first_len;
714 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
715 
716 	return 0;
717 }
718 EXPORT_SYMBOL(ip6_fraglist_init);
719 
720 void ip6_fraglist_prepare(struct sk_buff *skb,
721 			  struct ip6_fraglist_iter *iter)
722 {
723 	struct sk_buff *frag = iter->frag;
724 	unsigned int hlen = iter->hlen;
725 	struct frag_hdr *fh;
726 
727 	frag->ip_summed = CHECKSUM_NONE;
728 	skb_reset_transport_header(frag);
729 	fh = __skb_push(frag, sizeof(struct frag_hdr));
730 	__skb_push(frag, hlen);
731 	skb_reset_network_header(frag);
732 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
733 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
734 	fh->nexthdr = iter->nexthdr;
735 	fh->reserved = 0;
736 	fh->frag_off = htons(iter->offset);
737 	if (frag->next)
738 		fh->frag_off |= htons(IP6_MF);
739 	fh->identification = iter->frag_id;
740 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
741 	ip6_copy_metadata(frag, skb);
742 }
743 EXPORT_SYMBOL(ip6_fraglist_prepare);
744 
745 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
746 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
747 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
748 {
749 	state->prevhdr = prevhdr;
750 	state->nexthdr = nexthdr;
751 	state->frag_id = frag_id;
752 
753 	state->hlen = hlen;
754 	state->mtu = mtu;
755 
756 	state->left = skb->len - hlen;	/* Space per frame */
757 	state->ptr = hlen;		/* Where to start from */
758 
759 	state->hroom = hdr_room;
760 	state->troom = needed_tailroom;
761 
762 	state->offset = 0;
763 }
764 EXPORT_SYMBOL(ip6_frag_init);
765 
766 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
767 {
768 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
769 	struct sk_buff *frag;
770 	struct frag_hdr *fh;
771 	unsigned int len;
772 
773 	len = state->left;
774 	/* IF: it doesn't fit, use 'mtu' - the data space left */
775 	if (len > state->mtu)
776 		len = state->mtu;
777 	/* IF: we are not sending up to and including the packet end,
778 	 * then align the next start on an eight-byte boundary */
779 	if (len < state->left)
780 		len &= ~7;
781 
782 	/* Allocate buffer */
783 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
784 			 state->hroom + state->troom, GFP_ATOMIC);
785 	if (!frag)
786 		return ERR_PTR(-ENOMEM);
787 
788 	/*
789 	 *	Set up data on packet
790 	 */
791 
792 	ip6_copy_metadata(frag, skb);
793 	skb_reserve(frag, state->hroom);
794 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
795 	skb_reset_network_header(frag);
796 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
797 	frag->transport_header = (frag->network_header + state->hlen +
798 				  sizeof(struct frag_hdr));
799 
800 	/*
801 	 *	Charge the memory for the fragment to any owner
802 	 *	it might possess
803 	 */
804 	if (skb->sk)
805 		skb_set_owner_w(frag, skb->sk);
806 
807 	/*
808 	 *	Copy the packet header into the new buffer.
809 	 */
810 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
811 
812 	fragnexthdr_offset = skb_network_header(frag);
813 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
814 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
815 
816 	/*
817 	 *	Build fragment header.
818 	 */
819 	fh->nexthdr = state->nexthdr;
820 	fh->reserved = 0;
821 	fh->identification = state->frag_id;
822 
823 	/*
824 	 *	Copy a block of the IP datagram.
825 	 */
826 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
827 			     len));
828 	state->left -= len;
829 
830 	fh->frag_off = htons(state->offset);
831 	if (state->left > 0)
832 		fh->frag_off |= htons(IP6_MF);
833 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
834 
835 	state->ptr += len;
836 	state->offset += len;
837 
838 	return frag;
839 }
840 EXPORT_SYMBOL(ip6_frag_next);
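
/* A worked example of the slow-path arithmetic (illustrative numbers):
 * forwarding a 4000-byte IPv6 packet (40-byte header, no extension headers,
 * hlen = 40) onto a link with an MTU of 1500.
 *
 * ip6_fragment() below reduces the usable mtu to
 *
 *	mtu = 1500 - hlen - sizeof(struct frag_hdr) = 1452
 *
 * and ip6_frag_next() rounds every non-final fragment down to a multiple of
 * eight, so the 3960 payload bytes are split as:
 *
 *	frag 1: offset    0, 1448 bytes, frag_off = htons(0)    | htons(IP6_MF)
 *	frag 2: offset 1448, 1448 bytes, frag_off = htons(1448) | htons(IP6_MF)
 *	frag 3: offset 2896, 1064 bytes, frag_off = htons(2896)
 *
 * Each fragment is 40 + 8 + payload bytes on the wire (at most 1496 here).
 * The byte offset can be stored directly in frag_off because it is always a
 * multiple of eight, leaving the low three bits free for the IP6_MF flag.
 */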
841 
842 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
843 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
844 {
845 	struct sk_buff *frag;
846 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
847 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
848 				inet6_sk(skb->sk) : NULL;
849 	bool mono_delivery_time = skb->mono_delivery_time;
850 	struct ip6_frag_state state;
851 	unsigned int mtu, hlen, nexthdr_offset;
852 	ktime_t tstamp = skb->tstamp;
853 	int hroom, err = 0;
854 	__be32 frag_id;
855 	u8 *prevhdr, nexthdr = 0;
856 
857 	err = ip6_find_1stfragopt(skb, &prevhdr);
858 	if (err < 0)
859 		goto fail;
860 	hlen = err;
861 	nexthdr = *prevhdr;
862 	nexthdr_offset = prevhdr - skb_network_header(skb);
863 
864 	mtu = ip6_skb_dst_mtu(skb);
865 
866 	/* We must not fragment if the socket is set to force MTU discovery
867 	 * or if the skb is not generated by a local socket.
868 	 */
869 	if (unlikely(!skb->ignore_df && skb->len > mtu))
870 		goto fail_toobig;
871 
872 	if (IP6CB(skb)->frag_max_size) {
873 		if (IP6CB(skb)->frag_max_size > mtu)
874 			goto fail_toobig;
875 
876 		/* don't send fragments larger than what we received */
877 		mtu = IP6CB(skb)->frag_max_size;
878 		if (mtu < IPV6_MIN_MTU)
879 			mtu = IPV6_MIN_MTU;
880 	}
881 
882 	if (np) {
883 		u32 frag_size = READ_ONCE(np->frag_size);
884 
885 		if (frag_size && frag_size < mtu)
886 			mtu = frag_size;
887 	}
888 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
889 		goto fail_toobig;
890 	mtu -= hlen + sizeof(struct frag_hdr);
891 
892 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
893 				    &ipv6_hdr(skb)->saddr);
894 
895 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
896 	    (err = skb_checksum_help(skb)))
897 		goto fail;
898 
899 	prevhdr = skb_network_header(skb) + nexthdr_offset;
900 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
901 	if (skb_has_frag_list(skb)) {
902 		unsigned int first_len = skb_pagelen(skb);
903 		struct ip6_fraglist_iter iter;
904 		struct sk_buff *frag2;
905 
906 		if (first_len - hlen > mtu ||
907 		    ((first_len - hlen) & 7) ||
908 		    skb_cloned(skb) ||
909 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
910 			goto slow_path;
911 
912 		skb_walk_frags(skb, frag) {
913 			/* Correct geometry. */
914 			if (frag->len > mtu ||
915 			    ((frag->len & 7) && frag->next) ||
916 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
917 				goto slow_path_clean;
918 
919 			/* Partially cloned skb? */
920 			if (skb_shared(frag))
921 				goto slow_path_clean;
922 
923 			BUG_ON(frag->sk);
924 			if (skb->sk) {
925 				frag->sk = skb->sk;
926 				frag->destructor = sock_wfree;
927 			}
928 			skb->truesize -= frag->truesize;
929 		}
930 
931 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
932 					&iter);
933 		if (err < 0)
934 			goto fail;
935 
936 		/* We prevent @rt from being freed. */
937 		rcu_read_lock();
938 
939 		for (;;) {
940 			/* Prepare the header of the next frame
941 			 * before the previous one goes down. */
942 			if (iter.frag)
943 				ip6_fraglist_prepare(skb, &iter);
944 
945 			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
946 			err = output(net, sk, skb);
947 			if (!err)
948 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
949 					      IPSTATS_MIB_FRAGCREATES);
950 
951 			if (err || !iter.frag)
952 				break;
953 
954 			skb = ip6_fraglist_next(&iter);
955 		}
956 
957 		kfree(iter.tmp_hdr);
958 
959 		if (err == 0) {
960 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
961 				      IPSTATS_MIB_FRAGOKS);
962 			rcu_read_unlock();
963 			return 0;
964 		}
965 
966 		kfree_skb_list(iter.frag);
967 
968 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
969 			      IPSTATS_MIB_FRAGFAILS);
970 		rcu_read_unlock();
971 		return err;
972 
973 slow_path_clean:
974 		skb_walk_frags(skb, frag2) {
975 			if (frag2 == frag)
976 				break;
977 			frag2->sk = NULL;
978 			frag2->destructor = NULL;
979 			skb->truesize += frag2->truesize;
980 		}
981 	}
982 
983 slow_path:
984 	/*
985 	 *	Fragment the datagram.
986 	 */
987 
988 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
989 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
990 		      &state);
991 
992 	/*
993 	 *	Keep copying data until we run out.
994 	 */
995 
996 	while (state.left > 0) {
997 		frag = ip6_frag_next(skb, &state);
998 		if (IS_ERR(frag)) {
999 			err = PTR_ERR(frag);
1000 			goto fail;
1001 		}
1002 
1003 		/*
1004 		 *	Put this fragment into the sending queue.
1005 		 */
1006 		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1007 		err = output(net, sk, frag);
1008 		if (err)
1009 			goto fail;
1010 
1011 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1012 			      IPSTATS_MIB_FRAGCREATES);
1013 	}
1014 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1015 		      IPSTATS_MIB_FRAGOKS);
1016 	consume_skb(skb);
1017 	return err;
1018 
1019 fail_toobig:
1020 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1021 		sk_gso_disable(skb->sk);
1022 
1023 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1024 	err = -EMSGSIZE;
1025 
1026 fail:
1027 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1028 		      IPSTATS_MIB_FRAGFAILS);
1029 	kfree_skb(skb);
1030 	return err;
1031 }
1032 
1033 static inline int ip6_rt_check(const struct rt6key *rt_key,
1034 			       const struct in6_addr *fl_addr,
1035 			       const struct in6_addr *addr_cache)
1036 {
1037 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1038 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1039 }
1040 
1041 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1042 					  struct dst_entry *dst,
1043 					  const struct flowi6 *fl6)
1044 {
1045 	struct ipv6_pinfo *np = inet6_sk(sk);
1046 	struct rt6_info *rt;
1047 
1048 	if (!dst)
1049 		goto out;
1050 
1051 	if (dst->ops->family != AF_INET6) {
1052 		dst_release(dst);
1053 		return NULL;
1054 	}
1055 
1056 	rt = (struct rt6_info *)dst;
1057 	/* Yes, checking route validity in the not-connected
1058 	 * case is not very simple. Take into account
1059 	 * that we do not support routing by source, TOS,
1060 	 * or MSG_DONTROUTE		--ANK (980726)
1061 	 *
1062 	 * 1. ip6_rt_check(): If route was host route,
1063 	 *    check that cached destination is current.
1064 	 *    If it is network route, we still may
1065 	 *    check its validity using saved pointer
1066 	 *    to the last used address: daddr_cache.
1067 	 *    We do not want to save the whole address now
1068 	 *    (because the main consumer of this service
1069 	 *    is tcp, which does not have this problem),
1070 	 *    so that the last trick works only on connected
1071 	 *    sockets.
1072 	 * 2. oif also should be the same.
1073 	 */
1074 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1075 #ifdef CONFIG_IPV6_SUBTREES
1076 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1077 #endif
1078 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1079 		dst_release(dst);
1080 		dst = NULL;
1081 	}
1082 
1083 out:
1084 	return dst;
1085 }
1086 
1087 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1088 			       struct dst_entry **dst, struct flowi6 *fl6)
1089 {
1090 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1091 	struct neighbour *n;
1092 	struct rt6_info *rt;
1093 #endif
1094 	int err;
1095 	int flags = 0;
1096 
1097 	/* The correct way to handle this would be to do
1098 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1099 	 * the route-specific preferred source forces the
1100 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1101 	 *
1102 	 * In source specific routing (no src=any default route),
1103 	 * ip6_route_output will fail given src=any saddr, though, so
1104 	 * that's why we try it again later.
1105 	 */
1106 	if (ipv6_addr_any(&fl6->saddr)) {
1107 		struct fib6_info *from;
1108 		struct rt6_info *rt;
1109 
1110 		*dst = ip6_route_output(net, sk, fl6);
1111 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1112 
1113 		rcu_read_lock();
1114 		from = rt ? rcu_dereference(rt->from) : NULL;
1115 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1116 					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
1117 					  &fl6->saddr);
1118 		rcu_read_unlock();
1119 
1120 		if (err)
1121 			goto out_err_release;
1122 
1123 		/* If we had an erroneous initial result, pretend it
1124 		 * never existed and let the SA-enabled version take
1125 		 * over.
1126 		 */
1127 		if ((*dst)->error) {
1128 			dst_release(*dst);
1129 			*dst = NULL;
1130 		}
1131 
1132 		if (fl6->flowi6_oif)
1133 			flags |= RT6_LOOKUP_F_IFACE;
1134 	}
1135 
1136 	if (!*dst)
1137 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1138 
1139 	err = (*dst)->error;
1140 	if (err)
1141 		goto out_err_release;
1142 
1143 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1144 	/*
1145 	 * Here, if the dst entry we've looked up
1146 	 * has a neighbour entry that is in the INCOMPLETE
1147 	 * state and the src address from the flow is
1148 	 * marked as OPTIMISTIC, we release the found
1149 	 * dst entry and instead replace it with the
1150 	 * dst entry of the nexthop router.
1151 	 */
1152 	rt = (struct rt6_info *) *dst;
1153 	rcu_read_lock();
1154 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1155 				      rt6_nexthop(rt, &fl6->daddr));
1156 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1157 	rcu_read_unlock();
1158 
1159 	if (err) {
1160 		struct inet6_ifaddr *ifp;
1161 		struct flowi6 fl_gw6;
1162 		int redirect;
1163 
1164 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1165 				      (*dst)->dev, 1);
1166 
1167 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1168 		if (ifp)
1169 			in6_ifa_put(ifp);
1170 
1171 		if (redirect) {
1172 			/*
1173 			 * We need to get the dst entry for the
1174 			 * default router instead
1175 			 */
1176 			dst_release(*dst);
1177 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1178 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1179 			*dst = ip6_route_output(net, sk, &fl_gw6);
1180 			err = (*dst)->error;
1181 			if (err)
1182 				goto out_err_release;
1183 		}
1184 	}
1185 #endif
1186 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1187 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1188 		err = -EAFNOSUPPORT;
1189 		goto out_err_release;
1190 	}
1191 
1192 	return 0;
1193 
1194 out_err_release:
1195 	dst_release(*dst);
1196 	*dst = NULL;
1197 
1198 	if (err == -ENETUNREACH)
1199 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1200 	return err;
1201 }
1202 
1203 /**
1204  *	ip6_dst_lookup - perform route lookup on flow
1205  *	@net: Network namespace to perform lookup in
1206  *	@sk: socket which provides route info
1207  *	@dst: pointer to dst_entry * for result
1208  *	@fl6: flow to lookup
1209  *
1210  *	This function performs a route lookup on the given flow.
1211  *
1212  *	It returns zero on success, or a standard errno code on error.
1213  */
1214 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1215 		   struct flowi6 *fl6)
1216 {
1217 	*dst = NULL;
1218 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1219 }
1220 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1221 
1222 /**
1223  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1224  *	@net: Network namespace to perform lookup in
1225  *	@sk: socket which provides route info
1226  *	@fl6: flow to lookup
1227  *	@final_dst: final destination address for ipsec lookup
1228  *
1229  *	This function performs a route lookup on the given flow.
1230  *
1231  *	It returns a valid dst pointer on success, or a pointer encoded
1232  *	error code.
1233  */
1234 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1235 				      const struct in6_addr *final_dst)
1236 {
1237 	struct dst_entry *dst = NULL;
1238 	int err;
1239 
1240 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1241 	if (err)
1242 		return ERR_PTR(err);
1243 	if (final_dst)
1244 		fl6->daddr = *final_dst;
1245 
1246 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1247 }
1248 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1249 
1250 /**
1251  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1252  *	@sk: socket which provides the dst cache and route info
1253  *	@fl6: flow to lookup
1254  *	@final_dst: final destination address for ipsec lookup
1255  *	@connected: whether @sk is connected or not
1256  *
1257  *	This function performs a route lookup on the given flow with the
1258  *	possibility of using the cached route in the socket if it is valid.
1259  *	It will take the socket dst lock when operating on the dst cache.
1260  *	As a result, this function can only be used in process context.
1261  *
1262  *	In addition, for a connected socket, cache the dst in the socket
1263  *	if the current cache is not valid.
1264  *
1265  *	It returns a valid dst pointer on success, or a pointer encoded
1266  *	error code.
1267  */
1268 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1269 					 const struct in6_addr *final_dst,
1270 					 bool connected)
1271 {
1272 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1273 
1274 	dst = ip6_sk_dst_check(sk, dst, fl6);
1275 	if (dst)
1276 		return dst;
1277 
1278 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1279 	if (connected && !IS_ERR(dst))
1280 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1281 
1282 	return dst;
1283 }
1284 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
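
/* A minimal sketch of a datagram-style caller of the two lookup helpers
 * above; error handling and most of the flow setup are elided and the
 * variable names are illustrative:
 *
 *	fl6->flowi6_proto = sk->sk_protocol;
 *	fl6->flowi6_oif = READ_ONCE(sk->sk_bound_dev_if);
 *	fl6->daddr = *daddr;
 *
 *	dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 *
 * For a connected socket the freshly routed dst is also stored back into the
 * socket (ip6_sk_dst_store_flow()), so the next transmit can reuse it via
 * sk_dst_check()/ip6_sk_dst_check() instead of repeating the full lookup and
 * xfrm_lookup_route() work.
 */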
1285 
1286 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1287 					       gfp_t gfp)
1288 {
1289 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1290 }
1291 
1292 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1293 						gfp_t gfp)
1294 {
1295 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1296 }
1297 
1298 static void ip6_append_data_mtu(unsigned int *mtu,
1299 				int *maxfraglen,
1300 				unsigned int fragheaderlen,
1301 				struct sk_buff *skb,
1302 				struct rt6_info *rt,
1303 				unsigned int orig_mtu)
1304 {
1305 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1306 		if (!skb) {
1307 			/* first fragment, reserve header_len */
1308 			*mtu = orig_mtu - rt->dst.header_len;
1309 
1310 		} else {
1311 			/*
1312 			 * this fragment is not the first, so the header
1313 			 * space is regarded as data space.
1314 			 */
1315 			*mtu = orig_mtu;
1316 		}
1317 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1318 			      + fragheaderlen - sizeof(struct frag_hdr);
1319 	}
1320 }
1321 
1322 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1323 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1324 			  struct rt6_info *rt)
1325 {
1326 	struct ipv6_pinfo *np = inet6_sk(sk);
1327 	unsigned int mtu, frag_size;
1328 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1329 
1330 	/* callers pass dst together with a reference, set it first so
1331 	 * ip6_cork_release() can put it down even in case of an error.
1332 	 */
1333 	cork->base.dst = &rt->dst;
1334 
1335 	/*
1336 	 * setup for corking
1337 	 */
1338 	if (opt) {
1339 		if (WARN_ON(v6_cork->opt))
1340 			return -EINVAL;
1341 
1342 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1343 		if (unlikely(!nopt))
1344 			return -ENOBUFS;
1345 
1346 		nopt->tot_len = sizeof(*opt);
1347 		nopt->opt_flen = opt->opt_flen;
1348 		nopt->opt_nflen = opt->opt_nflen;
1349 
1350 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1351 		if (opt->dst0opt && !nopt->dst0opt)
1352 			return -ENOBUFS;
1353 
1354 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1355 		if (opt->dst1opt && !nopt->dst1opt)
1356 			return -ENOBUFS;
1357 
1358 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1359 		if (opt->hopopt && !nopt->hopopt)
1360 			return -ENOBUFS;
1361 
1362 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1363 		if (opt->srcrt && !nopt->srcrt)
1364 			return -ENOBUFS;
1365 
1366 		/* need source address above	--miyazawa */
1367 	}
1368 	v6_cork->hop_limit = ipc6->hlimit;
1369 	v6_cork->tclass = ipc6->tclass;
1370 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1371 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1372 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1373 	else
1374 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1375 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1376 
1377 	frag_size = READ_ONCE(np->frag_size);
1378 	if (frag_size && frag_size < mtu)
1379 		mtu = frag_size;
1380 
1381 	cork->base.fragsize = mtu;
1382 	cork->base.gso_size = ipc6->gso_size;
1383 	cork->base.tx_flags = 0;
1384 	cork->base.mark = ipc6->sockc.mark;
1385 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1386 
1387 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1388 		cork->base.flags |= IPCORK_ALLFRAG;
1389 	cork->base.length = 0;
1390 
1391 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1392 
1393 	return 0;
1394 }
1395 
1396 static int __ip6_append_data(struct sock *sk,
1397 			     struct sk_buff_head *queue,
1398 			     struct inet_cork_full *cork_full,
1399 			     struct inet6_cork *v6_cork,
1400 			     struct page_frag *pfrag,
1401 			     int getfrag(void *from, char *to, int offset,
1402 					 int len, int odd, struct sk_buff *skb),
1403 			     void *from, size_t length, int transhdrlen,
1404 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1405 {
1406 	struct sk_buff *skb, *skb_prev = NULL;
1407 	struct inet_cork *cork = &cork_full->base;
1408 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1409 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1410 	struct ubuf_info *uarg = NULL;
1411 	int exthdrlen = 0;
1412 	int dst_exthdrlen = 0;
1413 	int hh_len;
1414 	int copy;
1415 	int err;
1416 	int offset = 0;
1417 	bool zc = false;
1418 	u32 tskey = 0;
1419 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1420 	struct ipv6_txoptions *opt = v6_cork->opt;
1421 	int csummode = CHECKSUM_NONE;
1422 	unsigned int maxnonfragsize, headersize;
1423 	unsigned int wmem_alloc_delta = 0;
1424 	bool paged, extra_uref = false;
1425 
1426 	skb = skb_peek_tail(queue);
1427 	if (!skb) {
1428 		exthdrlen = opt ? opt->opt_flen : 0;
1429 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1430 	}
1431 
1432 	paged = !!cork->gso_size;
1433 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1434 	orig_mtu = mtu;
1435 
1436 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1437 	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
1438 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1439 
1440 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1441 
1442 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1443 			(opt ? opt->opt_nflen : 0);
1444 
1445 	headersize = sizeof(struct ipv6hdr) +
1446 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1447 		     (dst_allfrag(&rt->dst) ?
1448 		      sizeof(struct frag_hdr) : 0) +
1449 		     rt->rt6i_nfheader_len;
1450 
1451 	if (mtu <= fragheaderlen ||
1452 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1453 		goto emsgsize;
1454 
1455 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1456 		     sizeof(struct frag_hdr);
1457 
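	/* Illustrative numbers for the common case (mtu = 1500, no extension
	 * headers, so fragheaderlen = 40): the computation above gives
	 *
	 *	maxfraglen = ((1500 - 40) & ~7) + 40 - sizeof(struct frag_hdr)
	 *		   = 1456 + 40 - 8 = 1488
	 *
	 * i.e. each fragment-sized chunk queued here may hold at most 1488
	 * bytes at the network layer; once the 8-byte fragment header is
	 * inserted at output time the on-wire packet is 1496 bytes and its
	 * fragmentable payload (1448 bytes) stays a multiple of eight.
	 */
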
1458 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1459 	 * the first fragment
1460 	 */
1461 	if (headersize + transhdrlen > mtu)
1462 		goto emsgsize;
1463 
1464 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1465 	    (sk->sk_protocol == IPPROTO_UDP ||
1466 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1467 	     sk->sk_protocol == IPPROTO_RAW)) {
1468 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1469 				sizeof(struct ipv6hdr));
1470 		goto emsgsize;
1471 	}
1472 
1473 	if (ip6_sk_ignore_df(sk))
1474 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1475 	else
1476 		maxnonfragsize = mtu;
1477 
1478 	if (cork->length + length > maxnonfragsize - headersize) {
1479 emsgsize:
1480 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1481 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1482 		return -EMSGSIZE;
1483 	}
1484 
1485 	/* CHECKSUM_PARTIAL only with no extension headers and when
1486 	 * we are not going to fragment
1487 	 */
1488 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1489 	    headersize == sizeof(struct ipv6hdr) &&
1490 	    length <= mtu - headersize &&
1491 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1492 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1493 		csummode = CHECKSUM_PARTIAL;
1494 
1495 	if ((flags & MSG_ZEROCOPY) && length) {
1496 		struct msghdr *msg = from;
1497 
1498 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1499 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1500 				return -EINVAL;
1501 
1502 			/* Leave uarg NULL if we can't zerocopy; callers should
1503 			 * be able to handle it.
1504 			 */
1505 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1506 			    csummode == CHECKSUM_PARTIAL) {
1507 				paged = true;
1508 				zc = true;
1509 				uarg = msg->msg_ubuf;
1510 			}
1511 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1512 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1513 			if (!uarg)
1514 				return -ENOBUFS;
1515 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1516 			if (rt->dst.dev->features & NETIF_F_SG &&
1517 			    csummode == CHECKSUM_PARTIAL) {
1518 				paged = true;
1519 				zc = true;
1520 			} else {
1521 				uarg_to_msgzc(uarg)->zerocopy = 0;
1522 				skb_zcopy_set(skb, uarg, &extra_uref);
1523 			}
1524 		}
1525 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1526 		if (inet_test_bit(HDRINCL, sk))
1527 			return -EPERM;
1528 		if (rt->dst.dev->features & NETIF_F_SG &&
1529 		    getfrag == ip_generic_getfrag)
1530 			/* We need an empty buffer to attach stuff to */
1531 			paged = true;
1532 		else
1533 			flags &= ~MSG_SPLICE_PAGES;
1534 	}
1535 
1536 	/*
1537 	 * Let's try using as much space as possible.
1538 	 * Use MTU if total length of the message fits into the MTU.
1539 	 * Otherwise, we need to reserve fragment header and
1540 	 * fragment alignment (= 8-15 octets, in total).
1541 	 *
1542 	 * Note that we may need to "move" the data from the tail
1543 	 * of the buffer to the new fragment when we split
1544 	 * the message.
1545 	 *
1546 	 * FIXME: It may be fragmented into multiple chunks
1547 	 *        at once if non-fragmentable extension headers
1548 	 *        are too large.
1549 	 * --yoshfuji
1550 	 */
1551 
1552 	cork->length += length;
1553 	if (!skb)
1554 		goto alloc_new_skb;
1555 
1556 	while (length > 0) {
1557 		/* Check if the remaining data fits into current packet. */
1558 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1559 		if (copy < length)
1560 			copy = maxfraglen - skb->len;
1561 
1562 		if (copy <= 0) {
1563 			char *data;
1564 			unsigned int datalen;
1565 			unsigned int fraglen;
1566 			unsigned int fraggap;
1567 			unsigned int alloclen, alloc_extra;
1568 			unsigned int pagedlen;
1569 alloc_new_skb:
1570 			/* There's no room in the current skb */
1571 			if (skb)
1572 				fraggap = skb->len - maxfraglen;
1573 			else
1574 				fraggap = 0;
1575 			/* update mtu and maxfraglen if necessary */
1576 			if (!skb || !skb_prev)
1577 				ip6_append_data_mtu(&mtu, &maxfraglen,
1578 						    fragheaderlen, skb, rt,
1579 						    orig_mtu);
1580 
1581 			skb_prev = skb;
1582 
1583 			/*
1584 			 * If remaining data exceeds the mtu,
1585 			 * we know we need more fragment(s).
1586 			 */
1587 			datalen = length + fraggap;
1588 
1589 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1590 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1591 			fraglen = datalen + fragheaderlen;
1592 			pagedlen = 0;
1593 
1594 			alloc_extra = hh_len;
1595 			alloc_extra += dst_exthdrlen;
1596 			alloc_extra += rt->dst.trailer_len;
1597 
1598 			/* We just reserve space for fragment header.
1599 			 * Note: this may be overallocation if the message
1600 			 * (without MSG_MORE) fits into the MTU.
1601 			 */
1602 			alloc_extra += sizeof(struct frag_hdr);
1603 
1604 			if ((flags & MSG_MORE) &&
1605 			    !(rt->dst.dev->features&NETIF_F_SG))
1606 				alloclen = mtu;
1607 			else if (!paged &&
1608 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1609 				  !(rt->dst.dev->features & NETIF_F_SG)))
1610 				alloclen = fraglen;
1611 			else {
1612 				alloclen = fragheaderlen + transhdrlen;
1613 				pagedlen = datalen - transhdrlen;
1614 			}
1615 			alloclen += alloc_extra;
1616 
1617 			if (datalen != length + fraggap) {
1618 				/*
1619 				 * this is not the last fragment, so the trailer
1620 				 * space is regarded as data space.
1621 				 */
1622 				datalen += rt->dst.trailer_len;
1623 			}
1624 
1625 			fraglen = datalen + fragheaderlen;
1626 
1627 			copy = datalen - transhdrlen - fraggap - pagedlen;
1628 			/* [!] NOTE: copy may be negative if pagedlen>0
1629 			 * because then the equation may reduce to -fraggap.
1630 			 */
1631 			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1632 				err = -EINVAL;
1633 				goto error;
1634 			}
1635 			if (transhdrlen) {
1636 				skb = sock_alloc_send_skb(sk, alloclen,
1637 						(flags & MSG_DONTWAIT), &err);
1638 			} else {
1639 				skb = NULL;
1640 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1641 				    2 * sk->sk_sndbuf)
1642 					skb = alloc_skb(alloclen,
1643 							sk->sk_allocation);
1644 				if (unlikely(!skb))
1645 					err = -ENOBUFS;
1646 			}
1647 			if (!skb)
1648 				goto error;
1649 			/*
1650 			 *	Fill in the control structures
1651 			 */
1652 			skb->protocol = htons(ETH_P_IPV6);
1653 			skb->ip_summed = csummode;
1654 			skb->csum = 0;
1655 			/* reserve for fragmentation and ipsec header */
1656 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1657 				    dst_exthdrlen);
1658 
1659 			/*
1660 			 *	Find where to start putting bytes
1661 			 */
1662 			data = skb_put(skb, fraglen - pagedlen);
1663 			skb_set_network_header(skb, exthdrlen);
1664 			data += fragheaderlen;
1665 			skb->transport_header = (skb->network_header +
1666 						 fragheaderlen);
1667 			if (fraggap) {
1668 				skb->csum = skb_copy_and_csum_bits(
1669 					skb_prev, maxfraglen,
1670 					data + transhdrlen, fraggap);
1671 				skb_prev->csum = csum_sub(skb_prev->csum,
1672 							  skb->csum);
1673 				data += fraggap;
1674 				pskb_trim_unique(skb_prev, maxfraglen);
1675 			}
1676 			if (copy > 0 &&
1677 			    getfrag(from, data + transhdrlen, offset,
1678 				    copy, fraggap, skb) < 0) {
1679 				err = -EFAULT;
1680 				kfree_skb(skb);
1681 				goto error;
1682 			} else if (flags & MSG_SPLICE_PAGES) {
1683 				copy = 0;
1684 			}
1685 
1686 			offset += copy;
1687 			length -= copy + transhdrlen;
1688 			transhdrlen = 0;
1689 			exthdrlen = 0;
1690 			dst_exthdrlen = 0;
1691 
1692 			/* Only the initial fragment is time stamped */
1693 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1694 			cork->tx_flags = 0;
1695 			skb_shinfo(skb)->tskey = tskey;
1696 			tskey = 0;
1697 			skb_zcopy_set(skb, uarg, &extra_uref);
1698 
1699 			if ((flags & MSG_CONFIRM) && !skb_prev)
1700 				skb_set_dst_pending_confirm(skb, 1);
1701 
1702 			/*
1703 			 * Put the packet on the pending queue
1704 			 */
1705 			if (!skb->destructor) {
1706 				skb->destructor = sock_wfree;
1707 				skb->sk = sk;
1708 				wmem_alloc_delta += skb->truesize;
1709 			}
1710 			__skb_queue_tail(queue, skb);
1711 			continue;
1712 		}
1713 
1714 		if (copy > length)
1715 			copy = length;
1716 
1717 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1718 		    skb_tailroom(skb) >= copy) {
1719 			unsigned int off;
1720 
1721 			off = skb->len;
1722 			if (getfrag(from, skb_put(skb, copy),
1723 						offset, copy, off, skb) < 0) {
1724 				__skb_trim(skb, off);
1725 				err = -EFAULT;
1726 				goto error;
1727 			}
1728 		} else if (flags & MSG_SPLICE_PAGES) {
1729 			struct msghdr *msg = from;
1730 
1731 			err = -EIO;
1732 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1733 				goto error;
1734 
1735 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1736 						   sk->sk_allocation);
1737 			if (err < 0)
1738 				goto error;
1739 			copy = err;
1740 			wmem_alloc_delta += copy;
1741 		} else if (!zc) {
1742 			int i = skb_shinfo(skb)->nr_frags;
1743 
1744 			err = -ENOMEM;
1745 			if (!sk_page_frag_refill(sk, pfrag))
1746 				goto error;
1747 
1748 			skb_zcopy_downgrade_managed(skb);
1749 			if (!skb_can_coalesce(skb, i, pfrag->page,
1750 					      pfrag->offset)) {
1751 				err = -EMSGSIZE;
1752 				if (i == MAX_SKB_FRAGS)
1753 					goto error;
1754 
1755 				__skb_fill_page_desc(skb, i, pfrag->page,
1756 						     pfrag->offset, 0);
1757 				skb_shinfo(skb)->nr_frags = ++i;
1758 				get_page(pfrag->page);
1759 			}
1760 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1761 			if (getfrag(from,
1762 				    page_address(pfrag->page) + pfrag->offset,
1763 				    offset, copy, skb->len, skb) < 0)
1764 				goto error_efault;
1765 
1766 			pfrag->offset += copy;
1767 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1768 			skb->len += copy;
1769 			skb->data_len += copy;
1770 			skb->truesize += copy;
1771 			wmem_alloc_delta += copy;
1772 		} else {
1773 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1774 			if (err < 0)
1775 				goto error;
1776 		}
1777 		offset += copy;
1778 		length -= copy;
1779 	}
1780 
1781 	if (wmem_alloc_delta)
1782 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1783 	return 0;
1784 
1785 error_efault:
1786 	err = -EFAULT;
1787 error:
1788 	net_zcopy_put_abort(uarg, extra_uref);
1789 	cork->length -= length;
1790 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1791 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1792 	return err;
1793 }
1794 
1795 int ip6_append_data(struct sock *sk,
1796 		    int getfrag(void *from, char *to, int offset, int len,
1797 				int odd, struct sk_buff *skb),
1798 		    void *from, size_t length, int transhdrlen,
1799 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1800 		    struct rt6_info *rt, unsigned int flags)
1801 {
1802 	struct inet_sock *inet = inet_sk(sk);
1803 	struct ipv6_pinfo *np = inet6_sk(sk);
1804 	int exthdrlen;
1805 	int err;
1806 
1807 	if (flags&MSG_PROBE)
1808 		return 0;
1809 	if (skb_queue_empty(&sk->sk_write_queue)) {
1810 		/*
1811 		 * setup for corking
1812 		 */
1813 		dst_hold(&rt->dst);
1814 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1815 				     ipc6, rt);
1816 		if (err)
1817 			return err;
1818 
1819 		inet->cork.fl.u.ip6 = *fl6;
1820 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1821 		length += exthdrlen;
1822 		transhdrlen += exthdrlen;
1823 	} else {
1824 		transhdrlen = 0;
1825 	}
1826 
1827 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1828 				 &np->cork, sk_page_frag(sk), getfrag,
1829 				 from, length, transhdrlen, flags, ipc6);
1830 }
1831 EXPORT_SYMBOL_GPL(ip6_append_data);
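
/* A minimal sketch of the corked transmit sequence built on top of
 * ip6_append_data(); a real protocol (e.g. UDPv6) adds MSG_MORE/corking
 * logic and its own header setup, which is omitted here:
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *			      &ipc6, &fl6, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 *
 * The first ip6_append_data() call on an empty write queue sets up the cork
 * (ip6_setup_cork()) and takes its own reference on @rt; later calls only
 * append data.  ip6_push_pending_frames() collapses the queue into a single
 * skb (plus frag_list) via ip6_finish_skb()/__ip6_make_skb() and transmits
 * it with ip6_send_skb().
 */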
1832 
1833 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1834 {
1835 	struct dst_entry *dst = cork->base.dst;
1836 
1837 	cork->base.dst = NULL;
1838 	cork->base.flags &= ~IPCORK_ALLFRAG;
1839 	skb_dst_set(skb, dst);
1840 }
1841 
1842 static void ip6_cork_release(struct inet_cork_full *cork,
1843 			     struct inet6_cork *v6_cork)
1844 {
1845 	if (v6_cork->opt) {
1846 		struct ipv6_txoptions *opt = v6_cork->opt;
1847 
1848 		kfree(opt->dst0opt);
1849 		kfree(opt->dst1opt);
1850 		kfree(opt->hopopt);
1851 		kfree(opt->srcrt);
1852 		kfree(opt);
1853 		v6_cork->opt = NULL;
1854 	}
1855 
1856 	if (cork->base.dst) {
1857 		dst_release(cork->base.dst);
1858 		cork->base.dst = NULL;
1859 		cork->base.flags &= ~IPCORK_ALLFRAG;
1860 	}
1861 }
1862 
1863 struct sk_buff *__ip6_make_skb(struct sock *sk,
1864 			       struct sk_buff_head *queue,
1865 			       struct inet_cork_full *cork,
1866 			       struct inet6_cork *v6_cork)
1867 {
1868 	struct sk_buff *skb, *tmp_skb;
1869 	struct sk_buff **tail_skb;
1870 	struct in6_addr *final_dst;
1871 	struct net *net = sock_net(sk);
1872 	struct ipv6hdr *hdr;
1873 	struct ipv6_txoptions *opt = v6_cork->opt;
1874 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1875 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1876 	unsigned char proto = fl6->flowi6_proto;
1877 
1878 	skb = __skb_dequeue(queue);
1879 	if (!skb)
1880 		goto out;
1881 	tail_skb = &(skb_shinfo(skb)->frag_list);
1882 
1883 	/* move skb->data to ip header from ext header */
1884 	if (skb->data < skb_network_header(skb))
1885 		__skb_pull(skb, skb_network_offset(skb));
1886 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1887 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1888 		*tail_skb = tmp_skb;
1889 		tail_skb = &(tmp_skb->next);
1890 		skb->len += tmp_skb->len;
1891 		skb->data_len += tmp_skb->len;
1892 		skb->truesize += tmp_skb->truesize;
1893 		tmp_skb->destructor = NULL;
1894 		tmp_skb->sk = NULL;
1895 	}
1896 
1897 	/* Allow local fragmentation. */
1898 	skb->ignore_df = ip6_sk_ignore_df(sk);
1899 	__skb_pull(skb, skb_network_header_len(skb));
1900 
1901 	final_dst = &fl6->daddr;
1902 	if (opt && opt->opt_flen)
1903 		ipv6_push_frag_opts(skb, opt, &proto);
1904 	if (opt && opt->opt_nflen)
1905 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1906 
1907 	skb_push(skb, sizeof(struct ipv6hdr));
1908 	skb_reset_network_header(skb);
1909 	hdr = ipv6_hdr(skb);
1910 
1911 	ip6_flow_hdr(hdr, v6_cork->tclass,
1912 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1913 					ip6_autoflowlabel(net, sk), fl6));
1914 	hdr->hop_limit = v6_cork->hop_limit;
1915 	hdr->nexthdr = proto;
1916 	hdr->saddr = fl6->saddr;
1917 	hdr->daddr = *final_dst;
1918 
1919 	skb->priority = READ_ONCE(sk->sk_priority);
1920 	skb->mark = cork->base.mark;
1921 	skb->tstamp = cork->base.transmit_time;
1922 
1923 	ip6_cork_steal_dst(skb, cork);
1924 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1925 	if (proto == IPPROTO_ICMPV6) {
1926 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1927 		u8 icmp6_type;
1928 
1929 		if (sk->sk_socket->type == SOCK_RAW &&
1930 		   !inet_test_bit(HDRINCL, sk))
1931 			icmp6_type = fl6->fl6_icmp_type;
1932 		else
1933 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
1934 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1935 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1936 	}
1937 
1938 	ip6_cork_release(cork, v6_cork);
1939 out:
1940 	return skb;
1941 }
1942 
1943 int ip6_send_skb(struct sk_buff *skb)
1944 {
1945 	struct net *net = sock_net(skb->sk);
1946 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1947 	int err;
1948 
1949 	err = ip6_local_out(net, skb->sk, skb);
1950 	if (err) {
1951 		if (err > 0)
1952 			err = net_xmit_errno(err);
1953 		if (err)
1954 			IP6_INC_STATS(net, rt->rt6i_idev,
1955 				      IPSTATS_MIB_OUTDISCARDS);
1956 	}
1957 
1958 	return err;
1959 }
1960 
1961 int ip6_push_pending_frames(struct sock *sk)
1962 {
1963 	struct sk_buff *skb;
1964 
1965 	skb = ip6_finish_skb(sk);
1966 	if (!skb)
1967 		return 0;
1968 
1969 	return ip6_send_skb(skb);
1970 }
1971 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1972 
1973 static void __ip6_flush_pending_frames(struct sock *sk,
1974 				       struct sk_buff_head *queue,
1975 				       struct inet_cork_full *cork,
1976 				       struct inet6_cork *v6_cork)
1977 {
1978 	struct sk_buff *skb;
1979 
1980 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1981 		if (skb_dst(skb))
1982 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1983 				      IPSTATS_MIB_OUTDISCARDS);
1984 		kfree_skb(skb);
1985 	}
1986 
1987 	ip6_cork_release(cork, v6_cork);
1988 }
1989 
1990 void ip6_flush_pending_frames(struct sock *sk)
1991 {
1992 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1993 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1994 }
1995 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1996 
1997 struct sk_buff *ip6_make_skb(struct sock *sk,
1998 			     int getfrag(void *from, char *to, int offset,
1999 					 int len, int odd, struct sk_buff *skb),
2000 			     void *from, size_t length, int transhdrlen,
2001 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2002 			     unsigned int flags, struct inet_cork_full *cork)
2003 {
2004 	struct inet6_cork v6_cork;
2005 	struct sk_buff_head queue;
2006 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2007 	int err;
2008 
2009 	if (flags & MSG_PROBE) {
2010 		dst_release(&rt->dst);
2011 		return NULL;
2012 	}
2013 
2014 	__skb_queue_head_init(&queue);
2015 
2016 	cork->base.flags = 0;
2017 	cork->base.addr = 0;
2018 	cork->base.opt = NULL;
2019 	v6_cork.opt = NULL;
2020 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2021 	if (err) {
2022 		ip6_cork_release(cork, &v6_cork);
2023 		return ERR_PTR(err);
2024 	}
2025 	if (ipc6->dontfrag < 0)
2026 		ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);
2027 
2028 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2029 				&current->task_frag, getfrag, from,
2030 				length + exthdrlen, transhdrlen + exthdrlen,
2031 				flags, ipc6);
2032 	if (err) {
2033 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2034 		return ERR_PTR(err);
2035 	}
2036 
2037 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2038 }
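
/* ip6_make_skb() above is the uncorked counterpart of ip6_append_data() +
 * ip6_push_pending_frames(): it runs ip6_setup_cork(), __ip6_append_data()
 * and __ip6_make_skb() against a local queue and a caller-provided cork
 * (typically on the caller's stack), leaving the socket's own corking state
 * untouched.  A rough usage sketch with error handling elided; the finishing
 * helper name stands in for whatever per-protocol send function the caller
 * uses:
 *
 *	struct inet_cork_full cork;
 *	struct sk_buff *skb;
 *
 *	skb = ip6_make_skb(sk, getfrag, msg, len, transhdrlen,
 *			   &ipc6, rt, msg->msg_flags, &cork);
 *	if (!IS_ERR_OR_NULL(skb))
 *		err = send_finished_skb(skb, &fl6, &cork.base);
 *
 * The lockless UDPv6 fast path is the main in-tree user of this interface.
 */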
2039