xref: /linux/net/ipv6/ip6_output.c (revision 6f19b2c136d98a84d79030b53e23d405edfdc783)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		skb = skb_expand_head(skb, hh_len);
74 		if (!skb) {
75 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76 			return -ENOMEM;
77 		}
78 	}
79 
80 	hdr = ipv6_hdr(skb);
81 	daddr = &hdr->daddr;
82 	if (ipv6_addr_is_multicast(daddr)) {
83 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84 		    ((mroute6_is_socket(net, skb) &&
85 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 
89 			/* Do not check for IFF_ALLMULTI; multicast routing
90 			   is not supported in any case.
91 			 */
92 			if (newskb)
93 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94 					net, sk, newskb, NULL, newskb->dev,
95 					dev_loopback_xmit);
96 
97 			if (hdr->hop_limit == 0) {
98 				IP6_INC_STATS(net, idev,
99 					      IPSTATS_MIB_OUTDISCARDS);
100 				kfree_skb(skb);
101 				return 0;
102 			}
103 		}
104 
105 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107 		    !(dev->flags & IFF_LOOPBACK)) {
108 			kfree_skb(skb);
109 			return 0;
110 		}
111 	}
112 
113 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114 		int res = lwtunnel_xmit(skb);
115 
116 		if (res != LWTUNNEL_XMIT_CONTINUE)
117 			return res;
118 	}
119 
120 	rcu_read_lock();
121 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
122 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
123 
124 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
125 		if (unlikely(!neigh))
126 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
127 		if (IS_ERR(neigh)) {
128 			rcu_read_unlock();
129 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
130 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
131 			return -EINVAL;
132 		}
133 	}
134 	sock_confirm_neigh(skb, neigh);
135 	ret = neigh_output(neigh, skb, false);
136 	rcu_read_unlock();
137 	return ret;
138 }
139 
140 static int
141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142 				    struct sk_buff *skb, unsigned int mtu)
143 {
144 	struct sk_buff *segs, *nskb;
145 	netdev_features_t features;
146 	int ret = 0;
147 
148 	/* Please see corresponding comment in ip_finish_output_gso
149 	 * describing the cases where GSO segment length exceeds the
150 	 * egress MTU.
151 	 */
152 	features = netif_skb_features(skb);
153 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154 	if (IS_ERR_OR_NULL(segs)) {
155 		kfree_skb(skb);
156 		return -ENOMEM;
157 	}
158 
159 	consume_skb(skb);
160 
161 	skb_list_walk_safe(segs, segs, nskb) {
162 		int err;
163 
164 		skb_mark_not_on_list(segs);
165 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
166 		if (err && ret == 0)
167 			ret = err;
168 	}
169 
170 	return ret;
171 }
172 
173 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
174 {
175 	unsigned int mtu;
176 
177 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
178 	/* Policy lookup after SNAT yielded a new policy */
179 	if (skb_dst(skb)->xfrm) {
180 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
181 		return dst_output(net, sk, skb);
182 	}
183 #endif
184 
185 	mtu = ip6_skb_dst_mtu(skb);
186 	if (skb_is_gso(skb) &&
187 	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
188 	    !skb_gso_validate_network_len(skb, mtu))
189 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
190 
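	/* Fragment when a non-GSO packet exceeds the route MTU, when the
	 * route requires a fragment header on every packet (dst_allfrag,
	 * i.e. the peer reported a path MTU below IPV6_MIN_MTU), or when
	 * conntrack defrag recorded an incoming fragment size
	 * (frag_max_size) that this packet would exceed.
	 */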
191 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
192 	    dst_allfrag(skb_dst(skb)) ||
193 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
194 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
195 	else
196 		return ip6_finish_output2(net, sk, skb);
197 }
198 
199 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
200 {
201 	int ret;
202 
203 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
204 	switch (ret) {
205 	case NET_XMIT_SUCCESS:
206 	case NET_XMIT_CN:
207 		return __ip6_finish_output(net, sk, skb) ? : ret;
208 	default:
209 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
210 		return ret;
211 	}
212 }
213 
214 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
215 {
216 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
217 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
218 
219 	skb->protocol = htons(ETH_P_IPV6);
220 	skb->dev = dev;
221 
222 	if (unlikely(idev->cnf.disable_ipv6)) {
223 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
224 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
225 		return 0;
226 	}
227 
228 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
229 			    net, sk, skb, indev, dev,
230 			    ip6_finish_output,
231 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
232 }
233 EXPORT_SYMBOL(ip6_output);
234 
235 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
236 {
237 	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
238 		return ip6_default_np_autolabel(net);
239 	return inet6_test_bit(AUTOFLOWLABEL, sk);
240 }
241 
242 /*
243  * xmit an sk_buff (used by TCP, SCTP and DCCP)
244  * Note: the socket lock is not held for SYNACK packets, but the skb might
245  * still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
246  * which use proper atomic operations or spinlocks.
247  */
248 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
249 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
250 {
251 	struct net *net = sock_net(sk);
252 	const struct ipv6_pinfo *np = inet6_sk(sk);
253 	struct in6_addr *first_hop = &fl6->daddr;
254 	struct dst_entry *dst = skb_dst(skb);
255 	struct net_device *dev = dst->dev;
256 	struct inet6_dev *idev = ip6_dst_idev(dst);
257 	struct hop_jumbo_hdr *hop_jumbo;
258 	int hoplen = sizeof(*hop_jumbo);
259 	unsigned int head_room;
260 	struct ipv6hdr *hdr;
261 	u8  proto = fl6->flowi6_proto;
262 	int seg_len = skb->len;
263 	int hlimit = -1;
264 	u32 mtu;
265 
266 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
267 	if (opt)
268 		head_room += opt->opt_nflen + opt->opt_flen;
269 
270 	if (unlikely(head_room > skb_headroom(skb))) {
271 		skb = skb_expand_head(skb, head_room);
272 		if (!skb) {
273 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
274 			return -ENOBUFS;
275 		}
276 	}
277 
278 	if (opt) {
279 		seg_len += opt->opt_nflen + opt->opt_flen;
280 
281 		if (opt->opt_flen)
282 			ipv6_push_frag_opts(skb, opt, &proto);
283 
284 		if (opt->opt_nflen)
285 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
286 					     &fl6->saddr);
287 	}
288 
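	/* A 16-bit payload_len cannot describe packets larger than
	 * IPV6_MAXPLEN (65535 bytes), so such packets get a temporary
	 * hop-by-hop jumbo option and are flagged IP6SKB_FAKEJUMBO.
	 * They are only expected here as oversized GSO ("BIG TCP") skbs
	 * that will be segmented before hitting the wire.
	 */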
289 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
290 		hop_jumbo = skb_push(skb, hoplen);
291 
292 		hop_jumbo->nexthdr = proto;
293 		hop_jumbo->hdrlen = 0;
294 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
295 		hop_jumbo->tlv_len = 4;
296 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
297 
298 		proto = IPPROTO_HOPOPTS;
299 		seg_len = 0;
300 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
301 	}
302 
303 	skb_push(skb, sizeof(struct ipv6hdr));
304 	skb_reset_network_header(skb);
305 	hdr = ipv6_hdr(skb);
306 
307 	/*
308 	 *	Fill in the IPv6 header
309 	 */
310 	if (np)
311 		hlimit = READ_ONCE(np->hop_limit);
312 	if (hlimit < 0)
313 		hlimit = ip6_dst_hoplimit(dst);
314 
315 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
316 				ip6_autoflowlabel(net, sk), fl6));
317 
318 	hdr->payload_len = htons(seg_len);
319 	hdr->nexthdr = proto;
320 	hdr->hop_limit = hlimit;
321 
322 	hdr->saddr = fl6->saddr;
323 	hdr->daddr = *first_hop;
324 
325 	skb->protocol = htons(ETH_P_IPV6);
326 	skb->priority = priority;
327 	skb->mark = mark;
328 
329 	mtu = dst_mtu(dst);
330 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
331 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
332 
333 		/* if the egress device is enslaved to an L3 master device, pass
334 		 * the skb to its handler for processing
335 		 */
336 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
337 		if (unlikely(!skb))
338 			return 0;
339 
340 		/* hooks should never assume the socket lock is held;
341 		 * we promote our socket to non-const
342 		 */
343 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
344 			       net, (struct sock *)sk, skb, NULL, dev,
345 			       dst_output);
346 	}
347 
348 	skb->dev = dev;
349 	/* ipv6_local_error() does not require the socket lock,
350 	 * so we promote our socket to non-const
351 	 */
352 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
353 
354 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
355 	kfree_skb(skb);
356 	return -EMSGSIZE;
357 }
358 EXPORT_SYMBOL(ip6_xmit);
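
/* Illustrative sketch (not part of this file): a transport caller such as
 * inet6_csk_xmit() is expected to attach a valid dst and fill a flowi6
 * before handing the packet over, roughly:
 *
 *	skb_dst_set_noref(skb, dst);
 *	res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
 *		       np->tclass, READ_ONCE(sk->sk_priority));
 *
 * ip6_xmit() then pushes the extension headers and the IPv6 header and runs
 * the NF_INET_LOCAL_OUT hook; it never performs the route lookup itself.
 */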
359 
360 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
361 {
362 	struct ip6_ra_chain *ra;
363 	struct sock *last = NULL;
364 
365 	read_lock(&ip6_ra_lock);
366 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
367 		struct sock *sk = ra->sk;
368 		if (sk && ra->sel == sel &&
369 		    (!sk->sk_bound_dev_if ||
370 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
371 
372 			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
373 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
374 				continue;
375 			}
376 			if (last) {
377 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
378 				if (skb2)
379 					rawv6_rcv(last, skb2);
380 			}
381 			last = sk;
382 		}
383 	}
384 
385 	if (last) {
386 		rawv6_rcv(last, skb);
387 		read_unlock(&ip6_ra_lock);
388 		return 1;
389 	}
390 	read_unlock(&ip6_ra_lock);
391 	return 0;
392 }
393 
394 static int ip6_forward_proxy_check(struct sk_buff *skb)
395 {
396 	struct ipv6hdr *hdr = ipv6_hdr(skb);
397 	u8 nexthdr = hdr->nexthdr;
398 	__be16 frag_off;
399 	int offset;
400 
401 	if (ipv6_ext_hdr(nexthdr)) {
402 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
403 		if (offset < 0)
404 			return 0;
405 	} else
406 		offset = sizeof(struct ipv6hdr);
407 
408 	if (nexthdr == IPPROTO_ICMPV6) {
409 		struct icmp6hdr *icmp6;
410 
411 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
412 					 offset + 1 - skb->data)))
413 			return 0;
414 
415 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
416 
417 		switch (icmp6->icmp6_type) {
418 		case NDISC_ROUTER_SOLICITATION:
419 		case NDISC_ROUTER_ADVERTISEMENT:
420 		case NDISC_NEIGHBOUR_SOLICITATION:
421 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
422 		case NDISC_REDIRECT:
423 			/* Unicast neighbour discovery messages destined to
424 			 * the proxied address must be handled locally, so
425 			 * pass them to the input function.
426 			 */
427 			return 1;
428 		default:
429 			break;
430 		}
431 	}
432 
433 	/*
434 	 * The proxying router can't forward traffic sent to a link-local
435 	 * address, so signal the sender and discard the packet. This
436 	 * behavior is clarified by the MIPv6 specification.
437 	 */
438 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
439 		dst_link_failure(skb);
440 		return -1;
441 	}
442 
443 	return 0;
444 }
445 
446 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
447 				     struct sk_buff *skb)
448 {
449 #ifdef CONFIG_NET_SWITCHDEV
450 	if (skb->offload_l3_fwd_mark) {
451 		consume_skb(skb);
452 		return 0;
453 	}
454 #endif
455 
456 	skb_clear_tstamp(skb);
457 	return dst_output(net, sk, skb);
458 }
459 
460 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
461 {
462 	if (skb->len <= mtu)
463 		return false;
464 
465 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
466 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
467 		return true;
468 
469 	if (skb->ignore_df)
470 		return false;
471 
472 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
473 		return false;
474 
475 	return true;
476 }
477 
478 int ip6_forward(struct sk_buff *skb)
479 {
480 	struct dst_entry *dst = skb_dst(skb);
481 	struct ipv6hdr *hdr = ipv6_hdr(skb);
482 	struct inet6_skb_parm *opt = IP6CB(skb);
483 	struct net *net = dev_net(dst->dev);
484 	struct inet6_dev *idev;
485 	SKB_DR(reason);
486 	u32 mtu;
487 
488 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
489 	if (net->ipv6.devconf_all->forwarding == 0)
490 		goto error;
491 
492 	if (skb->pkt_type != PACKET_HOST)
493 		goto drop;
494 
495 	if (unlikely(skb->sk))
496 		goto drop;
497 
498 	if (skb_warn_if_lro(skb))
499 		goto drop;
500 
501 	if (!net->ipv6.devconf_all->disable_policy &&
502 	    (!idev || !idev->cnf.disable_policy) &&
503 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
504 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
505 		goto drop;
506 	}
507 
508 	skb_forward_csum(skb);
509 
510 	/*
511 	 *	We do NOT do any processing on Router Alert
512 	 *	packets; they are pushed to user level AS IS,
513 	 *	without any warranty that the application will
514 	 *	be able to interpret them. The reason is that we
515 	 *	cannot do anything clever here.
516 	 *
517 	 *	We are not the end node, so if the packet contains
518 	 *	AH/ESP we cannot do anything with it.
519 	 *	Defragmentation would also be a mistake: RA packets
520 	 *	cannot be fragmented, because there is no guarantee
521 	 *	that different fragments will follow the same path. --ANK
522 	 */
523 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
524 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
525 			return 0;
526 	}
527 
528 	/*
529 	 *	check and decrement ttl
530 	 */
531 	if (hdr->hop_limit <= 1) {
532 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
533 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
534 
535 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
536 		return -ETIMEDOUT;
537 	}
538 
539 	/* XXX: idev->cnf.proxy_ndp? */
540 	if (net->ipv6.devconf_all->proxy_ndp &&
541 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
542 		int proxied = ip6_forward_proxy_check(skb);
543 		if (proxied > 0) {
544 			/* It's tempting to decrease the hop limit
545 			 * here by 1, as we do at the end of the
546 			 * function too.
547 			 *
548 			 * But that would be incorrect, as proxying is
549 			 * not forwarding.  The ip6_input function
550 			 * will handle this packet locally, and it
551 			 * depends on the hop limit being unchanged.
552 			 *
553 			 * One example is the NDP hop limit, which
554 			 * always has to stay 255; another would be
555 			 * similar checks around RA packets, where the
556 			 * user can even change the desired limit.
557 			 */
558 			return ip6_input(skb);
559 		} else if (proxied < 0) {
560 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
561 			goto drop;
562 		}
563 	}
564 
565 	if (!xfrm6_route_forward(skb)) {
566 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
567 		SKB_DR_SET(reason, XFRM_POLICY);
568 		goto drop;
569 	}
570 	dst = skb_dst(skb);
571 
572 	/* IPv6 specs say nothing about it, but it is clear that we cannot
573 	   send redirects to source-routed frames.
574 	   We don't send redirects to frames decapsulated from IPsec.
575 	 */
576 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
577 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
578 		struct in6_addr *target = NULL;
579 		struct inet_peer *peer;
580 		struct rt6_info *rt;
581 
582 		/*
583 		 *	incoming and outgoing devices are the same,
584 		 *	so send a redirect.
585 		 */
586 
587 		rt = (struct rt6_info *) dst;
588 		if (rt->rt6i_flags & RTF_GATEWAY)
589 			target = &rt->rt6i_gateway;
590 		else
591 			target = &hdr->daddr;
592 
593 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
594 
595 		/* Limit redirects both by destination (here)
596 		   and by source (inside ndisc_send_redirect)
597 		 */
598 		if (inet_peer_xrlim_allow(peer, 1*HZ))
599 			ndisc_send_redirect(skb, target);
600 		if (peer)
601 			inet_putpeer(peer);
602 	} else {
603 		int addrtype = ipv6_addr_type(&hdr->saddr);
604 
605 		/* This check is security critical. */
606 		if (addrtype == IPV6_ADDR_ANY ||
607 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
608 			goto error;
609 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
610 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
611 				    ICMPV6_NOT_NEIGHBOUR, 0);
612 			goto error;
613 		}
614 	}
615 
616 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
617 
618 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
619 	if (mtu < IPV6_MIN_MTU)
620 		mtu = IPV6_MIN_MTU;
621 
622 	if (ip6_pkt_too_big(skb, mtu)) {
623 		/* Again, force the OUTPUT device to be used for the source address */
624 		skb->dev = dst->dev;
625 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
626 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
627 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
628 				IPSTATS_MIB_FRAGFAILS);
629 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
630 		return -EMSGSIZE;
631 	}
632 
633 	if (skb_cow(skb, dst->dev->hard_header_len)) {
634 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
635 				IPSTATS_MIB_OUTDISCARDS);
636 		goto drop;
637 	}
638 
639 	hdr = ipv6_hdr(skb);
640 
641 	/* Mangling the hop limit is delayed until after the skb COW */
642 
643 	hdr->hop_limit--;
644 
645 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
646 		       net, NULL, skb, skb->dev, dst->dev,
647 		       ip6_forward_finish);
648 
649 error:
650 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
651 	SKB_DR_SET(reason, IP_INADDRERRORS);
652 drop:
653 	kfree_skb_reason(skb, reason);
654 	return -EINVAL;
655 }
656 
657 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
658 {
659 	to->pkt_type = from->pkt_type;
660 	to->priority = from->priority;
661 	to->protocol = from->protocol;
662 	skb_dst_drop(to);
663 	skb_dst_set(to, dst_clone(skb_dst(from)));
664 	to->dev = from->dev;
665 	to->mark = from->mark;
666 
667 	skb_copy_hash(to, from);
668 
669 #ifdef CONFIG_NET_SCHED
670 	to->tc_index = from->tc_index;
671 #endif
672 	nf_copy(to, from);
673 	skb_ext_copy(to, from);
674 	skb_copy_secmark(to, from);
675 }
676 
677 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
678 		      u8 nexthdr, __be32 frag_id,
679 		      struct ip6_fraglist_iter *iter)
680 {
681 	unsigned int first_len;
682 	struct frag_hdr *fh;
683 
684 	/* BUILD HEADER */
685 	*prevhdr = NEXTHDR_FRAGMENT;
686 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
687 	if (!iter->tmp_hdr)
688 		return -ENOMEM;
689 
690 	iter->frag = skb_shinfo(skb)->frag_list;
691 	skb_frag_list_init(skb);
692 
693 	iter->offset = 0;
694 	iter->hlen = hlen;
695 	iter->frag_id = frag_id;
696 	iter->nexthdr = nexthdr;
697 
698 	__skb_pull(skb, hlen);
699 	fh = __skb_push(skb, sizeof(struct frag_hdr));
700 	__skb_push(skb, hlen);
701 	skb_reset_network_header(skb);
702 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
703 
704 	fh->nexthdr = nexthdr;
705 	fh->reserved = 0;
706 	fh->frag_off = htons(IP6_MF);
707 	fh->identification = frag_id;
708 
709 	first_len = skb_pagelen(skb);
710 	skb->data_len = first_len - skb_headlen(skb);
711 	skb->len = first_len;
712 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
713 
714 	return 0;
715 }
716 EXPORT_SYMBOL(ip6_fraglist_init);
717 
718 void ip6_fraglist_prepare(struct sk_buff *skb,
719 			  struct ip6_fraglist_iter *iter)
720 {
721 	struct sk_buff *frag = iter->frag;
722 	unsigned int hlen = iter->hlen;
723 	struct frag_hdr *fh;
724 
725 	frag->ip_summed = CHECKSUM_NONE;
726 	skb_reset_transport_header(frag);
727 	fh = __skb_push(frag, sizeof(struct frag_hdr));
728 	__skb_push(frag, hlen);
729 	skb_reset_network_header(frag);
730 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
731 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
732 	fh->nexthdr = iter->nexthdr;
733 	fh->reserved = 0;
734 	fh->frag_off = htons(iter->offset);
735 	if (frag->next)
736 		fh->frag_off |= htons(IP6_MF);
737 	fh->identification = iter->frag_id;
738 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
739 	ip6_copy_metadata(frag, skb);
740 }
741 EXPORT_SYMBOL(ip6_fraglist_prepare);
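
/* The fraglist helpers above are exported because they are also used outside
 * this file (e.g. by the bridge netfilter fragmentation code). The expected
 * calling pattern mirrors the fast path in ip6_fragment() below:
 * ip6_fraglist_init() once, then ip6_fraglist_prepare()/output/
 * ip6_fraglist_next() per fragment, and finally kfree(iter.tmp_hdr).
 */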
742 
743 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
744 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
745 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
746 {
747 	state->prevhdr = prevhdr;
748 	state->nexthdr = nexthdr;
749 	state->frag_id = frag_id;
750 
751 	state->hlen = hlen;
752 	state->mtu = mtu;
753 
754 	state->left = skb->len - hlen;	/* Space per frame */
755 	state->ptr = hlen;		/* Where to start from */
756 
757 	state->hroom = hdr_room;
758 	state->troom = needed_tailroom;
759 
760 	state->offset = 0;
761 }
762 EXPORT_SYMBOL(ip6_frag_init);
763 
764 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
765 {
766 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
767 	struct sk_buff *frag;
768 	struct frag_hdr *fh;
769 	unsigned int len;
770 
771 	len = state->left;
772 	/* IF: it doesn't fit, use 'mtu' - the data space left */
773 	if (len > state->mtu)
774 		len = state->mtu;
775 	/* IF: we are not sending up to and including the packet end
776 	   then align the next start on an eight byte boundary */
777 	if (len < state->left)
778 		len &= ~7;
779 
780 	/* Allocate buffer */
781 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
782 			 state->hroom + state->troom, GFP_ATOMIC);
783 	if (!frag)
784 		return ERR_PTR(-ENOMEM);
785 
786 	/*
787 	 *	Set up data on packet
788 	 */
789 
790 	ip6_copy_metadata(frag, skb);
791 	skb_reserve(frag, state->hroom);
792 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
793 	skb_reset_network_header(frag);
794 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
795 	frag->transport_header = (frag->network_header + state->hlen +
796 				  sizeof(struct frag_hdr));
797 
798 	/*
799 	 *	Charge the memory for the fragment to any owner
800 	 *	it might possess
801 	 */
802 	if (skb->sk)
803 		skb_set_owner_w(frag, skb->sk);
804 
805 	/*
806 	 *	Copy the packet header into the new buffer.
807 	 */
808 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
809 
810 	fragnexthdr_offset = skb_network_header(frag);
811 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
812 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
813 
814 	/*
815 	 *	Build fragment header.
816 	 */
817 	fh->nexthdr = state->nexthdr;
818 	fh->reserved = 0;
819 	fh->identification = state->frag_id;
820 
821 	/*
822 	 *	Copy a block of the IP datagram.
823 	 */
824 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
825 			     len));
826 	state->left -= len;
827 
828 	fh->frag_off = htons(state->offset);
829 	if (state->left > 0)
830 		fh->frag_off |= htons(IP6_MF);
831 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
832 
833 	state->ptr += len;
834 	state->offset += len;
835 
836 	return frag;
837 }
838 EXPORT_SYMBOL(ip6_frag_next);
839 
840 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
841 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
842 {
843 	struct sk_buff *frag;
844 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
845 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
846 				inet6_sk(skb->sk) : NULL;
847 	bool mono_delivery_time = skb->mono_delivery_time;
848 	struct ip6_frag_state state;
849 	unsigned int mtu, hlen, nexthdr_offset;
850 	ktime_t tstamp = skb->tstamp;
851 	int hroom, err = 0;
852 	__be32 frag_id;
853 	u8 *prevhdr, nexthdr = 0;
854 
855 	err = ip6_find_1stfragopt(skb, &prevhdr);
856 	if (err < 0)
857 		goto fail;
858 	hlen = err;
859 	nexthdr = *prevhdr;
860 	nexthdr_offset = prevhdr - skb_network_header(skb);
861 
862 	mtu = ip6_skb_dst_mtu(skb);
863 
864 	/* We must not fragment if the socket is set to force MTU discovery
865 	 * or if the skb was not generated by a local socket.
866 	 */
867 	if (unlikely(!skb->ignore_df && skb->len > mtu))
868 		goto fail_toobig;
869 
870 	if (IP6CB(skb)->frag_max_size) {
871 		if (IP6CB(skb)->frag_max_size > mtu)
872 			goto fail_toobig;
873 
874 		/* don't send fragments larger than what we received */
875 		mtu = IP6CB(skb)->frag_max_size;
876 		if (mtu < IPV6_MIN_MTU)
877 			mtu = IPV6_MIN_MTU;
878 	}
879 
880 	if (np) {
881 		u32 frag_size = READ_ONCE(np->frag_size);
882 
883 		if (frag_size && frag_size < mtu)
884 			mtu = frag_size;
885 	}
886 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
887 		goto fail_toobig;
888 	mtu -= hlen + sizeof(struct frag_hdr);
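	/* Illustrative example: with a 1500-byte MTU and hlen = 40 (a bare
	 * IPv6 header before the fragment header), mtu becomes
	 * 1500 - 48 = 1452 bytes of fragmentable payload per fragment;
	 * ip6_frag_next() then rounds every non-final fragment down to a
	 * multiple of 8, i.e. 1448 data bytes and 1496-byte packets.
	 */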
889 
890 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
891 				    &ipv6_hdr(skb)->saddr);
892 
893 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
894 	    (err = skb_checksum_help(skb)))
895 		goto fail;
896 
897 	prevhdr = skb_network_header(skb) + nexthdr_offset;
898 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
899 	if (skb_has_frag_list(skb)) {
900 		unsigned int first_len = skb_pagelen(skb);
901 		struct ip6_fraglist_iter iter;
902 		struct sk_buff *frag2;
903 
904 		if (first_len - hlen > mtu ||
905 		    ((first_len - hlen) & 7) ||
906 		    skb_cloned(skb) ||
907 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
908 			goto slow_path;
909 
910 		skb_walk_frags(skb, frag) {
911 			/* Correct geometry. */
912 			if (frag->len > mtu ||
913 			    ((frag->len & 7) && frag->next) ||
914 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
915 				goto slow_path_clean;
916 
917 			/* Partially cloned skb? */
918 			if (skb_shared(frag))
919 				goto slow_path_clean;
920 
921 			BUG_ON(frag->sk);
922 			if (skb->sk) {
923 				frag->sk = skb->sk;
924 				frag->destructor = sock_wfree;
925 			}
926 			skb->truesize -= frag->truesize;
927 		}
928 
929 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
930 					&iter);
931 		if (err < 0)
932 			goto fail;
933 
934 		/* We prevent @rt from being freed. */
935 		rcu_read_lock();
936 
937 		for (;;) {
938 			/* Prepare the header of the next frame
939 			 * before the previous one is sent. */
940 			if (iter.frag)
941 				ip6_fraglist_prepare(skb, &iter);
942 
943 			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
944 			err = output(net, sk, skb);
945 			if (!err)
946 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
947 					      IPSTATS_MIB_FRAGCREATES);
948 
949 			if (err || !iter.frag)
950 				break;
951 
952 			skb = ip6_fraglist_next(&iter);
953 		}
954 
955 		kfree(iter.tmp_hdr);
956 
957 		if (err == 0) {
958 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
959 				      IPSTATS_MIB_FRAGOKS);
960 			rcu_read_unlock();
961 			return 0;
962 		}
963 
964 		kfree_skb_list(iter.frag);
965 
966 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
967 			      IPSTATS_MIB_FRAGFAILS);
968 		rcu_read_unlock();
969 		return err;
970 
971 slow_path_clean:
972 		skb_walk_frags(skb, frag2) {
973 			if (frag2 == frag)
974 				break;
975 			frag2->sk = NULL;
976 			frag2->destructor = NULL;
977 			skb->truesize += frag2->truesize;
978 		}
979 	}
980 
981 slow_path:
982 	/*
983 	 *	Fragment the datagram.
984 	 */
985 
986 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
987 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
988 		      &state);
989 
990 	/*
991 	 *	Keep copying data until we run out.
992 	 */
993 
994 	while (state.left > 0) {
995 		frag = ip6_frag_next(skb, &state);
996 		if (IS_ERR(frag)) {
997 			err = PTR_ERR(frag);
998 			goto fail;
999 		}
1000 
1001 		/*
1002 		 *	Put this fragment into the sending queue.
1003 		 */
1004 		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1005 		err = output(net, sk, frag);
1006 		if (err)
1007 			goto fail;
1008 
1009 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1010 			      IPSTATS_MIB_FRAGCREATES);
1011 	}
1012 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1013 		      IPSTATS_MIB_FRAGOKS);
1014 	consume_skb(skb);
1015 	return err;
1016 
1017 fail_toobig:
1018 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1019 		sk_gso_disable(skb->sk);
1020 
1021 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1022 	err = -EMSGSIZE;
1023 
1024 fail:
1025 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1026 		      IPSTATS_MIB_FRAGFAILS);
1027 	kfree_skb(skb);
1028 	return err;
1029 }
1030 
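/* Helper for ip6_sk_dst_check(): returns nonzero when @fl_addr is neither
 * matched by a cached /128 host route (@rt_key) nor confirmed by the
 * socket's saved address (@addr_cache), i.e. the cached route cannot be
 * trusted for this flow.
 */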
1031 static inline int ip6_rt_check(const struct rt6key *rt_key,
1032 			       const struct in6_addr *fl_addr,
1033 			       const struct in6_addr *addr_cache)
1034 {
1035 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1036 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1037 }
1038 
1039 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1040 					  struct dst_entry *dst,
1041 					  const struct flowi6 *fl6)
1042 {
1043 	struct ipv6_pinfo *np = inet6_sk(sk);
1044 	struct rt6_info *rt;
1045 
1046 	if (!dst)
1047 		goto out;
1048 
1049 	if (dst->ops->family != AF_INET6) {
1050 		dst_release(dst);
1051 		return NULL;
1052 	}
1053 
1054 	rt = (struct rt6_info *)dst;
1055 	/* Yes, checking route validity in the unconnected
1056 	 * case is not very simple. Take into account
1057 	 * that we do not support routing by source, TOS,
1058 	 * or MSG_DONTROUTE		--ANK (980726)
1059 	 *
1060 	 * 1. ip6_rt_check(): If the route was a host route,
1061 	 *    check that the cached destination is current.
1062 	 *    If it is a network route, we may still
1063 	 *    check its validity using a saved pointer
1064 	 *    to the last used address: daddr_cache.
1065 	 *    We do not want to save the whole address now
1066 	 *    (because the main consumer of this service
1067 	 *    is TCP, which does not have this problem),
1068 	 *    so this last trick works only on connected
1069 	 *    sockets.
1070 	 * 2. oif should also be the same.
1071 	 */
1072 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1073 #ifdef CONFIG_IPV6_SUBTREES
1074 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1075 #endif
1076 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1077 		dst_release(dst);
1078 		dst = NULL;
1079 	}
1080 
1081 out:
1082 	return dst;
1083 }
1084 
1085 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1086 			       struct dst_entry **dst, struct flowi6 *fl6)
1087 {
1088 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1089 	struct neighbour *n;
1090 	struct rt6_info *rt;
1091 #endif
1092 	int err;
1093 	int flags = 0;
1094 
1095 	/* The correct way to handle this would be to do
1096 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1097 	 * the route-specific preferred source forces the
1098 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1099 	 *
1100 	 * In source-specific routing (no src=any default route),
1101 	 * ip6_route_output will fail given a src=any saddr, which is
1102 	 * why we try it again later.
1103 	 */
1104 	if (ipv6_addr_any(&fl6->saddr)) {
1105 		struct fib6_info *from;
1106 		struct rt6_info *rt;
1107 
1108 		*dst = ip6_route_output(net, sk, fl6);
1109 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1110 
1111 		rcu_read_lock();
1112 		from = rt ? rcu_dereference(rt->from) : NULL;
1113 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1114 					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
1115 					  &fl6->saddr);
1116 		rcu_read_unlock();
1117 
1118 		if (err)
1119 			goto out_err_release;
1120 
1121 		/* If we had an erroneous initial result, pretend it
1122 		 * never existed and let the SA-enabled version take
1123 		 * over.
1124 		 */
1125 		if ((*dst)->error) {
1126 			dst_release(*dst);
1127 			*dst = NULL;
1128 		}
1129 
1130 		if (fl6->flowi6_oif)
1131 			flags |= RT6_LOOKUP_F_IFACE;
1132 	}
1133 
1134 	if (!*dst)
1135 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1136 
1137 	err = (*dst)->error;
1138 	if (err)
1139 		goto out_err_release;
1140 
1141 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1142 	/*
1143 	 * If the dst entry we've looked up
1144 	 * has a neighbour entry that is in the INCOMPLETE
1145 	 * state and the src address from the flow is
1146 	 * marked as OPTIMISTIC, we release the found
1147 	 * dst entry and replace it with the dst entry
1148 	 * of the nexthop router instead.
1149 	 */
1150 	rt = (struct rt6_info *) *dst;
1151 	rcu_read_lock();
1152 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1153 				      rt6_nexthop(rt, &fl6->daddr));
1154 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1155 	rcu_read_unlock();
1156 
1157 	if (err) {
1158 		struct inet6_ifaddr *ifp;
1159 		struct flowi6 fl_gw6;
1160 		int redirect;
1161 
1162 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1163 				      (*dst)->dev, 1);
1164 
1165 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1166 		if (ifp)
1167 			in6_ifa_put(ifp);
1168 
1169 		if (redirect) {
1170 			/*
1171 			 * We need to get the dst entry for the
1172 			 * default router instead
1173 			 */
1174 			dst_release(*dst);
1175 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1176 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1177 			*dst = ip6_route_output(net, sk, &fl_gw6);
1178 			err = (*dst)->error;
1179 			if (err)
1180 				goto out_err_release;
1181 		}
1182 	}
1183 #endif
1184 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1185 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1186 		err = -EAFNOSUPPORT;
1187 		goto out_err_release;
1188 	}
1189 
1190 	return 0;
1191 
1192 out_err_release:
1193 	dst_release(*dst);
1194 	*dst = NULL;
1195 
1196 	if (err == -ENETUNREACH)
1197 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1198 	return err;
1199 }
1200 
1201 /**
1202  *	ip6_dst_lookup - perform route lookup on flow
1203  *	@net: Network namespace to perform lookup in
1204  *	@sk: socket which provides route info
1205  *	@dst: pointer to dst_entry * for result
1206  *	@fl6: flow to lookup
1207  *
1208  *	This function performs a route lookup on the given flow.
1209  *
1210  *	It returns zero on success, or a standard errno code on error.
1211  */
1212 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1213 		   struct flowi6 *fl6)
1214 {
1215 	*dst = NULL;
1216 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1217 }
1218 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
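
/* Illustrative sketch (names are hypothetical, not from this file): a typical
 * caller fills a flowi6, checks the return code and releases the dst when
 * done with it:
 *
 *	struct flowi6 fl6 = { .daddr = *daddr, .flowi6_oif = oif };
 *	struct dst_entry *dst;
 *
 *	err = ip6_dst_lookup(net, sk, &dst, &fl6);
 *	if (err)
 *		return err;
 *	...
 *	dst_release(dst);
 */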
1219 
1220 /**
1221  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1222  *	@net: Network namespace to perform lookup in
1223  *	@sk: socket which provides route info
1224  *	@fl6: flow to lookup
1225  *	@final_dst: final destination address for ipsec lookup
1226  *
1227  *	This function performs a route lookup on the given flow.
1228  *
1229  *	It returns a valid dst pointer on success, or a pointer encoded
1230  *	error code.
1231  */
1232 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1233 				      const struct in6_addr *final_dst)
1234 {
1235 	struct dst_entry *dst = NULL;
1236 	int err;
1237 
1238 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1239 	if (err)
1240 		return ERR_PTR(err);
1241 	if (final_dst)
1242 		fl6->daddr = *final_dst;
1243 
1244 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1245 }
1246 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
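
/* Illustrative sketch: unlike ip6_dst_lookup(), this variant hands back the
 * dst (or an ERR_PTR) directly, so callers check it with IS_ERR():
 *
 *	dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */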
1247 
1248 /**
1249  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1250  *	@sk: socket which provides the dst cache and route info
1251  *	@fl6: flow to lookup
1252  *	@final_dst: final destination address for ipsec lookup
1253  *	@connected: whether @sk is connected or not
1254  *
1255  *	This function performs a route lookup on the given flow with the
1256  *	possibility of using the cached route in the socket if it is valid.
1257  *	It will take the socket dst lock when operating on the dst cache.
1258  *	As a result, this function can only be used in process context.
1259  *
1260  *	In addition, for a connected socket, cache the dst in the socket
1261  *	if the current cache is not valid.
1262  *
1263  *	It returns a valid dst pointer on success, or a pointer encoded
1264  *	error code.
1265  */
1266 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1267 					 const struct in6_addr *final_dst,
1268 					 bool connected)
1269 {
1270 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1271 
1272 	dst = ip6_sk_dst_check(sk, dst, fl6);
1273 	if (dst)
1274 		return dst;
1275 
1276 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1277 	if (connected && !IS_ERR(dst))
1278 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1279 
1280 	return dst;
1281 }
1282 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
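
/* Usage note (illustrative): datagram senders such as udpv6_sendmsg() call
 * this helper with @connected reflecting the socket state, so a connected
 * socket keeps reusing its cached dst instead of doing a full route lookup
 * for every packet.
 */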
1283 
1284 /**
1285  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1286  *      @skb: Packet for which lookup is done
1287  *      @dev: Tunnel device
1288  *      @net: Network namespace of tunnel device
1289  *      @sock: Socket which provides route info
1290  *      @saddr: Memory to store the src ip address
1291  *      @info: Tunnel information
1292  *      @protocol: IP protocol
1293  *      @use_cache: Flag to enable cache usage
1294  *      This function performs a route lookup on a tunnel.
1295  *
1296  *      It returns a valid dst pointer and stores src address to be used in
1297  *      tunnel in param saddr on success, else a pointer encoded error code.
1298  */
1299 
1300 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1301 					struct net_device *dev,
1302 					struct net *net,
1303 					struct socket *sock,
1304 					struct in6_addr *saddr,
1305 					const struct ip_tunnel_info *info,
1306 					u8 protocol,
1307 					bool use_cache)
1308 {
1309 	struct dst_entry *dst = NULL;
1310 #ifdef CONFIG_DST_CACHE
1311 	struct dst_cache *dst_cache;
1312 #endif
1313 	struct flowi6 fl6;
1314 	__u8 prio;
1315 
1316 #ifdef CONFIG_DST_CACHE
1317 	dst_cache = (struct dst_cache *)&info->dst_cache;
1318 	if (use_cache) {
1319 		dst = dst_cache_get_ip6(dst_cache, saddr);
1320 		if (dst)
1321 			return dst;
1322 	}
1323 #endif
1324 	memset(&fl6, 0, sizeof(fl6));
1325 	fl6.flowi6_mark = skb->mark;
1326 	fl6.flowi6_proto = protocol;
1327 	fl6.daddr = info->key.u.ipv6.dst;
1328 	fl6.saddr = info->key.u.ipv6.src;
1329 	prio = info->key.tos;
1330 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1331 
1332 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1333 					      NULL);
1334 	if (IS_ERR(dst)) {
1335 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1336 		return ERR_PTR(-ENETUNREACH);
1337 	}
1338 	if (dst->dev == dev) { /* is this necessary? */
1339 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1340 		dst_release(dst);
1341 		return ERR_PTR(-ELOOP);
1342 	}
1343 #ifdef CONFIG_DST_CACHE
1344 	if (use_cache)
1345 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1346 #endif
1347 	*saddr = fl6.saddr;
1348 	return dst;
1349 }
1350 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
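
/* Usage note (assumption, based on typical callers): UDP tunnel drivers use
 * this from their xmit path for metadata-based (collect_md) tunnels, deriving
 * @use_cache from something like ip_tunnel_dst_cache_usable(skb, info) and
 * using the returned @saddr as the outer source address.
 */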
1351 
1352 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1353 					       gfp_t gfp)
1354 {
1355 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1356 }
1357 
1358 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1359 						gfp_t gfp)
1360 {
1361 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1362 }
1363 
1364 static void ip6_append_data_mtu(unsigned int *mtu,
1365 				int *maxfraglen,
1366 				unsigned int fragheaderlen,
1367 				struct sk_buff *skb,
1368 				struct rt6_info *rt,
1369 				unsigned int orig_mtu)
1370 {
1371 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1372 		if (!skb) {
1373 			/* first fragment, reserve header_len */
1374 			*mtu = orig_mtu - rt->dst.header_len;
1375 
1376 		} else {
1377 			/*
1378 			 * this fragment is not the first; the header
1379 			 * space is regarded as data space.
1380 			 */
1381 			*mtu = orig_mtu;
1382 		}
1383 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1384 			      + fragheaderlen - sizeof(struct frag_hdr);
1385 	}
1386 }
1387 
1388 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1389 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1390 			  struct rt6_info *rt)
1391 {
1392 	struct ipv6_pinfo *np = inet6_sk(sk);
1393 	unsigned int mtu, frag_size;
1394 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1395 
1396 	/* callers pass dst together with a reference, set it first so
1397 	 * ip6_cork_release() can put it down even in case of an error.
1398 	 */
1399 	cork->base.dst = &rt->dst;
1400 
1401 	/*
1402 	 * setup for corking
1403 	 */
1404 	if (opt) {
1405 		if (WARN_ON(v6_cork->opt))
1406 			return -EINVAL;
1407 
1408 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1409 		if (unlikely(!nopt))
1410 			return -ENOBUFS;
1411 
1412 		nopt->tot_len = sizeof(*opt);
1413 		nopt->opt_flen = opt->opt_flen;
1414 		nopt->opt_nflen = opt->opt_nflen;
1415 
1416 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1417 		if (opt->dst0opt && !nopt->dst0opt)
1418 			return -ENOBUFS;
1419 
1420 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1421 		if (opt->dst1opt && !nopt->dst1opt)
1422 			return -ENOBUFS;
1423 
1424 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1425 		if (opt->hopopt && !nopt->hopopt)
1426 			return -ENOBUFS;
1427 
1428 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1429 		if (opt->srcrt && !nopt->srcrt)
1430 			return -ENOBUFS;
1431 
1432 		/* need source address above miyazawa */
1433 	}
1434 	v6_cork->hop_limit = ipc6->hlimit;
1435 	v6_cork->tclass = ipc6->tclass;
1436 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1437 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1438 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1439 	else
1440 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1441 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1442 
1443 	frag_size = READ_ONCE(np->frag_size);
1444 	if (frag_size && frag_size < mtu)
1445 		mtu = frag_size;
1446 
1447 	cork->base.fragsize = mtu;
1448 	cork->base.gso_size = ipc6->gso_size;
1449 	cork->base.tx_flags = 0;
1450 	cork->base.mark = ipc6->sockc.mark;
1451 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1452 
1453 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1454 		cork->base.flags |= IPCORK_ALLFRAG;
1455 	cork->base.length = 0;
1456 
1457 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1458 
1459 	return 0;
1460 }
1461 
1462 static int __ip6_append_data(struct sock *sk,
1463 			     struct sk_buff_head *queue,
1464 			     struct inet_cork_full *cork_full,
1465 			     struct inet6_cork *v6_cork,
1466 			     struct page_frag *pfrag,
1467 			     int getfrag(void *from, char *to, int offset,
1468 					 int len, int odd, struct sk_buff *skb),
1469 			     void *from, size_t length, int transhdrlen,
1470 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1471 {
1472 	struct sk_buff *skb, *skb_prev = NULL;
1473 	struct inet_cork *cork = &cork_full->base;
1474 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1475 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1476 	struct ubuf_info *uarg = NULL;
1477 	int exthdrlen = 0;
1478 	int dst_exthdrlen = 0;
1479 	int hh_len;
1480 	int copy;
1481 	int err;
1482 	int offset = 0;
1483 	bool zc = false;
1484 	u32 tskey = 0;
1485 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1486 	struct ipv6_txoptions *opt = v6_cork->opt;
1487 	int csummode = CHECKSUM_NONE;
1488 	unsigned int maxnonfragsize, headersize;
1489 	unsigned int wmem_alloc_delta = 0;
1490 	bool paged, extra_uref = false;
1491 
1492 	skb = skb_peek_tail(queue);
1493 	if (!skb) {
1494 		exthdrlen = opt ? opt->opt_flen : 0;
1495 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1496 	}
1497 
1498 	paged = !!cork->gso_size;
1499 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1500 	orig_mtu = mtu;
1501 
1502 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1503 	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
1504 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1505 
1506 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1507 
1508 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1509 			(opt ? opt->opt_nflen : 0);
1510 
1511 	headersize = sizeof(struct ipv6hdr) +
1512 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1513 		     (dst_allfrag(&rt->dst) ?
1514 		      sizeof(struct frag_hdr) : 0) +
1515 		     rt->rt6i_nfheader_len;
1516 
1517 	if (mtu <= fragheaderlen ||
1518 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1519 		goto emsgsize;
1520 
1521 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1522 		     sizeof(struct frag_hdr);
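	/* Worked example (illustrative): with mtu = 1500 and
	 * fragheaderlen = 40 (bare IPv6 header, no routing header or
	 * non-fragmentable options), maxfraglen = ((1500 - 40) & ~7)
	 * + 40 - 8 = 1488, so each non-final fragment carries
	 * 1488 - 40 = 1448 bytes of fragmentable data and goes on the
	 * wire as 40 + 8 + 1448 = 1496 bytes, within the MTU and with
	 * a multiple-of-8 data length.
	 */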
1523 
1524 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1525 	 * within the first fragment
1526 	 */
1527 	if (headersize + transhdrlen > mtu)
1528 		goto emsgsize;
1529 
1530 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1531 	    (sk->sk_protocol == IPPROTO_UDP ||
1532 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1533 	     sk->sk_protocol == IPPROTO_RAW)) {
1534 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1535 				sizeof(struct ipv6hdr));
1536 		goto emsgsize;
1537 	}
1538 
1539 	if (ip6_sk_ignore_df(sk))
1540 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1541 	else
1542 		maxnonfragsize = mtu;
1543 
1544 	if (cork->length + length > maxnonfragsize - headersize) {
1545 emsgsize:
1546 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1547 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1548 		return -EMSGSIZE;
1549 	}
1550 
1551 	/* CHECKSUM_PARTIAL only with no extension headers and when
1552 	 * we are not going to fragment
1553 	 */
1554 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1555 	    headersize == sizeof(struct ipv6hdr) &&
1556 	    length <= mtu - headersize &&
1557 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1558 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1559 		csummode = CHECKSUM_PARTIAL;
1560 
1561 	if ((flags & MSG_ZEROCOPY) && length) {
1562 		struct msghdr *msg = from;
1563 
1564 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1565 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1566 				return -EINVAL;
1567 
1568 			/* Leave uarg NULL if we can't zerocopy; callers should
1569 			 * be able to handle it.
1570 			 */
1571 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1572 			    csummode == CHECKSUM_PARTIAL) {
1573 				paged = true;
1574 				zc = true;
1575 				uarg = msg->msg_ubuf;
1576 			}
1577 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1578 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1579 			if (!uarg)
1580 				return -ENOBUFS;
1581 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1582 			if (rt->dst.dev->features & NETIF_F_SG &&
1583 			    csummode == CHECKSUM_PARTIAL) {
1584 				paged = true;
1585 				zc = true;
1586 			} else {
1587 				uarg_to_msgzc(uarg)->zerocopy = 0;
1588 				skb_zcopy_set(skb, uarg, &extra_uref);
1589 			}
1590 		}
1591 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1592 		if (inet_test_bit(HDRINCL, sk))
1593 			return -EPERM;
1594 		if (rt->dst.dev->features & NETIF_F_SG &&
1595 		    getfrag == ip_generic_getfrag)
1596 			/* We need an empty buffer to attach stuff to */
1597 			paged = true;
1598 		else
1599 			flags &= ~MSG_SPLICE_PAGES;
1600 	}
1601 
1602 	/*
1603 	 * Let's try using as much space as possible.
1604 	 * Use MTU if total length of the message fits into the MTU.
1605 	 * Otherwise, we need to reserve fragment header and
1606 	 * fragment alignment (= 8-15 octets, in total).
1607 	 *
1608 	 * Note that we may need to "move" the data from the tail
1609 	 * of the buffer to the new fragment when we split
1610 	 * the message.
1611 	 *
1612 	 * FIXME: It may be fragmented into multiple chunks
1613 	 *        at once if non-fragmentable extension headers
1614 	 *        are too large.
1615 	 * --yoshfuji
1616 	 */
1617 
1618 	cork->length += length;
1619 	if (!skb)
1620 		goto alloc_new_skb;
1621 
1622 	while (length > 0) {
1623 		/* Check if the remaining data fits into current packet. */
1624 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1625 		if (copy < length)
1626 			copy = maxfraglen - skb->len;
1627 
1628 		if (copy <= 0) {
1629 			char *data;
1630 			unsigned int datalen;
1631 			unsigned int fraglen;
1632 			unsigned int fraggap;
1633 			unsigned int alloclen, alloc_extra;
1634 			unsigned int pagedlen;
1635 alloc_new_skb:
1636 			/* There's no room in the current skb */
1637 			if (skb)
1638 				fraggap = skb->len - maxfraglen;
1639 			else
1640 				fraggap = 0;
1641 			/* update mtu and maxfraglen if necessary */
1642 			if (!skb || !skb_prev)
1643 				ip6_append_data_mtu(&mtu, &maxfraglen,
1644 						    fragheaderlen, skb, rt,
1645 						    orig_mtu);
1646 
1647 			skb_prev = skb;
1648 
1649 			/*
1650 			 * If remaining data exceeds the mtu,
1651 			 * we know we need more fragment(s).
1652 			 */
1653 			datalen = length + fraggap;
1654 
1655 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1656 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1657 			fraglen = datalen + fragheaderlen;
1658 			pagedlen = 0;
1659 
1660 			alloc_extra = hh_len;
1661 			alloc_extra += dst_exthdrlen;
1662 			alloc_extra += rt->dst.trailer_len;
1663 
1664 			/* We just reserve space for the fragment header.
1665 			 * Note: this may be overallocation if the message
1666 			 * (without MSG_MORE) fits into the MTU.
1667 			 */
1668 			alloc_extra += sizeof(struct frag_hdr);
1669 
1670 			if ((flags & MSG_MORE) &&
1671 			    !(rt->dst.dev->features&NETIF_F_SG))
1672 				alloclen = mtu;
1673 			else if (!paged &&
1674 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1675 				  !(rt->dst.dev->features & NETIF_F_SG)))
1676 				alloclen = fraglen;
1677 			else {
1678 				alloclen = fragheaderlen + transhdrlen;
1679 				pagedlen = datalen - transhdrlen;
1680 			}
1681 			alloclen += alloc_extra;
1682 
1683 			if (datalen != length + fraggap) {
1684 				/*
1685 				 * this is not the last fragment; the trailer
1686 				 * space is regarded as data space.
1687 				 */
1688 				datalen += rt->dst.trailer_len;
1689 			}
1690 
1691 			fraglen = datalen + fragheaderlen;
1692 
1693 			copy = datalen - transhdrlen - fraggap - pagedlen;
1694 			/* [!] NOTE: copy may be negative if pagedlen>0
1695 			 * because then the equation may reduce to -fraggap.
1696 			 */
1697 			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1698 				err = -EINVAL;
1699 				goto error;
1700 			}
1701 			if (transhdrlen) {
1702 				skb = sock_alloc_send_skb(sk, alloclen,
1703 						(flags & MSG_DONTWAIT), &err);
1704 			} else {
1705 				skb = NULL;
1706 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1707 				    2 * sk->sk_sndbuf)
1708 					skb = alloc_skb(alloclen,
1709 							sk->sk_allocation);
1710 				if (unlikely(!skb))
1711 					err = -ENOBUFS;
1712 			}
1713 			if (!skb)
1714 				goto error;
1715 			/*
1716 			 *	Fill in the control structures
1717 			 */
1718 			skb->protocol = htons(ETH_P_IPV6);
1719 			skb->ip_summed = csummode;
1720 			skb->csum = 0;
1721 			/* reserve for fragmentation and ipsec header */
1722 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1723 				    dst_exthdrlen);
1724 
1725 			/*
1726 			 *	Find where to start putting bytes
1727 			 */
1728 			data = skb_put(skb, fraglen - pagedlen);
1729 			skb_set_network_header(skb, exthdrlen);
1730 			data += fragheaderlen;
1731 			skb->transport_header = (skb->network_header +
1732 						 fragheaderlen);
1733 			if (fraggap) {
1734 				skb->csum = skb_copy_and_csum_bits(
1735 					skb_prev, maxfraglen,
1736 					data + transhdrlen, fraggap);
1737 				skb_prev->csum = csum_sub(skb_prev->csum,
1738 							  skb->csum);
1739 				data += fraggap;
1740 				pskb_trim_unique(skb_prev, maxfraglen);
1741 			}
1742 			if (copy > 0 &&
1743 			    getfrag(from, data + transhdrlen, offset,
1744 				    copy, fraggap, skb) < 0) {
1745 				err = -EFAULT;
1746 				kfree_skb(skb);
1747 				goto error;
1748 			} else if (flags & MSG_SPLICE_PAGES) {
1749 				copy = 0;
1750 			}
1751 
1752 			offset += copy;
1753 			length -= copy + transhdrlen;
1754 			transhdrlen = 0;
1755 			exthdrlen = 0;
1756 			dst_exthdrlen = 0;
1757 
1758 			/* Only the initial fragment is time stamped */
1759 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1760 			cork->tx_flags = 0;
1761 			skb_shinfo(skb)->tskey = tskey;
1762 			tskey = 0;
1763 			skb_zcopy_set(skb, uarg, &extra_uref);
1764 
1765 			if ((flags & MSG_CONFIRM) && !skb_prev)
1766 				skb_set_dst_pending_confirm(skb, 1);
1767 
1768 			/*
1769 			 * Put the packet on the pending queue
1770 			 */
1771 			if (!skb->destructor) {
1772 				skb->destructor = sock_wfree;
1773 				skb->sk = sk;
1774 				wmem_alloc_delta += skb->truesize;
1775 			}
1776 			__skb_queue_tail(queue, skb);
1777 			continue;
1778 		}
1779 
1780 		if (copy > length)
1781 			copy = length;
1782 
1783 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1784 		    skb_tailroom(skb) >= copy) {
1785 			unsigned int off;
1786 
1787 			off = skb->len;
1788 			if (getfrag(from, skb_put(skb, copy),
1789 						offset, copy, off, skb) < 0) {
1790 				__skb_trim(skb, off);
1791 				err = -EFAULT;
1792 				goto error;
1793 			}
1794 		} else if (flags & MSG_SPLICE_PAGES) {
1795 			struct msghdr *msg = from;
1796 
1797 			err = -EIO;
1798 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1799 				goto error;
1800 
1801 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1802 						   sk->sk_allocation);
1803 			if (err < 0)
1804 				goto error;
1805 			copy = err;
1806 			wmem_alloc_delta += copy;
1807 		} else if (!zc) {
1808 			int i = skb_shinfo(skb)->nr_frags;
1809 
1810 			err = -ENOMEM;
1811 			if (!sk_page_frag_refill(sk, pfrag))
1812 				goto error;
1813 
1814 			skb_zcopy_downgrade_managed(skb);
1815 			if (!skb_can_coalesce(skb, i, pfrag->page,
1816 					      pfrag->offset)) {
1817 				err = -EMSGSIZE;
1818 				if (i == MAX_SKB_FRAGS)
1819 					goto error;
1820 
1821 				__skb_fill_page_desc(skb, i, pfrag->page,
1822 						     pfrag->offset, 0);
1823 				skb_shinfo(skb)->nr_frags = ++i;
1824 				get_page(pfrag->page);
1825 			}
1826 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1827 			if (getfrag(from,
1828 				    page_address(pfrag->page) + pfrag->offset,
1829 				    offset, copy, skb->len, skb) < 0)
1830 				goto error_efault;
1831 
1832 			pfrag->offset += copy;
1833 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1834 			skb->len += copy;
1835 			skb->data_len += copy;
1836 			skb->truesize += copy;
1837 			wmem_alloc_delta += copy;
1838 		} else {
1839 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1840 			if (err < 0)
1841 				goto error;
1842 		}
1843 		offset += copy;
1844 		length -= copy;
1845 	}
1846 
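	/* Charge everything appended in this call to the socket's
	 * write-memory accounting in one step.
	 */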
1847 	if (wmem_alloc_delta)
1848 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1849 	return 0;
1850 
1851 error_efault:
1852 	err = -EFAULT;
1853 error:
1854 	net_zcopy_put_abort(uarg, extra_uref);
1855 	cork->length -= length;
1856 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1857 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1858 	return err;
1859 }
1860 
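/*
 * Append data to the socket's write queue, allocating skbs and setting up
 * the cork state on the first call.  The queued data is later turned into
 * packets and transmitted by ip6_push_pending_frames().
 */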
1861 int ip6_append_data(struct sock *sk,
1862 		    int getfrag(void *from, char *to, int offset, int len,
1863 				int odd, struct sk_buff *skb),
1864 		    void *from, size_t length, int transhdrlen,
1865 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1866 		    struct rt6_info *rt, unsigned int flags)
1867 {
1868 	struct inet_sock *inet = inet_sk(sk);
1869 	struct ipv6_pinfo *np = inet6_sk(sk);
1870 	int exthdrlen;
1871 	int err;
1872 
1873 	if (flags & MSG_PROBE)
1874 		return 0;
1875 	if (skb_queue_empty(&sk->sk_write_queue)) {
1876 		/*
1877 		 * No data queued yet: set up the cork state for this cycle.
1878 		 */
1879 		dst_hold(&rt->dst);
1880 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1881 				     ipc6, rt);
1882 		if (err)
1883 			return err;
1884 
1885 		inet->cork.fl.u.ip6 = *fl6;
1886 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1887 		length += exthdrlen;
1888 		transhdrlen += exthdrlen;
1889 	} else {
1890 		transhdrlen = 0;
1891 	}
1892 
1893 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1894 				 &np->cork, sk_page_frag(sk), getfrag,
1895 				 from, length, transhdrlen, flags, ipc6);
1896 }
1897 EXPORT_SYMBOL_GPL(ip6_append_data);
1898 
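/* Transfer the dst held by the cork to the skb that is about to be sent. */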
1899 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1900 {
1901 	struct dst_entry *dst = cork->base.dst;
1902 
1903 	cork->base.dst = NULL;
1904 	cork->base.flags &= ~IPCORK_ALLFRAG;
1905 	skb_dst_set(skb, dst);
1906 }
1907 
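/* Free the per-cork IPv6 options and drop the cork's dst reference. */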
1908 static void ip6_cork_release(struct inet_cork_full *cork,
1909 			     struct inet6_cork *v6_cork)
1910 {
1911 	if (v6_cork->opt) {
1912 		struct ipv6_txoptions *opt = v6_cork->opt;
1913 
1914 		kfree(opt->dst0opt);
1915 		kfree(opt->dst1opt);
1916 		kfree(opt->hopopt);
1917 		kfree(opt->srcrt);
1918 		kfree(opt);
1919 		v6_cork->opt = NULL;
1920 	}
1921 
1922 	if (cork->base.dst) {
1923 		dst_release(cork->base.dst);
1924 		cork->base.dst = NULL;
1925 		cork->base.flags &= ~IPCORK_ALLFRAG;
1926 	}
1927 }
1928 
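/*
 * Collapse all pending skbs on the queue into a single packet: chain them on
 * the first skb's frag_list, push any extension headers and the IPv6 header,
 * and hand the cork's dst over to the resulting skb.
 */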
1929 struct sk_buff *__ip6_make_skb(struct sock *sk,
1930 			       struct sk_buff_head *queue,
1931 			       struct inet_cork_full *cork,
1932 			       struct inet6_cork *v6_cork)
1933 {
1934 	struct sk_buff *skb, *tmp_skb;
1935 	struct sk_buff **tail_skb;
1936 	struct in6_addr *final_dst;
1937 	struct net *net = sock_net(sk);
1938 	struct ipv6hdr *hdr;
1939 	struct ipv6_txoptions *opt = v6_cork->opt;
1940 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1941 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1942 	unsigned char proto = fl6->flowi6_proto;
1943 
1944 	skb = __skb_dequeue(queue);
1945 	if (!skb)
1946 		goto out;
1947 	tail_skb = &(skb_shinfo(skb)->frag_list);
1948 
1949 	/* Move skb->data from the ext header space up to the IPv6 header */
1950 	if (skb->data < skb_network_header(skb))
1951 		__skb_pull(skb, skb_network_offset(skb));
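	/* Chain the remaining queued skbs onto the first skb's frag_list
	 * so that they go out as one datagram.
	 */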
1952 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1953 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1954 		*tail_skb = tmp_skb;
1955 		tail_skb = &(tmp_skb->next);
1956 		skb->len += tmp_skb->len;
1957 		skb->data_len += tmp_skb->len;
1958 		skb->truesize += tmp_skb->truesize;
1959 		tmp_skb->destructor = NULL;
1960 		tmp_skb->sk = NULL;
1961 	}
1962 
1963 	/* Allow local fragmentation. */
1964 	skb->ignore_df = ip6_sk_ignore_df(sk);
1965 	__skb_pull(skb, skb_network_header_len(skb));
1966 
1967 	final_dst = &fl6->daddr;
1968 	if (opt && opt->opt_flen)
1969 		ipv6_push_frag_opts(skb, opt, &proto);
1970 	if (opt && opt->opt_nflen)
1971 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1972 
1973 	skb_push(skb, sizeof(struct ipv6hdr));
1974 	skb_reset_network_header(skb);
1975 	hdr = ipv6_hdr(skb);
1976 
1977 	ip6_flow_hdr(hdr, v6_cork->tclass,
1978 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1979 					ip6_autoflowlabel(net, sk), fl6));
1980 	hdr->hop_limit = v6_cork->hop_limit;
1981 	hdr->nexthdr = proto;
1982 	hdr->saddr = fl6->saddr;
1983 	hdr->daddr = *final_dst;
1984 
1985 	skb->priority = READ_ONCE(sk->sk_priority);
1986 	skb->mark = cork->base.mark;
1987 	skb->tstamp = cork->base.transmit_time;
1988 
1989 	ip6_cork_steal_dst(skb, cork);
1990 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1991 	if (proto == IPPROTO_ICMPV6) {
1992 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1993 		u8 icmp6_type;
1994 
1995 		if (sk->sk_socket->type == SOCK_RAW &&
1996 		    !inet_test_bit(HDRINCL, sk))
1997 			icmp6_type = fl6->fl6_icmp_type;
1998 		else
1999 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
2000 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
2001 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
2002 	}
2003 
2004 	ip6_cork_release(cork, v6_cork);
2005 out:
2006 	return skb;
2007 }
2008 
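/*
 * Hand a packet built by __ip6_make_skb() to the IPv6 output path.  Positive
 * NET_XMIT codes from ip6_local_out() are mapped through net_xmit_errno(),
 * and any remaining failure is counted as an output discard.
 */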
2009 int ip6_send_skb(struct sk_buff *skb)
2010 {
2011 	struct net *net = sock_net(skb->sk);
2012 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
2013 	int err;
2014 
2015 	err = ip6_local_out(net, skb->sk, skb);
2016 	if (err) {
2017 		if (err > 0)
2018 			err = net_xmit_errno(err);
2019 		if (err)
2020 			IP6_INC_STATS(net, rt->rt6i_idev,
2021 				      IPSTATS_MIB_OUTDISCARDS);
2022 	}
2023 
2024 	return err;
2025 }
2026 
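/* Build one packet from the data corked on the write queue and send it. */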
2027 int ip6_push_pending_frames(struct sock *sk)
2028 {
2029 	struct sk_buff *skb;
2030 
2031 	skb = ip6_finish_skb(sk);
2032 	if (!skb)
2033 		return 0;
2034 
2035 	return ip6_send_skb(skb);
2036 }
2037 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2038 
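/* Discard everything queued for this cork and release the cork state. */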
2039 static void __ip6_flush_pending_frames(struct sock *sk,
2040 				       struct sk_buff_head *queue,
2041 				       struct inet_cork_full *cork,
2042 				       struct inet6_cork *v6_cork)
2043 {
2044 	struct sk_buff *skb;
2045 
2046 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2047 		if (skb_dst(skb))
2048 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2049 				      IPSTATS_MIB_OUTDISCARDS);
2050 		kfree_skb(skb);
2051 	}
2052 
2053 	ip6_cork_release(cork, v6_cork);
2054 }
2055 
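/* Throw away any data corked on the socket's write queue. */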
2056 void ip6_flush_pending_frames(struct sock *sk)
2057 {
2058 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2059 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2060 }
2061 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2062 
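/*
 * Non-corked variant of ip6_append_data()/ip6_push_pending_frames(): append
 * all the data onto a private queue with a caller-supplied cork and collapse
 * it into a single skb without touching the socket's write queue.
 */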
2063 struct sk_buff *ip6_make_skb(struct sock *sk,
2064 			     int getfrag(void *from, char *to, int offset,
2065 					 int len, int odd, struct sk_buff *skb),
2066 			     void *from, size_t length, int transhdrlen,
2067 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2068 			     unsigned int flags, struct inet_cork_full *cork)
2069 {
2070 	struct inet6_cork v6_cork;
2071 	struct sk_buff_head queue;
2072 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2073 	int err;
2074 
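	/* MSG_PROBE only probes the path (e.g. for MTU discovery); no packet
	 * is built, but the route reference handed in by the caller must
	 * still be dropped.
	 */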
2075 	if (flags & MSG_PROBE) {
2076 		dst_release(&rt->dst);
2077 		return NULL;
2078 	}
2079 
2080 	__skb_queue_head_init(&queue);
2081 
2082 	cork->base.flags = 0;
2083 	cork->base.addr = 0;
2084 	cork->base.opt = NULL;
2085 	v6_cork.opt = NULL;
2086 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2087 	if (err) {
2088 		ip6_cork_release(cork, &v6_cork);
2089 		return ERR_PTR(err);
2090 	}
2091 	if (ipc6->dontfrag < 0)
2092 		ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);
2093 
2094 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2095 				&current->task_frag, getfrag, from,
2096 				length + exthdrlen, transhdrlen + exthdrlen,
2097 				flags, ipc6);
2098 	if (err) {
2099 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2100 		return ERR_PTR(err);
2101 	}
2102 
2103 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2104 }
2105