xref: /linux/net/ipv6/ip6_output.c (revision 04317b129e4eb5c6f4a58bb899b2019c1545320b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
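/*
 * Final transmit step for locally generated and forwarded packets:
 * make sure there is enough headroom for the link-layer header, loop
 * multicast copies back to local listeners when required, honour
 * lwtunnel xmit redirects, then resolve (or create) the neighbour
 * entry and hand the skb to neigh_output().
 */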
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		skb = skb_expand_head(skb, hh_len);
74 		if (!skb) {
75 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76 			return -ENOMEM;
77 		}
78 	}
79 
80 	hdr = ipv6_hdr(skb);
81 	daddr = &hdr->daddr;
82 	if (ipv6_addr_is_multicast(daddr)) {
83 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84 		    ((mroute6_is_socket(net, skb) &&
85 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 
89 			/* Do not check for IFF_ALLMULTI; multicast routing
90 			   is not supported in any case.
91 			 */
92 			if (newskb)
93 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94 					net, sk, newskb, NULL, newskb->dev,
95 					dev_loopback_xmit);
96 
97 			if (hdr->hop_limit == 0) {
98 				IP6_INC_STATS(net, idev,
99 					      IPSTATS_MIB_OUTDISCARDS);
100 				kfree_skb(skb);
101 				return 0;
102 			}
103 		}
104 
105 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107 		    !(dev->flags & IFF_LOOPBACK)) {
108 			kfree_skb(skb);
109 			return 0;
110 		}
111 	}
112 
113 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114 		int res = lwtunnel_xmit(skb);
115 
116 		if (res != LWTUNNEL_XMIT_CONTINUE)
117 			return res;
118 	}
119 
120 	rcu_read_lock();
121 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
122 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
123 
124 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
125 		if (unlikely(!neigh))
126 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
127 		if (IS_ERR(neigh)) {
128 			rcu_read_unlock();
129 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
130 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
131 			return -EINVAL;
132 		}
133 	}
134 	sock_confirm_neigh(skb, neigh);
135 	ret = neigh_output(neigh, skb, false);
136 	rcu_read_unlock();
137 	return ret;
138 }
139 
140 static int
141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142 				    struct sk_buff *skb, unsigned int mtu)
143 {
144 	struct sk_buff *segs, *nskb;
145 	netdev_features_t features;
146 	int ret = 0;
147 
148 	/* Please see corresponding comment in ip_finish_output_gso
149 	 * describing the cases where GSO segment length exceeds the
150 	 * egress MTU.
151 	 */
152 	features = netif_skb_features(skb);
153 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154 	if (IS_ERR_OR_NULL(segs)) {
155 		kfree_skb(skb);
156 		return -ENOMEM;
157 	}
158 
159 	consume_skb(skb);
160 
161 	skb_list_walk_safe(segs, segs, nskb) {
162 		int err;
163 
164 		skb_mark_not_on_list(segs);
165 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
166 		if (err && ret == 0)
167 			ret = err;
168 	}
169 
170 	return ret;
171 }
172 
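/*
 * Post-routing finish: packets that picked up an xfrm policy after SNAT
 * are re-routed through dst_output(), oversized GSO packets are
 * segmented and fragmented, and anything else exceeding the path MTU
 * (or subject to dst_allfrag()/frag_max_size) goes through
 * ip6_fragment(); the rest is handed straight to ip6_finish_output2().
 */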
173 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
174 {
175 	unsigned int mtu;
176 
177 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
178 	/* Policy lookup after SNAT yielded a new policy */
179 	if (skb_dst(skb)->xfrm) {
180 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
181 		return dst_output(net, sk, skb);
182 	}
183 #endif
184 
185 	mtu = ip6_skb_dst_mtu(skb);
186 	if (skb_is_gso(skb) &&
187 	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
188 	    !skb_gso_validate_network_len(skb, mtu))
189 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
190 
191 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
192 	    dst_allfrag(skb_dst(skb)) ||
193 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
194 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
195 	else
196 		return ip6_finish_output2(net, sk, skb);
197 }
198 
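/*
 * Run the cgroup BPF egress program first; NET_XMIT_SUCCESS and
 * NET_XMIT_CN continue to __ip6_finish_output() (preserving the CN
 * verdict as the return value), anything else drops the packet.
 */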
199 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
200 {
201 	int ret;
202 
203 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
204 	switch (ret) {
205 	case NET_XMIT_SUCCESS:
206 	case NET_XMIT_CN:
207 		return __ip6_finish_output(net, sk, skb) ? : ret;
208 	default:
209 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
210 		return ret;
211 	}
212 }
213 
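/*
 * dst_output() entry point for locally generated IPv6 packets: drop the
 * packet if IPv6 is disabled on the egress device, otherwise run the
 * NF_INET_POST_ROUTING hook (skipped for packets already re-routed) and
 * finish via ip6_finish_output().
 */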
214 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
215 {
216 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
217 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
218 
219 	skb->protocol = htons(ETH_P_IPV6);
220 	skb->dev = dev;
221 
222 	if (unlikely(idev->cnf.disable_ipv6)) {
223 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
224 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
225 		return 0;
226 	}
227 
228 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
229 			    net, sk, skb, indev, dev,
230 			    ip6_finish_output,
231 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
232 }
233 EXPORT_SYMBOL(ip6_output);
234 
235 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
236 {
237 	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
238 		return ip6_default_np_autolabel(net);
239 	return inet6_test_bit(AUTOFLOWLABEL, sk);
240 }
241 
242 /*
243  * xmit an sk_buff (used by TCP, SCTP and DCCP)
244  * Note : socket lock is not held for SYNACK packets, but might be modified
245  * by calls to skb_set_owner_w() and ipv6_local_error(),
246  * which are using proper atomic operations or spinlocks.
247  */
248 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
249 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
250 {
251 	struct net *net = sock_net(sk);
252 	const struct ipv6_pinfo *np = inet6_sk(sk);
253 	struct in6_addr *first_hop = &fl6->daddr;
254 	struct dst_entry *dst = skb_dst(skb);
255 	struct net_device *dev = dst->dev;
256 	struct inet6_dev *idev = ip6_dst_idev(dst);
257 	struct hop_jumbo_hdr *hop_jumbo;
258 	int hoplen = sizeof(*hop_jumbo);
259 	unsigned int head_room;
260 	struct ipv6hdr *hdr;
261 	u8  proto = fl6->flowi6_proto;
262 	int seg_len = skb->len;
263 	int hlimit = -1;
264 	u32 mtu;
265 
266 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
267 	if (opt)
268 		head_room += opt->opt_nflen + opt->opt_flen;
269 
270 	if (unlikely(head_room > skb_headroom(skb))) {
271 		skb = skb_expand_head(skb, head_room);
272 		if (!skb) {
273 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
274 			return -ENOBUFS;
275 		}
276 	}
277 
278 	if (opt) {
279 		seg_len += opt->opt_nflen + opt->opt_flen;
280 
281 		if (opt->opt_flen)
282 			ipv6_push_frag_opts(skb, opt, &proto);
283 
284 		if (opt->opt_nflen)
285 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
286 					     &fl6->saddr);
287 	}
288 
289 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
290 		hop_jumbo = skb_push(skb, hoplen);
291 
292 		hop_jumbo->nexthdr = proto;
293 		hop_jumbo->hdrlen = 0;
294 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
295 		hop_jumbo->tlv_len = 4;
296 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
297 
298 		proto = IPPROTO_HOPOPTS;
299 		seg_len = 0;
300 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
301 	}
302 
303 	skb_push(skb, sizeof(struct ipv6hdr));
304 	skb_reset_network_header(skb);
305 	hdr = ipv6_hdr(skb);
306 
307 	/*
308 	 *	Fill in the IPv6 header
309 	 */
310 	if (np)
311 		hlimit = READ_ONCE(np->hop_limit);
312 	if (hlimit < 0)
313 		hlimit = ip6_dst_hoplimit(dst);
314 
315 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
316 				ip6_autoflowlabel(net, sk), fl6));
317 
318 	hdr->payload_len = htons(seg_len);
319 	hdr->nexthdr = proto;
320 	hdr->hop_limit = hlimit;
321 
322 	hdr->saddr = fl6->saddr;
323 	hdr->daddr = *first_hop;
324 
325 	skb->protocol = htons(ETH_P_IPV6);
326 	skb->priority = priority;
327 	skb->mark = mark;
328 
329 	mtu = dst_mtu(dst);
330 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
331 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
332 
333 		/* if egress device is enslaved to an L3 master device pass the
334 		 * skb to its handler for processing
335 		 */
336 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
337 		if (unlikely(!skb))
338 			return 0;
339 
340 		/* hooks should never assume socket lock is held.
341 		 * we promote our socket to non const
342 		 */
343 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
344 			       net, (struct sock *)sk, skb, NULL, dev,
345 			       dst_output);
346 	}
347 
348 	skb->dev = dev;
349 	/* ipv6_local_error() does not require socket lock,
350 	 * we promote our socket to non const
351 	 */
352 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
353 
354 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
355 	kfree_skb(skb);
356 	return -EMSGSIZE;
357 }
358 EXPORT_SYMBOL(ip6_xmit);
359 
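/*
 * Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain with a matching selector (and, if bound, a matching
 * device).  Returns 1 when the skb was consumed by at least one socket,
 * 0 otherwise.
 */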
360 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
361 {
362 	struct ip6_ra_chain *ra;
363 	struct sock *last = NULL;
364 
365 	read_lock(&ip6_ra_lock);
366 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
367 		struct sock *sk = ra->sk;
368 		if (sk && ra->sel == sel &&
369 		    (!sk->sk_bound_dev_if ||
370 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
371 
372 			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
373 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
374 				continue;
375 			}
376 			if (last) {
377 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
378 				if (skb2)
379 					rawv6_rcv(last, skb2);
380 			}
381 			last = sk;
382 		}
383 	}
384 
385 	if (last) {
386 		rawv6_rcv(last, skb);
387 		read_unlock(&ip6_ra_lock);
388 		return 1;
389 	}
390 	read_unlock(&ip6_ra_lock);
391 	return 0;
392 }
393 
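/*
 * Decide what to do with a packet whose destination we proxy NDP for:
 * return 1 to deliver it locally (unicast neighbour discovery), -1 to
 * drop it (link-local destination), or 0 to keep forwarding it.
 */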
394 static int ip6_forward_proxy_check(struct sk_buff *skb)
395 {
396 	struct ipv6hdr *hdr = ipv6_hdr(skb);
397 	u8 nexthdr = hdr->nexthdr;
398 	__be16 frag_off;
399 	int offset;
400 
401 	if (ipv6_ext_hdr(nexthdr)) {
402 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
403 		if (offset < 0)
404 			return 0;
405 	} else
406 		offset = sizeof(struct ipv6hdr);
407 
408 	if (nexthdr == IPPROTO_ICMPV6) {
409 		struct icmp6hdr *icmp6;
410 
411 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
412 					 offset + 1 - skb->data)))
413 			return 0;
414 
415 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
416 
417 		switch (icmp6->icmp6_type) {
418 		case NDISC_ROUTER_SOLICITATION:
419 		case NDISC_ROUTER_ADVERTISEMENT:
420 		case NDISC_NEIGHBOUR_SOLICITATION:
421 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
422 		case NDISC_REDIRECT:
423 			/* A unicast neighbor discovery message destined to the
424 			 * proxied address must be passed to the input function
425 			 * so it is handled locally.
426 			 */
427 			return 1;
428 		default:
429 			break;
430 		}
431 	}
432 
433 	/*
434 	 * The proxying router can't forward traffic sent to a link-local
435 	 * address, so signal the sender and discard the packet. This
436 	 * behavior is clarified by the MIPv6 specification.
437 	 */
438 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
439 		dst_link_failure(skb);
440 		return -1;
441 	}
442 
443 	return 0;
444 }
445 
446 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
447 				     struct sk_buff *skb)
448 {
449 	struct dst_entry *dst = skb_dst(skb);
450 
451 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
452 
453 #ifdef CONFIG_NET_SWITCHDEV
454 	if (skb->offload_l3_fwd_mark) {
455 		consume_skb(skb);
456 		return 0;
457 	}
458 #endif
459 
460 	skb_clear_tstamp(skb);
461 	return dst_output(net, sk, skb);
462 }
463 
464 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
465 {
466 	if (skb->len <= mtu)
467 		return false;
468 
469 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
470 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
471 		return true;
472 
473 	if (skb->ignore_df)
474 		return false;
475 
476 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
477 		return false;
478 
479 	return true;
480 }
481 
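/*
 * Forwarding path: check that forwarding is enabled and the packet is
 * eligible (hop limit, LRO, xfrm policy), hand Router Alert packets to
 * ip6_call_ra_chain(), let proxy NDP traffic be delivered locally,
 * possibly emit a redirect, enforce the path MTU, then decrement the
 * hop limit and pass the skb through the NF_INET_FORWARD hook.
 */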
482 int ip6_forward(struct sk_buff *skb)
483 {
484 	struct dst_entry *dst = skb_dst(skb);
485 	struct ipv6hdr *hdr = ipv6_hdr(skb);
486 	struct inet6_skb_parm *opt = IP6CB(skb);
487 	struct net *net = dev_net(dst->dev);
488 	struct inet6_dev *idev;
489 	SKB_DR(reason);
490 	u32 mtu;
491 
492 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
493 	if (net->ipv6.devconf_all->forwarding == 0)
494 		goto error;
495 
496 	if (skb->pkt_type != PACKET_HOST)
497 		goto drop;
498 
499 	if (unlikely(skb->sk))
500 		goto drop;
501 
502 	if (skb_warn_if_lro(skb))
503 		goto drop;
504 
505 	if (!net->ipv6.devconf_all->disable_policy &&
506 	    (!idev || !idev->cnf.disable_policy) &&
507 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
508 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
509 		goto drop;
510 	}
511 
512 	skb_forward_csum(skb);
513 
514 	/*
515 	 *	We DO NOT make any processing on
516 	 *	RA packets, pushing them to user level AS IS
517  *	without any WARRANTY that the application will be able
518 	 *	to interpret them. The reason is that we
519 	 *	cannot make anything clever here.
520 	 *
521 	 *	We are not end-node, so that if packet contains
522 	 *	AH/ESP, we cannot make anything.
523  *	Defragmentation also would be a mistake: RA packets
524  *	cannot be fragmented, because there is no guarantee
525 	 *	that different fragments will go along one path. --ANK
526 	 */
527 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
528 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
529 			return 0;
530 	}
531 
532 	/*
533 	 *	check and decrement ttl
534 	 */
535 	if (hdr->hop_limit <= 1) {
536 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
537 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
538 
539 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
540 		return -ETIMEDOUT;
541 	}
542 
543 	/* XXX: idev->cnf.proxy_ndp? */
544 	if (net->ipv6.devconf_all->proxy_ndp &&
545 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
546 		int proxied = ip6_forward_proxy_check(skb);
547 		if (proxied > 0) {
548 			/* It's tempting to decrease the hop limit
549 			 * here by 1, as we do at the end of the
550 			 * function too.
551 			 *
552 			 * But that would be incorrect, as proxying is
553 			 * not forwarding.  The ip6_input function
554 			 * will handle this packet locally, and it
555 			 * depends on the hop limit being unchanged.
556 			 *
557 			 * One example is the NDP hop limit, which
558 			 * always has to stay 255, but another would be
559 			 * similar checks around RA packets, where the
560 			 * user can even change the desired limit.
561 			 */
562 			return ip6_input(skb);
563 		} else if (proxied < 0) {
564 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
565 			goto drop;
566 		}
567 	}
568 
569 	if (!xfrm6_route_forward(skb)) {
570 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
571 		SKB_DR_SET(reason, XFRM_POLICY);
572 		goto drop;
573 	}
574 	dst = skb_dst(skb);
575 
576 	/* IPv6 specs say nothing about it, but it is clear that we cannot
577 	   send redirects to source routed frames.
578 	   We don't send redirects to frames decapsulated from IPsec.
579 	 */
580 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
581 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
582 		struct in6_addr *target = NULL;
583 		struct inet_peer *peer;
584 		struct rt6_info *rt;
585 
586 		/*
587 		 *	incoming and outgoing devices are the same
588 		 *	send a redirect.
589 		 */
590 
591 		rt = (struct rt6_info *) dst;
592 		if (rt->rt6i_flags & RTF_GATEWAY)
593 			target = &rt->rt6i_gateway;
594 		else
595 			target = &hdr->daddr;
596 
597 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
598 
599 		/* Limit redirects both by destination (here)
600 		   and by source (inside ndisc_send_redirect)
601 		 */
602 		if (inet_peer_xrlim_allow(peer, 1*HZ))
603 			ndisc_send_redirect(skb, target);
604 		if (peer)
605 			inet_putpeer(peer);
606 	} else {
607 		int addrtype = ipv6_addr_type(&hdr->saddr);
608 
609 		/* This check is security critical. */
610 		if (addrtype == IPV6_ADDR_ANY ||
611 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
612 			goto error;
613 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
614 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
615 				    ICMPV6_NOT_NEIGHBOUR, 0);
616 			goto error;
617 		}
618 	}
619 
620 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
621 	if (mtu < IPV6_MIN_MTU)
622 		mtu = IPV6_MIN_MTU;
623 
624 	if (ip6_pkt_too_big(skb, mtu)) {
625 		/* Again, force OUTPUT device used as source address */
626 		skb->dev = dst->dev;
627 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
628 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
629 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
630 				IPSTATS_MIB_FRAGFAILS);
631 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
632 		return -EMSGSIZE;
633 	}
634 
635 	if (skb_cow(skb, dst->dev->hard_header_len)) {
636 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
637 				IPSTATS_MIB_OUTDISCARDS);
638 		goto drop;
639 	}
640 
641 	hdr = ipv6_hdr(skb);
642 
643 	/* Mangling hops number delayed to point after skb COW */
644 
645 	hdr->hop_limit--;
646 
647 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
648 		       net, NULL, skb, skb->dev, dst->dev,
649 		       ip6_forward_finish);
650 
651 error:
652 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
653 	SKB_DR_SET(reason, IP_INADDRERRORS);
654 drop:
655 	kfree_skb_reason(skb, reason);
656 	return -EINVAL;
657 }
658 
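/* Propagate per-packet metadata from the original skb to a fragment. */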
659 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
660 {
661 	to->pkt_type = from->pkt_type;
662 	to->priority = from->priority;
663 	to->protocol = from->protocol;
664 	skb_dst_drop(to);
665 	skb_dst_set(to, dst_clone(skb_dst(from)));
666 	to->dev = from->dev;
667 	to->mark = from->mark;
668 
669 	skb_copy_hash(to, from);
670 
671 #ifdef CONFIG_NET_SCHED
672 	to->tc_index = from->tc_index;
673 #endif
674 	nf_copy(to, from);
675 	skb_ext_copy(to, from);
676 	skb_copy_secmark(to, from);
677 }
678 
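/*
 * Fast-path fragmentation helpers: when the skb already carries a
 * suitably sized frag_list, ip6_fraglist_init() turns the head skb into
 * the first fragment (saving a copy of the unfragmentable part and
 * inserting the Fragment header) and ip6_fraglist_prepare() fixes up
 * each following fragment before it is sent.
 */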
679 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
680 		      u8 nexthdr, __be32 frag_id,
681 		      struct ip6_fraglist_iter *iter)
682 {
683 	unsigned int first_len;
684 	struct frag_hdr *fh;
685 
686 	/* BUILD HEADER */
687 	*prevhdr = NEXTHDR_FRAGMENT;
688 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
689 	if (!iter->tmp_hdr)
690 		return -ENOMEM;
691 
692 	iter->frag = skb_shinfo(skb)->frag_list;
693 	skb_frag_list_init(skb);
694 
695 	iter->offset = 0;
696 	iter->hlen = hlen;
697 	iter->frag_id = frag_id;
698 	iter->nexthdr = nexthdr;
699 
700 	__skb_pull(skb, hlen);
701 	fh = __skb_push(skb, sizeof(struct frag_hdr));
702 	__skb_push(skb, hlen);
703 	skb_reset_network_header(skb);
704 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
705 
706 	fh->nexthdr = nexthdr;
707 	fh->reserved = 0;
708 	fh->frag_off = htons(IP6_MF);
709 	fh->identification = frag_id;
710 
711 	first_len = skb_pagelen(skb);
712 	skb->data_len = first_len - skb_headlen(skb);
713 	skb->len = first_len;
714 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
715 
716 	return 0;
717 }
718 EXPORT_SYMBOL(ip6_fraglist_init);
719 
720 void ip6_fraglist_prepare(struct sk_buff *skb,
721 			  struct ip6_fraglist_iter *iter)
722 {
723 	struct sk_buff *frag = iter->frag;
724 	unsigned int hlen = iter->hlen;
725 	struct frag_hdr *fh;
726 
727 	frag->ip_summed = CHECKSUM_NONE;
728 	skb_reset_transport_header(frag);
729 	fh = __skb_push(frag, sizeof(struct frag_hdr));
730 	__skb_push(frag, hlen);
731 	skb_reset_network_header(frag);
732 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
733 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
734 	fh->nexthdr = iter->nexthdr;
735 	fh->reserved = 0;
736 	fh->frag_off = htons(iter->offset);
737 	if (frag->next)
738 		fh->frag_off |= htons(IP6_MF);
739 	fh->identification = iter->frag_id;
740 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
741 	ip6_copy_metadata(frag, skb);
742 }
743 EXPORT_SYMBOL(ip6_fraglist_prepare);
744 
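/*
 * Slow-path fragmentation helpers: ip6_frag_init() records the geometry
 * of the datagram, ip6_frag_next() then allocates each fragment in
 * turn, copying the unfragmentable part and the next block of payload
 * into it and filling in the Fragment header.
 */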
745 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
746 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
747 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
748 {
749 	state->prevhdr = prevhdr;
750 	state->nexthdr = nexthdr;
751 	state->frag_id = frag_id;
752 
753 	state->hlen = hlen;
754 	state->mtu = mtu;
755 
756 	state->left = skb->len - hlen;	/* Space per frame */
757 	state->ptr = hlen;		/* Where to start from */
758 
759 	state->hroom = hdr_room;
760 	state->troom = needed_tailroom;
761 
762 	state->offset = 0;
763 }
764 EXPORT_SYMBOL(ip6_frag_init);
765 
766 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
767 {
768 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
769 	struct sk_buff *frag;
770 	struct frag_hdr *fh;
771 	unsigned int len;
772 
773 	len = state->left;
774 	/* IF: it doesn't fit, use 'mtu' - the data space left */
775 	if (len > state->mtu)
776 		len = state->mtu;
777 	/* IF: we are not sending up to and including the packet end
778 	   then align the next start on an eight byte boundary */
779 	if (len < state->left)
780 		len &= ~7;
781 
782 	/* Allocate buffer */
783 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
784 			 state->hroom + state->troom, GFP_ATOMIC);
785 	if (!frag)
786 		return ERR_PTR(-ENOMEM);
787 
788 	/*
789 	 *	Set up data on packet
790 	 */
791 
792 	ip6_copy_metadata(frag, skb);
793 	skb_reserve(frag, state->hroom);
794 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
795 	skb_reset_network_header(frag);
796 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
797 	frag->transport_header = (frag->network_header + state->hlen +
798 				  sizeof(struct frag_hdr));
799 
800 	/*
801 	 *	Charge the memory for the fragment to any owner
802 	 *	it might possess
803 	 */
804 	if (skb->sk)
805 		skb_set_owner_w(frag, skb->sk);
806 
807 	/*
808 	 *	Copy the packet header into the new buffer.
809 	 */
810 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
811 
812 	fragnexthdr_offset = skb_network_header(frag);
813 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
814 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
815 
816 	/*
817 	 *	Build fragment header.
818 	 */
819 	fh->nexthdr = state->nexthdr;
820 	fh->reserved = 0;
821 	fh->identification = state->frag_id;
822 
823 	/*
824 	 *	Copy a block of the IP datagram.
825 	 */
826 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
827 			     len));
828 	state->left -= len;
829 
830 	fh->frag_off = htons(state->offset);
831 	if (state->left > 0)
832 		fh->frag_off |= htons(IP6_MF);
833 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
834 
835 	state->ptr += len;
836 	state->offset += len;
837 
838 	return frag;
839 }
840 EXPORT_SYMBOL(ip6_frag_next);
841 
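/*
 * Fragment an IPv6 datagram: locate the insertion point for the
 * Fragment header, derive the fragment size from the path MTU (capped
 * by frag_max_size and the socket's frag_size), then either reuse an
 * existing frag_list (fast path) or build each fragment from scratch
 * (slow path), passing every fragment to @output.
 */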
842 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
843 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
844 {
845 	struct sk_buff *frag;
846 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
847 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
848 				inet6_sk(skb->sk) : NULL;
849 	bool mono_delivery_time = skb->mono_delivery_time;
850 	struct ip6_frag_state state;
851 	unsigned int mtu, hlen, nexthdr_offset;
852 	ktime_t tstamp = skb->tstamp;
853 	int hroom, err = 0;
854 	__be32 frag_id;
855 	u8 *prevhdr, nexthdr = 0;
856 
857 	err = ip6_find_1stfragopt(skb, &prevhdr);
858 	if (err < 0)
859 		goto fail;
860 	hlen = err;
861 	nexthdr = *prevhdr;
862 	nexthdr_offset = prevhdr - skb_network_header(skb);
863 
864 	mtu = ip6_skb_dst_mtu(skb);
865 
866 	/* We must not fragment if the socket is set to force MTU discovery
867 	 * or if the skb is not generated by a local socket.
868 	 */
869 	if (unlikely(!skb->ignore_df && skb->len > mtu))
870 		goto fail_toobig;
871 
872 	if (IP6CB(skb)->frag_max_size) {
873 		if (IP6CB(skb)->frag_max_size > mtu)
874 			goto fail_toobig;
875 
876 		/* don't send fragments larger than what we received */
877 		mtu = IP6CB(skb)->frag_max_size;
878 		if (mtu < IPV6_MIN_MTU)
879 			mtu = IPV6_MIN_MTU;
880 	}
881 
882 	if (np) {
883 		u32 frag_size = READ_ONCE(np->frag_size);
884 
885 		if (frag_size && frag_size < mtu)
886 			mtu = frag_size;
887 	}
888 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
889 		goto fail_toobig;
890 	mtu -= hlen + sizeof(struct frag_hdr);
891 
892 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
893 				    &ipv6_hdr(skb)->saddr);
894 
895 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
896 	    (err = skb_checksum_help(skb)))
897 		goto fail;
898 
899 	prevhdr = skb_network_header(skb) + nexthdr_offset;
900 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
901 	if (skb_has_frag_list(skb)) {
902 		unsigned int first_len = skb_pagelen(skb);
903 		struct ip6_fraglist_iter iter;
904 		struct sk_buff *frag2;
905 
906 		if (first_len - hlen > mtu ||
907 		    ((first_len - hlen) & 7) ||
908 		    skb_cloned(skb) ||
909 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
910 			goto slow_path;
911 
912 		skb_walk_frags(skb, frag) {
913 			/* Correct geometry. */
914 			if (frag->len > mtu ||
915 			    ((frag->len & 7) && frag->next) ||
916 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
917 				goto slow_path_clean;
918 
919 			/* Partially cloned skb? */
920 			if (skb_shared(frag))
921 				goto slow_path_clean;
922 
923 			BUG_ON(frag->sk);
924 			if (skb->sk) {
925 				frag->sk = skb->sk;
926 				frag->destructor = sock_wfree;
927 			}
928 			skb->truesize -= frag->truesize;
929 		}
930 
931 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
932 					&iter);
933 		if (err < 0)
934 			goto fail;
935 
936 		/* We prevent @rt from being freed. */
937 		rcu_read_lock();
938 
939 		for (;;) {
940 			/* Prepare header of the next frame,
941 			 * before previous one went down. */
942 			if (iter.frag)
943 				ip6_fraglist_prepare(skb, &iter);
944 
945 			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
946 			err = output(net, sk, skb);
947 			if (!err)
948 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
949 					      IPSTATS_MIB_FRAGCREATES);
950 
951 			if (err || !iter.frag)
952 				break;
953 
954 			skb = ip6_fraglist_next(&iter);
955 		}
956 
957 		kfree(iter.tmp_hdr);
958 
959 		if (err == 0) {
960 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
961 				      IPSTATS_MIB_FRAGOKS);
962 			rcu_read_unlock();
963 			return 0;
964 		}
965 
966 		kfree_skb_list(iter.frag);
967 
968 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
969 			      IPSTATS_MIB_FRAGFAILS);
970 		rcu_read_unlock();
971 		return err;
972 
973 slow_path_clean:
974 		skb_walk_frags(skb, frag2) {
975 			if (frag2 == frag)
976 				break;
977 			frag2->sk = NULL;
978 			frag2->destructor = NULL;
979 			skb->truesize += frag2->truesize;
980 		}
981 	}
982 
983 slow_path:
984 	/*
985 	 *	Fragment the datagram.
986 	 */
987 
988 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
989 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
990 		      &state);
991 
992 	/*
993 	 *	Keep copying data until we run out.
994 	 */
995 
996 	while (state.left > 0) {
997 		frag = ip6_frag_next(skb, &state);
998 		if (IS_ERR(frag)) {
999 			err = PTR_ERR(frag);
1000 			goto fail;
1001 		}
1002 
1003 		/*
1004 		 *	Put this fragment into the sending queue.
1005 		 */
1006 		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1007 		err = output(net, sk, frag);
1008 		if (err)
1009 			goto fail;
1010 
1011 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1012 			      IPSTATS_MIB_FRAGCREATES);
1013 	}
1014 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1015 		      IPSTATS_MIB_FRAGOKS);
1016 	consume_skb(skb);
1017 	return err;
1018 
1019 fail_toobig:
1020 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1021 		sk_gso_disable(skb->sk);
1022 
1023 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1024 	err = -EMSGSIZE;
1025 
1026 fail:
1027 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1028 		      IPSTATS_MIB_FRAGFAILS);
1029 	kfree_skb(skb);
1030 	return err;
1031 }
1032 
1033 static inline int ip6_rt_check(const struct rt6key *rt_key,
1034 			       const struct in6_addr *fl_addr,
1035 			       const struct in6_addr *addr_cache)
1036 {
1037 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1038 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1039 }
1040 
1041 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1042 					  struct dst_entry *dst,
1043 					  const struct flowi6 *fl6)
1044 {
1045 	struct ipv6_pinfo *np = inet6_sk(sk);
1046 	struct rt6_info *rt;
1047 
1048 	if (!dst)
1049 		goto out;
1050 
1051 	if (dst->ops->family != AF_INET6) {
1052 		dst_release(dst);
1053 		return NULL;
1054 	}
1055 
1056 	rt = (struct rt6_info *)dst;
1057 	/* Yes, checking route validity in the not-connected
1058 	 * case is not very simple. Take into account
1059 	 * that we do not support routing by source, TOS,
1060 	 * and MSG_DONTROUTE		--ANK (980726)
1061 	 *
1062 	 * 1. ip6_rt_check(): If route was host route,
1063 	 *    check that cached destination is current.
1064 	 *    If it is network route, we still may
1065 	 *    check its validity using saved pointer
1066 	 *    to the last used address: daddr_cache.
1067 	 *    We do not want to save whole address now,
1068 	 *    (because the main consumer of this service
1069 	 *    is tcp, which does not have this problem),
1070 	 *    so that the last trick works only on connected
1071 	 *    sockets.
1072 	 * 2. oif also should be the same.
1073 	 */
1074 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1075 #ifdef CONFIG_IPV6_SUBTREES
1076 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1077 #endif
1078 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1079 		dst_release(dst);
1080 		dst = NULL;
1081 	}
1082 
1083 out:
1084 	return dst;
1085 }
1086 
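/*
 * Core of the route lookup: pick a source address when the caller left
 * it unspecified, do the routing table lookup, and (with optimistic
 * DAD) fall back to the default router's route when the next hop is
 * unresolved and the chosen source address is still optimistic.
 * Returns 0 with *dst set, or a negative errno with *dst released.
 */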
1087 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1088 			       struct dst_entry **dst, struct flowi6 *fl6)
1089 {
1090 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1091 	struct neighbour *n;
1092 	struct rt6_info *rt;
1093 #endif
1094 	int err;
1095 	int flags = 0;
1096 
1097 	/* The correct way to handle this would be to do
1098 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1099 	 * the route-specific preferred source forces the
1100 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1101 	 *
1102 	 * In source specific routing (no src=any default route),
1103 	 * ip6_route_output will fail given src=any saddr, though, so
1104 	 * that's why we try it again later.
1105 	 */
1106 	if (ipv6_addr_any(&fl6->saddr)) {
1107 		struct fib6_info *from;
1108 		struct rt6_info *rt;
1109 
1110 		*dst = ip6_route_output(net, sk, fl6);
1111 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1112 
1113 		rcu_read_lock();
1114 		from = rt ? rcu_dereference(rt->from) : NULL;
1115 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1116 					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
1117 					  &fl6->saddr);
1118 		rcu_read_unlock();
1119 
1120 		if (err)
1121 			goto out_err_release;
1122 
1123 		/* If we had an erroneous initial result, pretend it
1124 		 * never existed and let the SA-enabled version take
1125 		 * over.
1126 		 */
1127 		if ((*dst)->error) {
1128 			dst_release(*dst);
1129 			*dst = NULL;
1130 		}
1131 
1132 		if (fl6->flowi6_oif)
1133 			flags |= RT6_LOOKUP_F_IFACE;
1134 	}
1135 
1136 	if (!*dst)
1137 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1138 
1139 	err = (*dst)->error;
1140 	if (err)
1141 		goto out_err_release;
1142 
1143 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1144 	/*
1145 	 * Here if the dst entry we've looked up
1146 	 * has a neighbour entry that is in the INCOMPLETE
1147 	 * state and the src address from the flow is
1148 	 * marked as OPTIMISTIC, we release the found
1149 	 * dst entry and replace it instead with the
1150 	 * dst entry of the nexthop router
1151 	 */
1152 	rt = (struct rt6_info *) *dst;
1153 	rcu_read_lock();
1154 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1155 				      rt6_nexthop(rt, &fl6->daddr));
1156 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1157 	rcu_read_unlock();
1158 
1159 	if (err) {
1160 		struct inet6_ifaddr *ifp;
1161 		struct flowi6 fl_gw6;
1162 		int redirect;
1163 
1164 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1165 				      (*dst)->dev, 1);
1166 
1167 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1168 		if (ifp)
1169 			in6_ifa_put(ifp);
1170 
1171 		if (redirect) {
1172 			/*
1173 			 * We need to get the dst entry for the
1174 			 * default router instead
1175 			 */
1176 			dst_release(*dst);
1177 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1178 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1179 			*dst = ip6_route_output(net, sk, &fl_gw6);
1180 			err = (*dst)->error;
1181 			if (err)
1182 				goto out_err_release;
1183 		}
1184 	}
1185 #endif
1186 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1187 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1188 		err = -EAFNOSUPPORT;
1189 		goto out_err_release;
1190 	}
1191 
1192 	return 0;
1193 
1194 out_err_release:
1195 	dst_release(*dst);
1196 	*dst = NULL;
1197 
1198 	if (err == -ENETUNREACH)
1199 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1200 	return err;
1201 }
1202 
1203 /**
1204  *	ip6_dst_lookup - perform route lookup on flow
1205  *	@net: Network namespace to perform lookup in
1206  *	@sk: socket which provides route info
1207  *	@dst: pointer to dst_entry * for result
1208  *	@fl6: flow to lookup
1209  *
1210  *	This function performs a route lookup on the given flow.
1211  *
1212  *	It returns zero on success, or a standard errno code on error.
1213  */
1214 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1215 		   struct flowi6 *fl6)
1216 {
1217 	*dst = NULL;
1218 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1219 }
1220 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1221 
1222 /**
1223  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1224  *	@net: Network namespace to perform lookup in
1225  *	@sk: socket which provides route info
1226  *	@fl6: flow to lookup
1227  *	@final_dst: final destination address for ipsec lookup
1228  *
1229  *	This function performs a route lookup on the given flow.
1230  *
1231  *	It returns a valid dst pointer on success, or a pointer encoded
1232  *	error code.
1233  */
1234 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1235 				      const struct in6_addr *final_dst)
1236 {
1237 	struct dst_entry *dst = NULL;
1238 	int err;
1239 
1240 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1241 	if (err)
1242 		return ERR_PTR(err);
1243 	if (final_dst)
1244 		fl6->daddr = *final_dst;
1245 
1246 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1247 }
1248 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1249 
1250 /**
1251  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1252  *	@sk: socket which provides the dst cache and route info
1253  *	@fl6: flow to lookup
1254  *	@final_dst: final destination address for ipsec lookup
1255  *	@connected: whether @sk is connected or not
1256  *
1257  *	This function performs a route lookup on the given flow with the
1258  *	possibility of using the cached route in the socket if it is valid.
1259  *	It will take the socket dst lock when operating on the dst cache.
1260  *	As a result, this function can only be used in process context.
1261  *
1262  *	In addition, for a connected socket, cache the dst in the socket
1263  *	if the current cache is not valid.
1264  *
1265  *	It returns a valid dst pointer on success, or a pointer encoded
1266  *	error code.
1267  */
1268 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1269 					 const struct in6_addr *final_dst,
1270 					 bool connected)
1271 {
1272 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1273 
1274 	dst = ip6_sk_dst_check(sk, dst, fl6);
1275 	if (dst)
1276 		return dst;
1277 
1278 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1279 	if (connected && !IS_ERR(dst))
1280 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1281 
1282 	return dst;
1283 }
1284 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1285 
1286 /**
1287  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1288  *      @skb: Packet for which lookup is done
1289  *      @dev: Tunnel device
1290  *      @net: Network namespace of tunnel device
1291  *      @sock: Socket which provides route info
1292  *      @saddr: Memory to store the src ip address
1293  *      @info: Tunnel information
1294  *      @protocol: IP protocol
1295  *      @use_cache: Flag to enable cache usage
1296  *      This function performs a route lookup on a tunnel
1297  *
1298  *      It returns a valid dst pointer and stores src address to be used in
1299  *      tunnel in param saddr on success, else a pointer encoded error code.
1300  */
1301 
1302 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1303 					struct net_device *dev,
1304 					struct net *net,
1305 					struct socket *sock,
1306 					struct in6_addr *saddr,
1307 					const struct ip_tunnel_info *info,
1308 					u8 protocol,
1309 					bool use_cache)
1310 {
1311 	struct dst_entry *dst = NULL;
1312 #ifdef CONFIG_DST_CACHE
1313 	struct dst_cache *dst_cache;
1314 #endif
1315 	struct flowi6 fl6;
1316 	__u8 prio;
1317 
1318 #ifdef CONFIG_DST_CACHE
1319 	dst_cache = (struct dst_cache *)&info->dst_cache;
1320 	if (use_cache) {
1321 		dst = dst_cache_get_ip6(dst_cache, saddr);
1322 		if (dst)
1323 			return dst;
1324 	}
1325 #endif
1326 	memset(&fl6, 0, sizeof(fl6));
1327 	fl6.flowi6_mark = skb->mark;
1328 	fl6.flowi6_proto = protocol;
1329 	fl6.daddr = info->key.u.ipv6.dst;
1330 	fl6.saddr = info->key.u.ipv6.src;
1331 	prio = info->key.tos;
1332 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1333 
1334 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1335 					      NULL);
1336 	if (IS_ERR(dst)) {
1337 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1338 		return ERR_PTR(-ENETUNREACH);
1339 	}
1340 	if (dst->dev == dev) { /* is this necessary? */
1341 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1342 		dst_release(dst);
1343 		return ERR_PTR(-ELOOP);
1344 	}
1345 #ifdef CONFIG_DST_CACHE
1346 	if (use_cache)
1347 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1348 #endif
1349 	*saddr = fl6.saddr;
1350 	return dst;
1351 }
1352 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1353 
1354 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1355 					       gfp_t gfp)
1356 {
1357 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1358 }
1359 
1360 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1361 						gfp_t gfp)
1362 {
1363 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1364 }
1365 
1366 static void ip6_append_data_mtu(unsigned int *mtu,
1367 				int *maxfraglen,
1368 				unsigned int fragheaderlen,
1369 				struct sk_buff *skb,
1370 				struct rt6_info *rt,
1371 				unsigned int orig_mtu)
1372 {
1373 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1374 		if (!skb) {
1375 			/* first fragment, reserve header_len */
1376 			*mtu = orig_mtu - rt->dst.header_len;
1377 
1378 		} else {
1379 			/*
1380 			 * this fragment is not first, the headers
1381 			 * space is regarded as data space.
1382 			 */
1383 			*mtu = orig_mtu;
1384 		}
1385 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1386 			      + fragheaderlen - sizeof(struct frag_hdr);
1387 	}
1388 }
1389 
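/*
 * Prepare the cork for a sendmsg() sequence: take over the route
 * reference, make a private copy of the supplied IPv6 tx options, and
 * work out the MTU (and IPCORK_ALLFRAG flag) that later
 * ip6_append_data() calls will use.
 */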
1390 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1391 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1392 			  struct rt6_info *rt)
1393 {
1394 	struct ipv6_pinfo *np = inet6_sk(sk);
1395 	unsigned int mtu, frag_size;
1396 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1397 
1398 	/* callers pass dst together with a reference, set it first so
1399 	 * ip6_cork_release() can put it down even in case of an error.
1400 	 */
1401 	cork->base.dst = &rt->dst;
1402 
1403 	/*
1404 	 * setup for corking
1405 	 */
1406 	if (opt) {
1407 		if (WARN_ON(v6_cork->opt))
1408 			return -EINVAL;
1409 
1410 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1411 		if (unlikely(!nopt))
1412 			return -ENOBUFS;
1413 
1414 		nopt->tot_len = sizeof(*opt);
1415 		nopt->opt_flen = opt->opt_flen;
1416 		nopt->opt_nflen = opt->opt_nflen;
1417 
1418 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1419 		if (opt->dst0opt && !nopt->dst0opt)
1420 			return -ENOBUFS;
1421 
1422 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1423 		if (opt->dst1opt && !nopt->dst1opt)
1424 			return -ENOBUFS;
1425 
1426 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1427 		if (opt->hopopt && !nopt->hopopt)
1428 			return -ENOBUFS;
1429 
1430 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1431 		if (opt->srcrt && !nopt->srcrt)
1432 			return -ENOBUFS;
1433 
1434 		/* need source address above miyazawa*/
1435 	}
1436 	v6_cork->hop_limit = ipc6->hlimit;
1437 	v6_cork->tclass = ipc6->tclass;
1438 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1439 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1440 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1441 	else
1442 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1443 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1444 
1445 	frag_size = READ_ONCE(np->frag_size);
1446 	if (frag_size && frag_size < mtu)
1447 		mtu = frag_size;
1448 
1449 	cork->base.fragsize = mtu;
1450 	cork->base.gso_size = ipc6->gso_size;
1451 	cork->base.tx_flags = 0;
1452 	cork->base.mark = ipc6->sockc.mark;
1453 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1454 
1455 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1456 		cork->base.flags |= IPCORK_ALLFRAG;
1457 	cork->base.length = 0;
1458 
1459 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1460 
1461 	return 0;
1462 }
1463 
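/*
 * Append @length bytes from @from to the queue of pending packets,
 * filling the tail skb up to the (fragment-aligned) MTU and allocating
 * new skbs as needed.  Handles zerocopy (MSG_ZEROCOPY and msg_ubuf),
 * MSG_SPLICE_PAGES and paged GSO packets, and charges the allocated
 * memory to the socket.
 */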
1464 static int __ip6_append_data(struct sock *sk,
1465 			     struct sk_buff_head *queue,
1466 			     struct inet_cork_full *cork_full,
1467 			     struct inet6_cork *v6_cork,
1468 			     struct page_frag *pfrag,
1469 			     int getfrag(void *from, char *to, int offset,
1470 					 int len, int odd, struct sk_buff *skb),
1471 			     void *from, size_t length, int transhdrlen,
1472 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1473 {
1474 	struct sk_buff *skb, *skb_prev = NULL;
1475 	struct inet_cork *cork = &cork_full->base;
1476 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1477 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1478 	struct ubuf_info *uarg = NULL;
1479 	int exthdrlen = 0;
1480 	int dst_exthdrlen = 0;
1481 	int hh_len;
1482 	int copy;
1483 	int err;
1484 	int offset = 0;
1485 	bool zc = false;
1486 	u32 tskey = 0;
1487 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1488 	struct ipv6_txoptions *opt = v6_cork->opt;
1489 	int csummode = CHECKSUM_NONE;
1490 	unsigned int maxnonfragsize, headersize;
1491 	unsigned int wmem_alloc_delta = 0;
1492 	bool paged, extra_uref = false;
1493 
1494 	skb = skb_peek_tail(queue);
1495 	if (!skb) {
1496 		exthdrlen = opt ? opt->opt_flen : 0;
1497 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1498 	}
1499 
1500 	paged = !!cork->gso_size;
1501 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1502 	orig_mtu = mtu;
1503 
1504 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1505 	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
1506 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1507 
1508 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1509 
1510 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1511 			(opt ? opt->opt_nflen : 0);
1512 
1513 	headersize = sizeof(struct ipv6hdr) +
1514 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1515 		     (dst_allfrag(&rt->dst) ?
1516 		      sizeof(struct frag_hdr) : 0) +
1517 		     rt->rt6i_nfheader_len;
1518 
1519 	if (mtu <= fragheaderlen ||
1520 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1521 		goto emsgsize;
1522 
1523 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1524 		     sizeof(struct frag_hdr);
1525 
1526 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1527 	 * the first fragment
1528 	 */
1529 	if (headersize + transhdrlen > mtu)
1530 		goto emsgsize;
1531 
1532 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1533 	    (sk->sk_protocol == IPPROTO_UDP ||
1534 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1535 	     sk->sk_protocol == IPPROTO_RAW)) {
1536 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1537 				sizeof(struct ipv6hdr));
1538 		goto emsgsize;
1539 	}
1540 
1541 	if (ip6_sk_ignore_df(sk))
1542 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1543 	else
1544 		maxnonfragsize = mtu;
1545 
1546 	if (cork->length + length > maxnonfragsize - headersize) {
1547 emsgsize:
1548 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1549 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1550 		return -EMSGSIZE;
1551 	}
1552 
1553 	/* CHECKSUM_PARTIAL only with no extension headers and when
1554 	 * we are not going to fragment
1555 	 */
1556 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1557 	    headersize == sizeof(struct ipv6hdr) &&
1558 	    length <= mtu - headersize &&
1559 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1560 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1561 		csummode = CHECKSUM_PARTIAL;
1562 
1563 	if ((flags & MSG_ZEROCOPY) && length) {
1564 		struct msghdr *msg = from;
1565 
1566 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1567 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1568 				return -EINVAL;
1569 
1570 			/* Leave uarg NULL if we can't zerocopy; callers should
1571 			 * be able to handle it.
1572 			 */
1573 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1574 			    csummode == CHECKSUM_PARTIAL) {
1575 				paged = true;
1576 				zc = true;
1577 				uarg = msg->msg_ubuf;
1578 			}
1579 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1580 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1581 			if (!uarg)
1582 				return -ENOBUFS;
1583 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1584 			if (rt->dst.dev->features & NETIF_F_SG &&
1585 			    csummode == CHECKSUM_PARTIAL) {
1586 				paged = true;
1587 				zc = true;
1588 			} else {
1589 				uarg_to_msgzc(uarg)->zerocopy = 0;
1590 				skb_zcopy_set(skb, uarg, &extra_uref);
1591 			}
1592 		}
1593 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1594 		if (inet_test_bit(HDRINCL, sk))
1595 			return -EPERM;
1596 		if (rt->dst.dev->features & NETIF_F_SG &&
1597 		    getfrag == ip_generic_getfrag)
1598 			/* We need an empty buffer to attach stuff to */
1599 			paged = true;
1600 		else
1601 			flags &= ~MSG_SPLICE_PAGES;
1602 	}
1603 
1604 	/*
1605 	 * Let's try using as much space as possible.
1606 	 * Use MTU if total length of the message fits into the MTU.
1607 	 * Otherwise, we need to reserve fragment header and
1608 	 * fragment alignment (= 8-15 octets, in total).
1609 	 *
1610 	 * Note that we may need to "move" the data from the tail
1611 	 * of the buffer to the new fragment when we split
1612 	 * the message.
1613 	 *
1614 	 * FIXME: It may be fragmented into multiple chunks
1615 	 *        at once if non-fragmentable extension headers
1616 	 *        are too large.
1617 	 * --yoshfuji
1618 	 */
1619 
1620 	cork->length += length;
1621 	if (!skb)
1622 		goto alloc_new_skb;
1623 
1624 	while (length > 0) {
1625 		/* Check if the remaining data fits into current packet. */
1626 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1627 		if (copy < length)
1628 			copy = maxfraglen - skb->len;
1629 
1630 		if (copy <= 0) {
1631 			char *data;
1632 			unsigned int datalen;
1633 			unsigned int fraglen;
1634 			unsigned int fraggap;
1635 			unsigned int alloclen, alloc_extra;
1636 			unsigned int pagedlen;
1637 alloc_new_skb:
1638 			/* There's no room in the current skb */
1639 			if (skb)
1640 				fraggap = skb->len - maxfraglen;
1641 			else
1642 				fraggap = 0;
1643 			/* update mtu and maxfraglen if necessary */
1644 			if (!skb || !skb_prev)
1645 				ip6_append_data_mtu(&mtu, &maxfraglen,
1646 						    fragheaderlen, skb, rt,
1647 						    orig_mtu);
1648 
1649 			skb_prev = skb;
1650 
1651 			/*
1652 			 * If remaining data exceeds the mtu,
1653 			 * we know we need more fragment(s).
1654 			 */
1655 			datalen = length + fraggap;
1656 
1657 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1658 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1659 			fraglen = datalen + fragheaderlen;
1660 			pagedlen = 0;
1661 
1662 			alloc_extra = hh_len;
1663 			alloc_extra += dst_exthdrlen;
1664 			alloc_extra += rt->dst.trailer_len;
1665 
1666 			/* We just reserve space for fragment header.
1667 			 * Note: this may be overallocation if the message
1668 			 * (without MSG_MORE) fits into the MTU.
1669 			 */
1670 			alloc_extra += sizeof(struct frag_hdr);
1671 
1672 			if ((flags & MSG_MORE) &&
1673 			    !(rt->dst.dev->features&NETIF_F_SG))
1674 				alloclen = mtu;
1675 			else if (!paged &&
1676 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1677 				  !(rt->dst.dev->features & NETIF_F_SG)))
1678 				alloclen = fraglen;
1679 			else {
1680 				alloclen = fragheaderlen + transhdrlen;
1681 				pagedlen = datalen - transhdrlen;
1682 			}
1683 			alloclen += alloc_extra;
1684 
1685 			if (datalen != length + fraggap) {
1686 				/*
1687 				 * this is not the last fragment, the trailer
1688 				 * space is regarded as data space.
1689 				 */
1690 				datalen += rt->dst.trailer_len;
1691 			}
1692 
1693 			fraglen = datalen + fragheaderlen;
1694 
1695 			copy = datalen - transhdrlen - fraggap - pagedlen;
1696 			/* [!] NOTE: copy may be negative if pagedlen>0
1697 			 * because then the equation may reduce to -fraggap.
1698 			 */
1699 			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1700 				err = -EINVAL;
1701 				goto error;
1702 			}
1703 			if (transhdrlen) {
1704 				skb = sock_alloc_send_skb(sk, alloclen,
1705 						(flags & MSG_DONTWAIT), &err);
1706 			} else {
1707 				skb = NULL;
1708 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1709 				    2 * sk->sk_sndbuf)
1710 					skb = alloc_skb(alloclen,
1711 							sk->sk_allocation);
1712 				if (unlikely(!skb))
1713 					err = -ENOBUFS;
1714 			}
1715 			if (!skb)
1716 				goto error;
1717 			/*
1718 			 *	Fill in the control structures
1719 			 */
1720 			skb->protocol = htons(ETH_P_IPV6);
1721 			skb->ip_summed = csummode;
1722 			skb->csum = 0;
1723 			/* reserve for fragmentation and ipsec header */
1724 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1725 				    dst_exthdrlen);
1726 
1727 			/*
1728 			 *	Find where to start putting bytes
1729 			 */
1730 			data = skb_put(skb, fraglen - pagedlen);
1731 			skb_set_network_header(skb, exthdrlen);
1732 			data += fragheaderlen;
1733 			skb->transport_header = (skb->network_header +
1734 						 fragheaderlen);
1735 			if (fraggap) {
1736 				skb->csum = skb_copy_and_csum_bits(
1737 					skb_prev, maxfraglen,
1738 					data + transhdrlen, fraggap);
1739 				skb_prev->csum = csum_sub(skb_prev->csum,
1740 							  skb->csum);
1741 				data += fraggap;
1742 				pskb_trim_unique(skb_prev, maxfraglen);
1743 			}
1744 			if (copy > 0 &&
1745 			    getfrag(from, data + transhdrlen, offset,
1746 				    copy, fraggap, skb) < 0) {
1747 				err = -EFAULT;
1748 				kfree_skb(skb);
1749 				goto error;
1750 			} else if (flags & MSG_SPLICE_PAGES) {
1751 				copy = 0;
1752 			}
1753 
1754 			offset += copy;
1755 			length -= copy + transhdrlen;
1756 			transhdrlen = 0;
1757 			exthdrlen = 0;
1758 			dst_exthdrlen = 0;
1759 
1760 			/* Only the initial fragment is time stamped */
1761 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1762 			cork->tx_flags = 0;
1763 			skb_shinfo(skb)->tskey = tskey;
1764 			tskey = 0;
1765 			skb_zcopy_set(skb, uarg, &extra_uref);
1766 
1767 			if ((flags & MSG_CONFIRM) && !skb_prev)
1768 				skb_set_dst_pending_confirm(skb, 1);
1769 
1770 			/*
1771 			 * Put the packet on the pending queue
1772 			 */
1773 			if (!skb->destructor) {
1774 				skb->destructor = sock_wfree;
1775 				skb->sk = sk;
1776 				wmem_alloc_delta += skb->truesize;
1777 			}
1778 			__skb_queue_tail(queue, skb);
1779 			continue;
1780 		}
1781 
1782 		if (copy > length)
1783 			copy = length;
1784 
1785 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1786 		    skb_tailroom(skb) >= copy) {
1787 			unsigned int off;
1788 
1789 			off = skb->len;
1790 			if (getfrag(from, skb_put(skb, copy),
1791 						offset, copy, off, skb) < 0) {
1792 				__skb_trim(skb, off);
1793 				err = -EFAULT;
1794 				goto error;
1795 			}
1796 		} else if (flags & MSG_SPLICE_PAGES) {
1797 			struct msghdr *msg = from;
1798 
1799 			err = -EIO;
1800 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1801 				goto error;
1802 
1803 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1804 						   sk->sk_allocation);
1805 			if (err < 0)
1806 				goto error;
1807 			copy = err;
1808 			wmem_alloc_delta += copy;
1809 		} else if (!zc) {
1810 			int i = skb_shinfo(skb)->nr_frags;
1811 
1812 			err = -ENOMEM;
1813 			if (!sk_page_frag_refill(sk, pfrag))
1814 				goto error;
1815 
1816 			skb_zcopy_downgrade_managed(skb);
1817 			if (!skb_can_coalesce(skb, i, pfrag->page,
1818 					      pfrag->offset)) {
1819 				err = -EMSGSIZE;
1820 				if (i == MAX_SKB_FRAGS)
1821 					goto error;
1822 
1823 				__skb_fill_page_desc(skb, i, pfrag->page,
1824 						     pfrag->offset, 0);
1825 				skb_shinfo(skb)->nr_frags = ++i;
1826 				get_page(pfrag->page);
1827 			}
1828 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1829 			if (getfrag(from,
1830 				    page_address(pfrag->page) + pfrag->offset,
1831 				    offset, copy, skb->len, skb) < 0)
1832 				goto error_efault;
1833 
1834 			pfrag->offset += copy;
1835 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1836 			skb->len += copy;
1837 			skb->data_len += copy;
1838 			skb->truesize += copy;
1839 			wmem_alloc_delta += copy;
1840 		} else {
1841 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1842 			if (err < 0)
1843 				goto error;
1844 		}
1845 		offset += copy;
1846 		length -= copy;
1847 	}
1848 
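	/* Charge all memory added above to the socket in a single update. */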
1849 	if (wmem_alloc_delta)
1850 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1851 	return 0;
1852 
1853 error_efault:
1854 	err = -EFAULT;
1855 error:
1856 	net_zcopy_put_abort(uarg, extra_uref);
1857 	cork->length -= length;
1858 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1859 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1860 	return err;
1861 }
1862 
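/*
 *	Append data to the socket's pending (corked) output queue.
 *
 *	On the first call for an idle socket this also sets up the cork
 *	state (dst, options, fragment size) via ip6_setup_cork(); later
 *	calls only add payload.  getfrag() copies the caller's data and is
 *	typically ip_generic_getfrag() for datagram sockets.
 *
 *	A minimal caller, sketched roughly (real protocols such as UDPv6
 *	or raw sockets wrap the push step in their own helper that also
 *	writes the transport header):
 *
 *		lock_sock(sk);
 *		err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
 *				      transhdrlen, &ipc6, &fl6, rt,
 *				      msg->msg_flags);
 *		if (err)
 *			ip6_flush_pending_frames(sk);
 *		else if (!(msg->msg_flags & MSG_MORE))
 *			err = ip6_push_pending_frames(sk);
 *		release_sock(sk);
 */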
1863 int ip6_append_data(struct sock *sk,
1864 		    int getfrag(void *from, char *to, int offset, int len,
1865 				int odd, struct sk_buff *skb),
1866 		    void *from, size_t length, int transhdrlen,
1867 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1868 		    struct rt6_info *rt, unsigned int flags)
1869 {
1870 	struct inet_sock *inet = inet_sk(sk);
1871 	struct ipv6_pinfo *np = inet6_sk(sk);
1872 	int exthdrlen;
1873 	int err;
1874 
1875 	if (flags & MSG_PROBE)
1876 		return 0;
1877 	if (skb_queue_empty(&sk->sk_write_queue)) {
1878 		/*
1879 		 * setup for corking
1880 		 */
1881 		dst_hold(&rt->dst);
1882 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1883 				     ipc6, rt);
1884 		if (err)
1885 			return err;
1886 
1887 		inet->cork.fl.u.ip6 = *fl6;
1888 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1889 		length += exthdrlen;
1890 		transhdrlen += exthdrlen;
1891 	} else {
1892 		transhdrlen = 0;
1893 	}
1894 
1895 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1896 				 &np->cork, sk_page_frag(sk), getfrag,
1897 				 from, length, transhdrlen, flags, ipc6);
1898 }
1899 EXPORT_SYMBOL_GPL(ip6_append_data);
1900 
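/*
 * Move the cork's dst reference over to the finished skb instead of
 * taking an extra reference and releasing the cork's one later.
 */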
1901 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1902 {
1903 	struct dst_entry *dst = cork->base.dst;
1904 
1905 	cork->base.dst = NULL;
1906 	cork->base.flags &= ~IPCORK_ALLFRAG;
1907 	skb_dst_set(skb, dst);
1908 }
1909 
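/*
 * Free the per-cork copy of the extension header options and drop the
 * cork's dst reference, if it still holds one.
 */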
1910 static void ip6_cork_release(struct inet_cork_full *cork,
1911 			     struct inet6_cork *v6_cork)
1912 {
1913 	if (v6_cork->opt) {
1914 		struct ipv6_txoptions *opt = v6_cork->opt;
1915 
1916 		kfree(opt->dst0opt);
1917 		kfree(opt->dst1opt);
1918 		kfree(opt->hopopt);
1919 		kfree(opt->srcrt);
1920 		kfree(opt);
1921 		v6_cork->opt = NULL;
1922 	}
1923 
1924 	if (cork->base.dst) {
1925 		dst_release(cork->base.dst);
1926 		cork->base.dst = NULL;
1927 		cork->base.flags &= ~IPCORK_ALLFRAG;
1928 	}
1929 }
1930 
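/*
 * Collapse the per-cork queue into one skb: later fragments are chained
 * onto the first skb's frag_list, then the extension headers and the
 * IPv6 header are pushed and filled in from the cork and the flow.
 * The result is sent with ip6_send_skb().
 */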
1931 struct sk_buff *__ip6_make_skb(struct sock *sk,
1932 			       struct sk_buff_head *queue,
1933 			       struct inet_cork_full *cork,
1934 			       struct inet6_cork *v6_cork)
1935 {
1936 	struct sk_buff *skb, *tmp_skb;
1937 	struct sk_buff **tail_skb;
1938 	struct in6_addr *final_dst;
1939 	struct net *net = sock_net(sk);
1940 	struct ipv6hdr *hdr;
1941 	struct ipv6_txoptions *opt = v6_cork->opt;
1942 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1943 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1944 	unsigned char proto = fl6->flowi6_proto;
1945 
1946 	skb = __skb_dequeue(queue);
1947 	if (!skb)
1948 		goto out;
1949 	tail_skb = &(skb_shinfo(skb)->frag_list);
1950 
1951 	/* Move skb->data from the ext header to the IP header. */
1952 	if (skb->data < skb_network_header(skb))
1953 		__skb_pull(skb, skb_network_offset(skb));
1954 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1955 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1956 		*tail_skb = tmp_skb;
1957 		tail_skb = &(tmp_skb->next);
1958 		skb->len += tmp_skb->len;
1959 		skb->data_len += tmp_skb->len;
1960 		skb->truesize += tmp_skb->truesize;
1961 		tmp_skb->destructor = NULL;
1962 		tmp_skb->sk = NULL;
1963 	}
1964 
1965 	/* Allow local fragmentation. */
1966 	skb->ignore_df = ip6_sk_ignore_df(sk);
1967 	__skb_pull(skb, skb_network_header_len(skb));
1968 
1969 	final_dst = &fl6->daddr;
1970 	if (opt && opt->opt_flen)
1971 		ipv6_push_frag_opts(skb, opt, &proto);
1972 	if (opt && opt->opt_nflen)
1973 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1974 
1975 	skb_push(skb, sizeof(struct ipv6hdr));
1976 	skb_reset_network_header(skb);
1977 	hdr = ipv6_hdr(skb);
1978 
1979 	ip6_flow_hdr(hdr, v6_cork->tclass,
1980 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1981 					ip6_autoflowlabel(net, sk), fl6));
1982 	hdr->hop_limit = v6_cork->hop_limit;
1983 	hdr->nexthdr = proto;
1984 	hdr->saddr = fl6->saddr;
1985 	hdr->daddr = *final_dst;
1986 
1987 	skb->priority = READ_ONCE(sk->sk_priority);
1988 	skb->mark = cork->base.mark;
1989 	skb->tstamp = cork->base.transmit_time;
1990 
1991 	ip6_cork_steal_dst(skb, cork);
1992 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1993 	if (proto == IPPROTO_ICMPV6) {
1994 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1995 		u8 icmp6_type;
1996 
1997 		if (sk->sk_socket->type == SOCK_RAW &&
1998 		    !inet_test_bit(HDRINCL, sk))
1999 			icmp6_type = fl6->fl6_icmp_type;
2000 		else
2001 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
2002 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
2003 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
2004 	}
2005 
2006 	ip6_cork_release(cork, v6_cork);
2007 out:
2008 	return skb;
2009 }
2010 
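/*
 * Hand a finished packet to ip6_local_out() and map the NET_XMIT_*
 * return codes to errnos, counting remaining failures as OUTDISCARDS.
 */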
2011 int ip6_send_skb(struct sk_buff *skb)
2012 {
2013 	struct net *net = sock_net(skb->sk);
2014 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
2015 	int err;
2016 
2017 	err = ip6_local_out(net, skb->sk, skb);
2018 	if (err) {
2019 		if (err > 0)
2020 			err = net_xmit_errno(err);
2021 		if (err)
2022 			IP6_INC_STATS(net, rt->rt6i_idev,
2023 				      IPSTATS_MIB_OUTDISCARDS);
2024 	}
2025 
2026 	return err;
2027 }
2028 
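/*
 * Build a single packet from everything ip6_append_data() queued on
 * sk->sk_write_queue and transmit it.  Returns 0 if nothing was queued.
 */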
2029 int ip6_push_pending_frames(struct sock *sk)
2030 {
2031 	struct sk_buff *skb;
2032 
2033 	skb = ip6_finish_skb(sk);
2034 	if (!skb)
2035 		return 0;
2036 
2037 	return ip6_send_skb(skb);
2038 }
2039 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2040 
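/*
 * Drop everything still queued for this cork and release the cork
 * state; used on append errors and by ip6_flush_pending_frames().
 */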
2041 static void __ip6_flush_pending_frames(struct sock *sk,
2042 				       struct sk_buff_head *queue,
2043 				       struct inet_cork_full *cork,
2044 				       struct inet6_cork *v6_cork)
2045 {
2046 	struct sk_buff *skb;
2047 
2048 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2049 		if (skb_dst(skb))
2050 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2051 				      IPSTATS_MIB_OUTDISCARDS);
2052 		kfree_skb(skb);
2053 	}
2054 
2055 	ip6_cork_release(cork, v6_cork);
2056 }
2057 
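/* Discard any data pending on sk->sk_write_queue for this socket. */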
2058 void ip6_flush_pending_frames(struct sock *sk)
2059 {
2060 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2061 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2062 }
2063 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2064 
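/*
 * One-shot counterpart of the append/push pair: cork, append and build
 * the skb on a private queue instead of sk->sk_write_queue, so no data
 * is left pending on the socket.  UDPv6 uses this for ordinary
 * (non-corked) sends.
 */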
2065 struct sk_buff *ip6_make_skb(struct sock *sk,
2066 			     int getfrag(void *from, char *to, int offset,
2067 					 int len, int odd, struct sk_buff *skb),
2068 			     void *from, size_t length, int transhdrlen,
2069 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2070 			     unsigned int flags, struct inet_cork_full *cork)
2071 {
2072 	struct inet6_cork v6_cork;
2073 	struct sk_buff_head queue;
2074 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2075 	int err;
2076 
2077 	if (flags & MSG_PROBE) {
2078 		dst_release(&rt->dst);
2079 		return NULL;
2080 	}
2081 
2082 	__skb_queue_head_init(&queue);
2083 
2084 	cork->base.flags = 0;
2085 	cork->base.addr = 0;
2086 	cork->base.opt = NULL;
2087 	v6_cork.opt = NULL;
2088 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2089 	if (err) {
2090 		ip6_cork_release(cork, &v6_cork);
2091 		return ERR_PTR(err);
2092 	}
2093 	if (ipc6->dontfrag < 0)
2094 		ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);
2095 
2096 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2097 				&current->task_frag, getfrag, from,
2098 				length + exthdrlen, transhdrlen + exthdrlen,
2099 				flags, ipc6);
2100 	if (err) {
2101 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2102 		return ERR_PTR(err);
2103 	}
2104 
2105 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2106 }
2107