xref: /linux/net/ipv6/ip6_output.c (revision 9410fb4da2d42a75c0fdbc04c4e74f3a2c42793f)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst_dev_rcu(dst);
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		/* idev stays alive because we hold rcu_read_lock(). */
74 		skb = skb_expand_head(skb, hh_len);
75 		if (!skb) {
76 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
77 			return -ENOMEM;
78 		}
79 	}
80 
81 	hdr = ipv6_hdr(skb);
82 	daddr = &hdr->daddr;
83 	if (unlikely(ipv6_addr_is_multicast(daddr))) {
84 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
85 		    ((mroute6_is_socket(net, skb) &&
86 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
87 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
88 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
89 
90 			/* Do not check for IFF_ALLMULTI; multicast routing
91 			   is not supported in any case.
92 			 */
93 			if (newskb)
94 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
95 					net, sk, newskb, NULL, newskb->dev,
96 					dev_loopback_xmit);
97 
98 			if (hdr->hop_limit == 0) {
99 				IP6_INC_STATS(net, idev,
100 					      IPSTATS_MIB_OUTDISCARDS);
101 				kfree_skb(skb);
102 				return 0;
103 			}
104 		}
105 
106 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
107 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
108 		    !(dev->flags & IFF_LOOPBACK)) {
109 			kfree_skb(skb);
110 			return 0;
111 		}
112 	}
113 
114 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
115 		int res = lwtunnel_xmit(skb);
116 
117 		if (res != LWTUNNEL_XMIT_CONTINUE)
118 			return res;
119 	}
120 
121 	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
122 
123 	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
124 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
125 
126 	if (IS_ERR_OR_NULL(neigh)) {
127 		if (unlikely(!neigh))
128 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
129 		if (IS_ERR(neigh)) {
130 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
131 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
132 			return -EINVAL;
133 		}
134 	}
135 	sock_confirm_neigh(skb, neigh);
136 	ret = neigh_output(neigh, skb, false);
137 	return ret;
138 }
139 
140 static int
141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142 				    struct sk_buff *skb, unsigned int mtu)
143 {
144 	struct sk_buff *segs, *nskb;
145 	netdev_features_t features;
146 	int ret = 0;
147 
148 	/* Please see corresponding comment in ip_finish_output_gso
149 	 * describing the cases where GSO segment length exceeds the
150 	 * egress MTU.
151 	 */
152 	features = netif_skb_features(skb);
153 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154 	if (IS_ERR_OR_NULL(segs)) {
155 		kfree_skb(skb);
156 		return -ENOMEM;
157 	}
158 
159 	consume_skb(skb);
160 
161 	skb_list_walk_safe(segs, segs, nskb) {
162 		int err;
163 
164 		skb_mark_not_on_list(segs);
165 		/* Last GSO segment can be smaller than gso_size (and MTU).
166 		 * Adding a fragment header would produce an "atomic fragment",
167 		 * which is considered harmful (RFC-8021). Avoid that.
168 		 */
169 		err = segs->len > mtu ?
170 			ip6_fragment(net, sk, segs, ip6_finish_output2) :
171 			ip6_finish_output2(net, sk, segs);
172 		if (err && ret == 0)
173 			ret = err;
174 	}
175 
176 	return ret;
177 }
178 
/*
 * Output a GSO packet: send it as is when its segments pass the network
 * length check against @mtu, otherwise fall back to software segmentation.
 */
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	if (likely(skb_gso_validate_network_len(skb, mtu)))
		return ip6_finish_output2(net, sk, skb);

	return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
}
187 
188 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
189 {
190 	unsigned int mtu;
191 
192 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
193 	/* Policy lookup after SNAT yielded a new policy */
194 	if (skb_dst(skb)->xfrm) {
195 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
196 		return dst_output(net, sk, skb);
197 	}
198 #endif
199 
200 	mtu = ip6_skb_dst_mtu(skb);
201 	if (skb_is_gso(skb))
202 		return ip6_finish_output_gso(net, sk, skb, mtu);
203 
204 	if (unlikely(skb->len > mtu ||
205 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)))
206 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
207 
208 	return ip6_finish_output2(net, sk, skb);
209 }
210 
211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
212 {
213 	int ret;
214 
215 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
216 	switch (ret) {
217 	case NET_XMIT_SUCCESS:
218 	case NET_XMIT_CN:
219 		return __ip6_finish_output(net, sk, skb) ? : ret;
220 	default:
221 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
222 		return ret;
223 	}
224 }
225 
226 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
227 {
228 	struct dst_entry *dst = skb_dst(skb);
229 	struct net_device *dev, *indev = skb->dev;
230 	struct inet6_dev *idev;
231 	int ret;
232 
233 	skb->protocol = htons(ETH_P_IPV6);
234 	rcu_read_lock();
235 	dev = dst_dev_rcu(dst);
236 	idev = ip6_dst_idev(dst);
237 	skb->dev = dev;
238 
239 	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
240 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
241 		rcu_read_unlock();
242 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
243 		return 0;
244 	}
245 
246 	ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
247 			   net, sk, skb, indev, dev,
248 			   ip6_finish_output,
249 			   !(IP6CB(skb)->flags & IP6SKB_REROUTED));
250 	rcu_read_unlock();
251 	return ret;
252 }
253 EXPORT_SYMBOL(ip6_output);
254 
255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
256 {
257 	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
258 		return ip6_default_np_autolabel(net);
259 	return inet6_test_bit(AUTOFLOWLABEL, sk);
260 }
261 
262 int ip6_dst_hoplimit(struct dst_entry *dst)
263 {
264 	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
265 
266 	rcu_read_lock();
267 	if (hoplimit == 0) {
268 		struct net_device *dev = dst_dev_rcu(dst);
269 		struct inet6_dev *idev;
270 
271 		idev = __in6_dev_get(dev);
272 		if (idev)
273 			hoplimit = READ_ONCE(idev->cnf.hop_limit);
274 		else
275 			hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit);
276 	}
277 	rcu_read_unlock();
278 
279 	return hoplimit;
280 }
281 EXPORT_SYMBOL(ip6_dst_hoplimit);
282 
283 /*
284  * xmit an sk_buff (used by TCP and SCTP)
285  * Note : socket lock is not held for SYNACK packets, but might be modified
286  * by calls to skb_set_owner_w() and ipv6_local_error(),
287  * which are using proper atomic operations or spinlocks.
288  */
289 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
290 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
291 {
292 	const struct ipv6_pinfo *np = inet6_sk(sk);
293 	struct in6_addr *first_hop = &fl6->daddr;
294 	struct dst_entry *dst = skb_dst(skb);
295 	struct inet6_dev *idev = ip6_dst_idev(dst);
296 	struct net *net = sock_net(sk);
297 	unsigned int head_room;
298 	struct net_device *dev;
299 	struct ipv6hdr *hdr;
300 	u8  proto = fl6->flowi6_proto;
301 	int seg_len = skb->len;
302 	int ret, hlimit = -1;
303 	u32 mtu;
304 
305 	rcu_read_lock();
306 
307 	dev = dst_dev_rcu(dst);
308 	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
309 	if (opt)
310 		head_room += opt->opt_nflen + opt->opt_flen;
311 
312 	if (unlikely(head_room > skb_headroom(skb))) {
313 		/* idev stays alive while we hold rcu_read_lock(). */
314 		skb = skb_expand_head(skb, head_room);
315 		if (!skb) {
316 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
317 			ret = -ENOBUFS;
318 			goto unlock;
319 		}
320 	}
321 
322 	if (unlikely(opt)) {
323 		seg_len += opt->opt_nflen + opt->opt_flen;
324 
325 		if (opt->opt_flen)
326 			proto = ipv6_push_frag_opts(skb, opt, proto);
327 
328 		if (opt->opt_nflen)
329 			proto = ipv6_push_nfrag_opts(skb, opt, proto,
330 						     &first_hop,
331 						     &fl6->saddr);
332 	}
333 
334 	if (unlikely(seg_len > IPV6_MAXPLEN))
335 		seg_len = 0;
336 
337 	__skb_push(skb, sizeof(struct ipv6hdr));
338 	skb_reset_network_header(skb);
339 	hdr = ipv6_hdr(skb);
340 
341 	/*
342 	 *	Fill in the IPv6 header
343 	 */
344 	if (np)
345 		hlimit = READ_ONCE(np->hop_limit);
346 	if (hlimit < 0)
347 		hlimit = ip6_dst_hoplimit(dst);
348 
349 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
350 				ip6_autoflowlabel(net, sk), fl6));
351 
352 	hdr->payload_len = htons(seg_len);
353 	hdr->nexthdr = proto;
354 	hdr->hop_limit = hlimit;
355 
356 	hdr->saddr = fl6->saddr;
357 	hdr->daddr = *first_hop;
358 
359 	skb->protocol = htons(ETH_P_IPV6);
360 	skb->priority = priority;
361 	skb->mark = mark;
362 
363 	mtu = dst6_mtu(dst);
364 	if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) {
365 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
366 
367 		/* if egress device is enslaved to an L3 master device pass the
368 		 * skb to its handler for processing
369 		 */
370 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
371 		if (unlikely(!skb)) {
372 			ret = 0;
373 			goto unlock;
374 		}
375 
376 		/* hooks should never assume socket lock is held.
377 		 * we promote our socket to non const
378 		 */
379 		ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
380 			      net, (struct sock *)sk, skb, NULL, dev,
381 			      dst_output);
382 		goto unlock;
383 	}
384 
385 	ret = -EMSGSIZE;
386 	skb->dev = dev;
387 	/* ipv6_local_error() does not require socket lock,
388 	 * we promote our socket to non const
389 	 */
390 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
391 
392 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
393 	kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
394 unlock:
395 	rcu_read_unlock();
396 	return ret;
397 }
398 EXPORT_SYMBOL(ip6_xmit);
399 
400 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
401 {
402 	struct ip6_ra_chain *ra;
403 	struct sock *last = NULL;
404 
405 	read_lock(&ip6_ra_lock);
406 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
407 		struct sock *sk = ra->sk;
408 		if (sk && ra->sel == sel &&
409 		    (!sk->sk_bound_dev_if ||
410 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
411 
412 			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
413 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
414 				continue;
415 			}
416 			if (last) {
417 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
418 				if (skb2)
419 					rawv6_rcv(last, skb2);
420 			}
421 			last = sk;
422 		}
423 	}
424 
425 	if (last) {
426 		rawv6_rcv(last, skb);
427 		read_unlock(&ip6_ra_lock);
428 		return 1;
429 	}
430 	read_unlock(&ip6_ra_lock);
431 	return 0;
432 }
433 
434 static int ip6_forward_proxy_check(struct sk_buff *skb)
435 {
436 	struct ipv6hdr *hdr = ipv6_hdr(skb);
437 	u8 nexthdr = hdr->nexthdr;
438 	__be16 frag_off;
439 	int offset;
440 
441 	if (ipv6_ext_hdr(nexthdr)) {
442 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
443 		if (offset < 0)
444 			return 0;
445 	} else
446 		offset = sizeof(struct ipv6hdr);
447 
448 	if (nexthdr == IPPROTO_ICMPV6) {
449 		struct icmp6hdr *icmp6;
450 
451 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
452 					 offset + 1 - skb->data)))
453 			return 0;
454 
455 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
456 
457 		switch (icmp6->icmp6_type) {
458 		case NDISC_ROUTER_SOLICITATION:
459 		case NDISC_ROUTER_ADVERTISEMENT:
460 		case NDISC_NEIGHBOUR_SOLICITATION:
461 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
462 		case NDISC_REDIRECT:
463 			/* For reaction involving unicast neighbor discovery
464 			 * message destined to the proxied address, pass it to
465 			 * input function.
466 			 */
467 			return 1;
468 		default:
469 			break;
470 		}
471 		hdr = ipv6_hdr(skb);
472 	}
473 
474 	/*
475 	 * The proxying router can't forward traffic sent to a link-local
476 	 * address, so signal the sender and discard the packet. This
477 	 * behavior is clarified by the MIPv6 specification.
478 	 */
479 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
480 		dst_link_failure(skb);
481 		return -1;
482 	}
483 
484 	return 0;
485 }
486 
/*
 * Completion callback of the FORWARD netfilter hook: clear the skb
 * timestamp and pass the packet to dst_output(). With CONFIG_NET_SWITCHDEV,
 * a packet carrying offload_l3_fwd_mark is consumed here instead and 0 is
 * returned.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);

	return dst_output(net, sk, skb);
}
500 
501 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
502 {
503 	if (skb->len <= mtu)
504 		return false;
505 
506 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
507 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
508 		return true;
509 
510 	if (skb->ignore_df)
511 		return false;
512 
513 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
514 		return false;
515 
516 	return true;
517 }
518 
/*
 * ip6_forward - forward a received IPv6 packet towards its next hop
 * @skb: packet to forward
 *
 * On every return path visible here @skb has either been freed or handed
 * on (to a router-alert raw socket, to ip6_input(), or to the
 * NF_INET_FORWARD hook), so the caller must not touch it afterwards.
 *
 * Returns 0 when a router-alert socket took the packet, -ETIMEDOUT when
 * the hop limit is exhausted, the ip6_input() result for proxied
 * neighbour-discovery messages, -EMSGSIZE when the packet does not fit the
 * outgoing MTU, -EINVAL for every other drop, and otherwise the result of
 * the NF_INET_FORWARD hook.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst_dev(dst));
	struct net_device *dev;
	struct inet6_dev *idev;
	/* Declares the local 'reason' that is passed to kfree_skb_reason()
	 * at the drop label.
	 */
	SKB_DR(reason);
	u32 mtu;

	/* idev is looked up from the interface index recorded in the skb
	 * control block (iif); it may be NULL, hence the !idev checks below.
	 */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	/* Forwarding must be enabled globally or forced on that interface. */
	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* A forwarded packet is not expected to have a socket attached. */
	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	/* Run the xfrm forward policy check unless policy checking is
	 * disabled globally or on the interface.
	 */
	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
		int proxied = ip6_forward_proxy_check(skb);

		/* The check may have pulled data; reload the header pointer. */
		hdr = ipv6_hdr(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	/* Re-read the route after xfrm6_route_forward(); dev is the
	 * outgoing device from here on.
	 */
	dst = skb_dst(skb);
	dev = dst_dev(dst);
	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		/* Redirect target: the gateway for a gateway route,
		 * otherwise the destination itself.
		 */
		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		rcu_read_unlock();
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	/* Never work with an MTU below the IPv6 minimum. */
	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (unlikely(ip6_pkt_too_big(skb, mtu))) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	/* Get a private, writable header with room for the link header
	 * before the hop limit is modified.
	 */
	if (skb_cow(skb, dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow() may have reallocated the header; reload the pointer. */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dev,
		       ip6_forward_finish);

	/* error: count an address error and record it as the drop reason,
	 * then fall through to the common drop path.
	 */
error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}
701 
702 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
703 {
704 	to->pkt_type = from->pkt_type;
705 	to->priority = from->priority;
706 	to->protocol = from->protocol;
707 	skb_dst_drop(to);
708 	skb_dst_set(to, dst_clone(skb_dst(from)));
709 	to->dev = from->dev;
710 	to->mark = from->mark;
711 
712 	skb_copy_hash(to, from);
713 
714 #ifdef CONFIG_NET_SCHED
715 	to->tc_index = from->tc_index;
716 #endif
717 	nf_copy(to, from);
718 	skb_ext_copy(to, from);
719 	skb_copy_secmark(to, from);
720 }
721 
722 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
723 		      u8 nexthdr, __be32 frag_id,
724 		      struct ip6_fraglist_iter *iter)
725 {
726 	unsigned int first_len;
727 	struct frag_hdr *fh;
728 
729 	/* BUILD HEADER */
730 	*prevhdr = NEXTHDR_FRAGMENT;
731 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
732 	if (!iter->tmp_hdr)
733 		return -ENOMEM;
734 
735 	iter->frag = skb_shinfo(skb)->frag_list;
736 	skb_frag_list_init(skb);
737 
738 	iter->offset = 0;
739 	iter->hlen = hlen;
740 	iter->frag_id = frag_id;
741 	iter->nexthdr = nexthdr;
742 
743 	__skb_pull(skb, hlen);
744 	fh = __skb_push(skb, sizeof(struct frag_hdr));
745 	__skb_push(skb, hlen);
746 	skb_reset_network_header(skb);
747 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
748 
749 	fh->nexthdr = nexthdr;
750 	fh->reserved = 0;
751 	fh->frag_off = htons(IP6_MF);
752 	fh->identification = frag_id;
753 
754 	first_len = skb_pagelen(skb);
755 	skb->data_len = first_len - skb_headlen(skb);
756 	skb->len = first_len;
757 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
758 
759 	return 0;
760 }
761 EXPORT_SYMBOL(ip6_fraglist_init);
762 
763 void ip6_fraglist_prepare(struct sk_buff *skb,
764 			  struct ip6_fraglist_iter *iter)
765 {
766 	struct sk_buff *frag = iter->frag;
767 	unsigned int hlen = iter->hlen;
768 	struct frag_hdr *fh;
769 
770 	frag->ip_summed = CHECKSUM_NONE;
771 	skb_reset_transport_header(frag);
772 	fh = __skb_push(frag, sizeof(struct frag_hdr));
773 	__skb_push(frag, hlen);
774 	skb_reset_network_header(frag);
775 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
776 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
777 	fh->nexthdr = iter->nexthdr;
778 	fh->reserved = 0;
779 	fh->frag_off = htons(iter->offset);
780 	if (frag->next)
781 		fh->frag_off |= htons(IP6_MF);
782 	fh->identification = iter->frag_id;
783 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
784 	ip6_copy_metadata(frag, skb);
785 }
786 EXPORT_SYMBOL(ip6_fraglist_prepare);
787 
788 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
789 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
790 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
791 {
792 	state->prevhdr = prevhdr;
793 	state->nexthdr = nexthdr;
794 	state->frag_id = frag_id;
795 
796 	state->hlen = hlen;
797 	state->mtu = mtu;
798 
799 	state->left = skb->len - hlen;	/* Space per frame */
800 	state->ptr = hlen;		/* Where to start from */
801 
802 	state->hroom = hdr_room;
803 	state->troom = needed_tailroom;
804 
805 	state->offset = 0;
806 }
807 EXPORT_SYMBOL(ip6_frag_init);
808 
809 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
810 {
811 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
812 	struct sk_buff *frag;
813 	struct frag_hdr *fh;
814 	unsigned int len;
815 
816 	len = state->left;
817 	/* IF: it doesn't fit, use 'mtu' - the data space left */
818 	if (len > state->mtu)
819 		len = state->mtu;
820 	/* IF: we are not sending up to and including the packet end
821 	   then align the next start on an eight byte boundary */
822 	if (len < state->left)
823 		len &= ~7;
824 
825 	/* Allocate buffer */
826 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
827 			 state->hroom + state->troom, GFP_ATOMIC);
828 	if (!frag)
829 		return ERR_PTR(-ENOMEM);
830 
831 	/*
832 	 *	Set up data on packet
833 	 */
834 
835 	ip6_copy_metadata(frag, skb);
836 	skb_reserve(frag, state->hroom);
837 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
838 	skb_reset_network_header(frag);
839 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
840 	frag->transport_header = (frag->network_header + state->hlen +
841 				  sizeof(struct frag_hdr));
842 
843 	/*
844 	 *	Charge the memory for the fragment to any owner
845 	 *	it might possess
846 	 */
847 	if (skb->sk)
848 		skb_set_owner_w(frag, skb->sk);
849 
850 	/*
851 	 *	Copy the packet header into the new buffer.
852 	 */
853 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
854 
855 	fragnexthdr_offset = skb_network_header(frag);
856 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
857 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
858 
859 	/*
860 	 *	Build fragment header.
861 	 */
862 	fh->nexthdr = state->nexthdr;
863 	fh->reserved = 0;
864 	fh->identification = state->frag_id;
865 
866 	/*
867 	 *	Copy a block of the IP datagram.
868 	 */
869 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
870 			     len));
871 	state->left -= len;
872 
873 	fh->frag_off = htons(state->offset);
874 	if (state->left > 0)
875 		fh->frag_off |= htons(IP6_MF);
876 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
877 
878 	state->ptr += len;
879 	state->offset += len;
880 
881 	return frag;
882 }
883 EXPORT_SYMBOL(ip6_frag_next);
884 
/*
 * ip6_fragment - split an IPv6 packet into fragments and send each one
 * @net:    network namespace, passed through to @output and the counters
 * @sk:     socket passed through to @output
 * @skb:    packet to fragment
 * @output: called once per fragment to transmit it
 *
 * Two strategies are used. If @skb already carries a frag list whose
 * members have a suitable geometry (sizes, alignment, headroom, not
 * shared/cloned), the fast path turns each member into a fragment in
 * place. Otherwise the slow path copies the payload into newly allocated
 * fragments, one ip6_frag_next() call at a time.
 *
 * Returns 0 on success, -EAFNOSUPPORT if ipv6_mod_enabled() is false,
 * -EMSGSIZE (after sending ICMPV6_PKT_TOOBIG) if fragmentation is not
 * allowed or impossible for the MTU, or the first error reported by a
 * helper or by @output.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	/* Socket settings are only consulted outside device recursion. */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	if (!ipv6_mod_enabled()) {
		kfree_skb(skb);
		return -EAFNOSUPPORT;
	}

	/* hlen is the length of the header part repeated in every fragment;
	 * prevhdr points at the next-header byte that will be rewritten to
	 * NEXTHDR_FRAGMENT.
	 */
	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	/* Kept as an offset so prevhdr can be recomputed further down. */
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	/* A smaller per-socket fragment size overrides the MTU. */
	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	/* Each fragment needs room for the headers plus at least 8 payload
	 * bytes; from here on mtu is the payload budget per fragment.
	 */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	/* Resolve a pending partial checksum before the packet is split. */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	/* Recompute prevhdr from the saved offset. */
	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		/* The head must itself be a valid first fragment. */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			/* Move the accounting for this member from the head
			 * skb to the member itself; undone at
			 * slow_path_clean if a later member is unsuitable.
			 */
			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		/* Output failed: free the fragments that were never sent. */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

		/* Undo the ownership/truesize changes made to the members
		 * visited before the unsuitable one (frag), then fall
		 * through to the slow path.
		 */
slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	/* All payload was copied into fragments; release the original. */
	consume_skb(skb);
	return err;

	/* fail_toobig: report Packet Too Big, then share the common failure
	 * path, which counts the failure and frees @skb.
	 */
fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_fragment);
1078 
1079 static inline int ip6_rt_check(const struct rt6key *rt_key,
1080 			       const struct in6_addr *fl_addr,
1081 			       const struct in6_addr *addr_cache)
1082 {
1083 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1084 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1085 }
1086 
1087 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1088 					  struct dst_entry *dst,
1089 					  const struct flowi6 *fl6)
1090 {
1091 	struct ipv6_pinfo *np = inet6_sk(sk);
1092 	struct rt6_info *rt;
1093 
1094 	if (!dst)
1095 		goto out;
1096 
1097 	if (dst->ops->family != AF_INET6) {
1098 		dst_release(dst);
1099 		return NULL;
1100 	}
1101 
1102 	rt = dst_rt6_info(dst);
1103 	/* Yes, checking route validity in not connected
1104 	 * case is not very simple. Take into account,
1105 	 * that we do not support routing by source, TOS,
1106 	 * and MSG_DONTROUTE		--ANK (980726)
1107 	 *
1108 	 * 1. ip6_rt_check(): If route was host route,
1109 	 *    check that cached destination is current.
1110 	 *    If it is network route, we still may
1111 	 *    check its validity using saved pointer
1112 	 *    to the last used address: daddr_cache.
1113 	 *    We do not want to save whole address now,
1114 	 *    (because main consumer of this service
1115 	 *    is tcp, which has not this problem),
1116 	 *    so that the last trick works only on connected
1117 	 *    sockets.
1118 	 * 2. oif also should be the same.
1119 	 */
1120 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
1121 			 np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
1122 #ifdef CONFIG_IPV6_SUBTREES
1123 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
1124 			 np->saddr_cache ? &np->saddr : NULL) ||
1125 #endif
1126 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
1127 		dst_release(dst);
1128 		dst = NULL;
1129 	}
1130 
1131 out:
1132 	return dst;
1133 }
1134 
/*
 * Shared back end of the route lookup helpers in this file.
 *
 * Resolves @fl6 to a route and stores it in *@dst.  When fl6->saddr is
 * unspecified, a first lookup is done and a source address is selected
 * into fl6->saddr.  If *@dst is NULL after that step (or was passed in as
 * NULL with a source address already set), ip6_route_output_flags()
 * performs the lookup; a non-NULL *@dst passed in by the caller together
 * with a set source address is used as is.
 *
 * Returns 0 with *@dst set on success.  On failure the dst is released,
 * *@dst is set to NULL and a negative errno is returned; -ENETUNREACH
 * additionally bumps IPSTATS_MIB_OUTNOROUTES.
 */
1135 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1136 			       struct dst_entry **dst, struct flowi6 *fl6)
1137 {
1138 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1139 	struct neighbour *n;
1140 	struct rt6_info *rt;
1141 #endif
1142 	int err;
1143 	int flags = 0;
1144 
1145 	/* The correct way to handle this would be to do
1146 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1147 	 * the route-specific preferred source forces the
1148 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1149 	 *
1150 	 * In source specific routing (no src=any default route),
1151 	 * ip6_route_output will fail given src=any saddr, though, so
1152 	 * that's why we try it again later.
1153 	 */
1154 	if (ipv6_addr_any(&fl6->saddr)) {
1155 		struct fib6_info *from;
1156 		struct rt6_info *rt;
1157 
1158 		*dst = ip6_route_output(net, sk, fl6);
		/* An error dst yields rt == NULL, so source selection
		 * below runs with from == NULL.
		 */
1159 		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);
1160 
		/* rt->from is fetched with rcu_dereference() and used
		 * inside this RCU read-side section only.
		 */
1161 		rcu_read_lock();
1162 		from = rt ? rcu_dereference(rt->from) : NULL;
1163 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1164 					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
1165 					  fl6->flowi6_l3mdev,
1166 					  &fl6->saddr);
1167 		rcu_read_unlock();
1168 
1169 		if (err)
1170 			goto out_err_release;
1171 
1172 		/* If we had an erroneous initial result, pretend it
1173 		 * never existed and let the SA-enabled version take
1174 		 * over.
1175 		 */
1176 		if ((*dst)->error) {
1177 			dst_release(*dst);
1178 			*dst = NULL;
1179 		}
1180 
1181 		if (fl6->flowi6_oif)
1182 			flags |= RT6_LOOKUP_F_IFACE;
1183 	}
1184 
	/* Look up now that fl6->saddr is filled in, unless a usable dst
	 * is already present.
	 */
1185 	if (!*dst)
1186 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1187 
1188 	err = (*dst)->error;
1189 	if (err)
1190 		goto out_err_release;
1191 
1192 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1193 	/*
1194 	 * Here if the dst entry we've looked up
1195 	 * has a neighbour entry that is in the INCOMPLETE
1196 	 * state and the src address from the flow is
1197 	 * marked as OPTIMISTIC, we release the found
1198 	 * dst entry and replace it instead with the
1199 	 * dst entry of the nexthop router
1200 	 */
1201 	rt = dst_rt6_info(*dst);
1202 	rcu_read_lock();
1203 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1204 				      rt6_nexthop(rt, &fl6->daddr));
	/* err is only a flag here: non-zero when a neighbour entry exists
	 * for the nexthop but is not in a NUD_VALID state.  The success
	 * path below returns a literal 0, so this value is never handed
	 * to the caller.
	 */
1205 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1206 	rcu_read_unlock();
1207 
1208 	if (err) {
1209 		struct inet6_ifaddr *ifp;
1210 		struct flowi6 fl_gw6;
1211 		int redirect;
1212 
1213 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1214 				      (*dst)->dev, 1);
1215 
		/* The flag is sampled before the ifaddr reference is
		 * dropped again.
		 */
1216 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1217 		if (ifp)
1218 			in6_ifa_put(ifp);
1219 
1220 		if (redirect) {
1221 			/*
1222 			 * We need to get the dst entry for the
1223 			 * default router instead
1224 			 */
1225 			dst_release(*dst);
			/* Redo the lookup on a copy of the flow whose
			 * destination address is zeroed.
			 */
1226 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1227 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1228 			*dst = ip6_route_output(net, sk, &fl_gw6);
1229 			err = (*dst)->error;
1230 			if (err)
1231 				goto out_err_release;
1232 		}
1233 	}
1234 #endif
	/* A v4-mapped source is accepted only together with a v4-mapped
	 * or unspecified destination.
	 */
1235 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1236 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1237 		err = -EAFNOSUPPORT;
1238 		goto out_err_release;
1239 	}
1240 
1241 	return 0;
1242 
1243 out_err_release:
	/* Drop the reference and clear *dst so the caller never sees a
	 * released entry.
	 */
1244 	dst_release(*dst);
1245 	*dst = NULL;
1246 
1247 	if (err == -ENETUNREACH)
1248 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1249 	return err;
1250 }
1251 
1252 /**
1253  *	ip6_dst_lookup - perform route lookup on flow
1254  *	@net: Network namespace to perform lookup in
1255  *	@sk: socket which provides route info
1256  *	@dst: pointer to dst_entry * for result
1257  *	@fl6: flow to lookup
1258  *
1259  *	This function performs a route lookup on the given flow.
1260  *
1261  *	It returns zero on success, or a standard errno code on error.
1262  */
1263 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1264 		   struct flowi6 *fl6)
1265 {
1266 	*dst = NULL;
1267 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1268 }
1269 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1270 
1271 /**
1272  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1273  *	@net: Network namespace to perform lookup in
1274  *	@sk: socket which provides route info
1275  *	@fl6: flow to lookup
1276  *	@final_dst: final destination address for ipsec lookup
1277  *
1278  *	This function performs a route lookup on the given flow.
1279  *
1280  *	It returns a valid dst pointer on success, or a pointer encoded
1281  *	error code.
1282  */
1283 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1284 				      const struct in6_addr *final_dst)
1285 {
1286 	struct dst_entry *dst = NULL;
1287 	int err;
1288 
1289 	if (!ipv6_mod_enabled())
1290 		return ERR_PTR(-EAFNOSUPPORT);
1291 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1292 	if (err)
1293 		return ERR_PTR(err);
1294 	if (final_dst)
1295 		fl6->daddr = *final_dst;
1296 
1297 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1298 }
1299 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1300 
1301 /**
1302  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1303  *	@sk: socket which provides the dst cache and route info
1304  *	@fl6: flow to lookup
1305  *	@final_dst: final destination address for ipsec lookup
1306  *	@connected: whether @sk is connected or not
1307  *
1308  *	This function performs a route lookup on the given flow with the
1309  *	possibility of using the cached route in the socket if it is valid.
1310  *	It will take the socket dst lock when operating on the dst cache.
1311  *	As a result, this function can only be used in process context.
1312  *
1313  *	In addition, for a connected socket, cache the dst in the socket
1314  *	if the current cache is not valid.
1315  *
1316  *	It returns a valid dst pointer on success, or a pointer encoded
1317  *	error code.
1318  */
1319 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1320 					 const struct in6_addr *final_dst,
1321 					 bool connected)
1322 {
1323 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1324 
1325 	dst = ip6_sk_dst_check(sk, dst, fl6);
1326 	if (dst)
1327 		return dst;
1328 
1329 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1330 	if (connected && !IS_ERR(dst))
1331 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1332 
1333 	return dst;
1334 }
1335 
1336 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1337 					       gfp_t gfp)
1338 {
1339 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1340 }
1341 
1342 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1343 						gfp_t gfp)
1344 {
1345 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1346 }
1347 
1348 static void ip6_append_data_mtu(unsigned int *mtu,
1349 				int *maxfraglen,
1350 				unsigned int fragheaderlen,
1351 				struct sk_buff *skb,
1352 				struct rt6_info *rt,
1353 				unsigned int orig_mtu)
1354 {
1355 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1356 		if (!skb) {
1357 			/* first fragment, reserve header_len */
1358 			*mtu = orig_mtu - rt->dst.header_len;
1359 
1360 		} else {
1361 			/*
1362 			 * this fragment is not first, the headers
1363 			 * space is regarded as data space.
1364 			 */
1365 			*mtu = orig_mtu;
1366 		}
1367 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1368 			      + fragheaderlen - sizeof(struct frag_hdr);
1369 	}
1370 }
1371 
1372 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1373 			  struct ipcm6_cookie *ipc6,
1374 			  struct rt6_info *rt)
1375 {
1376 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1377 	struct inet6_cork *v6_cork = &cork->base6;
1378 	struct ipv6_pinfo *np = inet6_sk(sk);
1379 	unsigned int mtu, frag_size;
1380 
1381 	/* callers pass dst together with a reference, set it first so
1382 	 * ip6_cork_release() can put it down even in case of an error.
1383 	 */
1384 	cork->base.dst = &rt->dst;
1385 
1386 	/*
1387 	 * setup for corking
1388 	 */
1389 	if (unlikely(opt)) {
1390 		if (WARN_ON(v6_cork->opt))
1391 			return -EINVAL;
1392 
1393 		nopt = v6_cork->opt = kzalloc_obj(*opt, sk->sk_allocation);
1394 		if (unlikely(!nopt))
1395 			return -ENOBUFS;
1396 
1397 		nopt->tot_len = sizeof(*opt);
1398 		nopt->opt_flen = opt->opt_flen;
1399 		nopt->opt_nflen = opt->opt_nflen;
1400 
1401 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1402 		if (opt->dst0opt && !nopt->dst0opt)
1403 			return -ENOBUFS;
1404 
1405 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1406 		if (opt->dst1opt && !nopt->dst1opt)
1407 			return -ENOBUFS;
1408 
1409 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1410 		if (opt->hopopt && !nopt->hopopt)
1411 			return -ENOBUFS;
1412 
1413 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1414 		if (opt->srcrt && !nopt->srcrt)
1415 			return -ENOBUFS;
1416 
1417 		/* need source address above miyazawa*/
1418 	}
1419 	v6_cork->hop_limit = ipc6->hlimit;
1420 	v6_cork->tclass = ipc6->tclass;
1421 	v6_cork->dontfrag = ipc6->dontfrag;
1422 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1423 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1424 		      READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst);
1425 	else
1426 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1427 			READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst));
1428 
1429 	frag_size = READ_ONCE(np->frag_size);
1430 	if (frag_size && frag_size < mtu)
1431 		mtu = frag_size;
1432 
1433 	cork->base.fragsize = mtu;
1434 	cork->base.gso_size = ipc6->gso_size;
1435 	cork->base.tx_flags = 0;
1436 	cork->base.mark = ipc6->sockc.mark;
1437 	cork->base.priority = ipc6->sockc.priority;
1438 	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
1439 	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
1440 		cork->base.flags |= IPCORK_TS_OPT_ID;
1441 		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
1442 	}
1443 	cork->base.length = 0;
1444 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1445 
1446 	return 0;
1447 }
1448 
/*
 * Append @length bytes to the datagram being assembled on @queue.
 *
 * @sk:		socket the data is sent on
 * @queue:	skb queue holding the datagram; data goes into its tail skb
 *		while there is room, and newly allocated skbs are added at
 *		the tail
 * @cork_full:	cork state: route (base.dst), tx options (base6.opt), flow
 *		(fl.u.ip6) and the running datagram length (base.length)
 * @pfrag:	page fragment used for paged copies
 * @getfrag:	callback that copies @len bytes found at @offset in @from
 *		to @to; a negative return is reported as -EFAULT
 * @from:	opaque source handed to @getfrag; read as a struct msghdr
 *		in the MSG_ZEROCOPY and MSG_SPLICE_PAGES paths
 * @length:	bytes to append, including @transhdrlen
 * @transhdrlen: header bytes at the front of the first skb that @getfrag
 *		does not fill in; reset to 0 once the first skb is built
 * @flags:	MSG_* flags
 *
 * Returns 0 on success or a negative errno.  On failure cork->length is
 * reduced by the part of @length that was not consumed and
 * IPSTATS_MIB_OUTDISCARDS is incremented.
 */
1449 static int __ip6_append_data(struct sock *sk,
1450 			     struct sk_buff_head *queue,
1451 			     struct inet_cork_full *cork_full,
1452 			     struct page_frag *pfrag,
1453 			     int getfrag(void *from, char *to, int offset,
1454 					 int len, int odd, struct sk_buff *skb),
1455 			     void *from, size_t length, int transhdrlen,
1456 			     unsigned int flags)
1457 {
1458 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1459 	struct inet6_cork *v6_cork = &cork_full->base6;
1460 	struct inet_cork *cork = &cork_full->base;
1461 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1462 	struct sk_buff *skb, *skb_prev = NULL;
1463 	struct ubuf_info *uarg = NULL;
1464 	int exthdrlen = 0;
1465 	int dst_exthdrlen = 0;
1466 	int hh_len;
1467 	int copy;
1468 	int err;
1469 	int offset = 0;
1470 	bool zc = false;
1471 	u32 tskey = 0;
1472 	struct rt6_info *rt = dst_rt6_info(cork->dst);
1473 	bool paged, hold_tskey = false, extra_uref = false;
1474 	struct ipv6_txoptions *opt = v6_cork->opt;
1475 	int csummode = CHECKSUM_NONE;
1476 	unsigned int maxnonfragsize, headersize;
1477 	unsigned int wmem_alloc_delta = 0;
1478 
	/* An empty queue means this call starts the datagram: only then
	 * are the fragmentable extension headers and the dst's extra
	 * header space taken into account.
	 */
1479 	skb = skb_peek_tail(queue);
1480 	if (!skb) {
1481 		exthdrlen = opt ? opt->opt_flen : 0;
1482 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1483 	}
1484 
	/* A non-zero gso_size selects paged allocation and replaces the
	 * cork's fragment size with IP6_MAX_MTU as the size limit.
	 */
1485 	paged = !!cork->gso_size;
1486 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1487 	orig_mtu = mtu;
1488 
1489 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1490 
	/* fragheaderlen: IPv6 header plus the non-fragmentable headers;
	 * this much is laid out at the front of every skb built below.
	 * headersize additionally counts the fragmentable options.
	 */
1491 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1492 			(opt ? opt->opt_nflen : 0);
1493 
1494 	headersize = sizeof(struct ipv6hdr) +
1495 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1496 		     rt->rt6i_nfheader_len;
1497 
	/* Bail out before the unsigned maxfraglen computation below could
	 * wrap.
	 */
1498 	if (mtu <= fragheaderlen ||
1499 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1500 		goto emsgsize;
1501 
1502 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1503 		     sizeof(struct frag_hdr);
1504 
1505 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1506 	 * the first fragment
1507 	 */
1508 	if (headersize + transhdrlen > mtu)
1509 		goto emsgsize;
1510 
	/* With dontfrag set, UDP, ICMPv6 and raw sockets get a path MTU
	 * notification and -EMSGSIZE rather than a fragmented datagram.
	 */
1511 	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
1512 	    (sk->sk_protocol == IPPROTO_UDP ||
1513 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1514 	     sk->sk_protocol == IPPROTO_RAW)) {
1515 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1516 				sizeof(struct ipv6hdr));
1517 		goto emsgsize;
1518 	}
1519 
1520 	if (ip6_sk_ignore_df(sk))
1521 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1522 	else
1523 		maxnonfragsize = mtu;
1524 
1525 	if (cork->length + length > maxnonfragsize - headersize) {
1526 emsgsize:
		/* The size passed to ipv6_local_error() is clamped at 0. */
1527 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1528 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1529 		return -EMSGSIZE;
1530 	}
1531 
1532 	/* CHECKSUM_PARTIAL only with no extension headers and when
1533 	 * we are not going to fragment
1534 	 */
1535 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1536 	    headersize == sizeof(struct ipv6hdr) &&
1537 	    length <= mtu - headersize &&
1538 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1539 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1540 		csummode = CHECKSUM_PARTIAL;
1541 
1542 	if ((flags & MSG_ZEROCOPY) && length) {
		/* In this branch @from is read as a struct msghdr. */
1543 		struct msghdr *msg = from;
1544 
1545 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1546 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1547 				return -EINVAL;
1548 
1549 			/* Leave uarg NULL if can't zerocopy, callers should
1550 			 * be able to handle it.
1551 			 */
1552 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1553 			    csummode == CHECKSUM_PARTIAL) {
1554 				paged = true;
1555 				zc = true;
1556 				uarg = msg->msg_ubuf;
1557 			}
1558 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1559 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
1560 						    false);
1561 			if (!uarg)
1562 				return -ENOBUFS;
1563 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1564 			if (rt->dst.dev->features & NETIF_F_SG &&
1565 			    csummode == CHECKSUM_PARTIAL) {
1566 				paged = true;
1567 				zc = true;
1568 			} else {
				/* No scatter-gather or no checksum offload:
				 * mark the uarg as not zerocopy; the data is
				 * copied on the regular paths below.
				 */
1569 				uarg_to_msgzc(uarg)->zerocopy = 0;
1570 				skb_zcopy_set(skb, uarg, &extra_uref);
1571 			}
1572 		}
1573 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1574 		if (inet_test_bit(HDRINCL, sk))
1575 			return -EPERM;
1576 		if (rt->dst.dev->features & NETIF_F_SG &&
1577 		    getfrag == ip_generic_getfrag)
1578 			/* We need an empty buffer to attach stuff to */
1579 			paged = true;
1580 		else
1581 			flags &= ~MSG_SPLICE_PAGES;
1582 	}
1583 
	/* Use the id stored in the cork when there is one; otherwise
	 * consume one from sk->sk_tskey.  hold_tskey lets the error path
	 * give that one back.
	 */
1584 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1585 	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
1586 		if (cork->flags & IPCORK_TS_OPT_ID) {
1587 			tskey = cork->ts_opt_id;
1588 		} else {
1589 			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1590 			hold_tskey = true;
1591 		}
1592 	}
1593 
1594 	/*
1595 	 * Let's try using as much space as possible.
1596 	 * Use MTU if total length of the message fits into the MTU.
1597 	 * Otherwise, we need to reserve fragment header and
1598 	 * fragment alignment (= 8-15 octets, in total).
1599 	 *
1600 	 * Note that we may need to "move" the data from the tail
1601 	 * of the buffer to the new fragment when we split
1602 	 * the message.
1603 	 *
1604 	 * FIXME: It may be fragmented into multiple chunks
1605 	 *        at once if non-fragmentable extension headers
1606 	 *        are too large.
1607 	 * --yoshfuji
1608 	 */
1609 
	/* The whole request is accounted up front; the error path
	 * subtracts whatever was not consumed.
	 */
1610 	cork->length += length;
1611 	if (!skb)
1612 		goto alloc_new_skb;
1613 
1614 	while (length > 0) {
1615 		/* Check if the remaining data fits into current packet. */
1616 		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		/* Not everything fits: more skbs will follow, so this one
		 * may only be filled up to maxfraglen.
		 */
1617 		if (copy < length)
1618 			copy = maxfraglen - skb->len;
1619 
1620 		if (copy <= 0) {
1621 			char *data;
1622 			unsigned int datalen;
1623 			unsigned int fraglen;
1624 			unsigned int fraggap;
1625 			unsigned int alloclen, alloc_extra;
1626 			unsigned int pagedlen;
1627 alloc_new_skb:
1628 			/* There's no room in the current skb */
			/* fraggap: bytes by which the current skb exceeds
			 * maxfraglen; they are moved into the new skb below.
			 */
1629 			if (skb)
1630 				fraggap = skb->len - maxfraglen;
1631 			else
1632 				fraggap = 0;
1633 			/* update mtu and maxfraglen if necessary */
1634 			if (!skb || !skb_prev)
1635 				ip6_append_data_mtu(&mtu, &maxfraglen,
1636 						    fragheaderlen, skb, rt,
1637 						    orig_mtu);
1638 
1639 			skb_prev = skb;
1640 
1641 			/*
1642 			 * If remaining data exceeds the mtu,
1643 			 * we know we need more fragment(s).
1644 			 */
1645 			datalen = length + fraggap;
1646 
1647 			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
1648 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1649 			fraglen = datalen + fragheaderlen;
1650 			pagedlen = 0;
1651 
1652 			alloc_extra = hh_len;
1653 			alloc_extra += dst_exthdrlen;
1654 			alloc_extra += rt->dst.trailer_len;
1655 
1656 			/* We just reserve space for fragment header.
1657 			 * Note: this may be overallocation if the message
1658 			 * (without MSG_MORE) fits into the MTU.
1659 			 */
1660 			alloc_extra += sizeof(struct frag_hdr);
1661 
			/* Linear area size:
			 *  - MSG_MORE on a device without scatter-gather:
			 *    a full mtu, so later appends have tailroom;
			 *  - not paged and either small enough or no
			 *    scatter-gather: the whole fragment;
			 *  - otherwise headers only, the payload
			 *    (pagedlen) is not part of the linear area.
			 */
1662 			if ((flags & MSG_MORE) &&
1663 			    !(rt->dst.dev->features&NETIF_F_SG))
1664 				alloclen = mtu;
1665 			else if (!paged &&
1666 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1667 				  !(rt->dst.dev->features & NETIF_F_SG)))
1668 				alloclen = fraglen;
1669 			else {
1670 				alloclen = fragheaderlen + transhdrlen;
1671 				pagedlen = datalen - transhdrlen;
1672 			}
1673 			alloclen += alloc_extra;
1674 
1675 			if (datalen != length + fraggap) {
1676 				/*
1677 				 * this is not the last fragment, the trailer
1678 				 * space is regarded as data space.
1679 				 */
1680 				datalen += rt->dst.trailer_len;
1681 			}
1682 
1683 			fraglen = datalen + fragheaderlen;
1684 
1685 			copy = datalen - transhdrlen - fraggap - pagedlen;
1686 			/* [!] NOTE: copy may be negative if pagedlen>0
1687 			 * because then the equation may reduce to -fraggap.
1688 			 */
1689 			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1690 				err = -EINVAL;
1691 				goto error;
1692 			}
			/* While transhdrlen is still set this is the first
			 * skb of the datagram and it is obtained through
			 * sock_alloc_send_skb(), with the MSG_DONTWAIT bit
			 * passed along.  Later skbs are allocated directly,
			 * as long as the charged write memory does not
			 * exceed twice sk_sndbuf.
			 */
1693 			if (transhdrlen) {
1694 				skb = sock_alloc_send_skb(sk, alloclen,
1695 						(flags & MSG_DONTWAIT), &err);
1696 			} else {
1697 				skb = NULL;
1698 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1699 				    2 * sk->sk_sndbuf)
1700 					skb = alloc_skb(alloclen,
1701 							sk->sk_allocation);
1702 				if (unlikely(!skb))
1703 					err = -ENOBUFS;
1704 			}
1705 			if (!skb)
1706 				goto error;
1707 			/*
1708 			 *	Fill in the control structures
1709 			 */
1710 			skb->protocol = htons(ETH_P_IPV6);
1711 			skb->ip_summed = csummode;
1712 			skb->csum = 0;
1713 			/* reserve for fragmentation and ipsec header */
1714 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1715 				    dst_exthdrlen);
1716 
1717 			/*
1718 			 *	Find where to start putting bytes
1719 			 */
1720 			data = skb_put(skb, fraglen - pagedlen);
1721 			skb_set_network_header(skb, exthdrlen);
1722 			data += fragheaderlen;
1723 			skb->transport_header = (skb->network_header +
1724 						 fragheaderlen);
			/* Move the overhang of the previous skb into this
			 * one, adjust both checksums, and trim the previous
			 * skb back to maxfraglen.
			 */
1725 			if (fraggap) {
1726 				skb->csum = skb_copy_and_csum_bits(
1727 					skb_prev, maxfraglen,
1728 					data + transhdrlen, fraggap);
1729 				skb_prev->csum = csum_sub(skb_prev->csum,
1730 							  skb->csum);
1731 				data += fraggap;
1732 				pskb_trim_unique(skb_prev, maxfraglen);
1733 			}
			/* The new skb is not queued yet, so it is freed here
			 * when the copy fails.
			 */
1734 			if (copy > 0 &&
1735 			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1736 					   from, data + transhdrlen, offset,
1737 					   copy, fraggap, skb) < 0) {
1738 				err = -EFAULT;
1739 				kfree_skb(skb);
1740 				goto error;
1741 			} else if (flags & MSG_SPLICE_PAGES) {
1742 				copy = 0;
1743 			}
1744 
			/* The header lengths apply to the first skb only. */
1745 			offset += copy;
1746 			length -= copy + transhdrlen;
1747 			transhdrlen = 0;
1748 			exthdrlen = 0;
1749 			dst_exthdrlen = 0;
1750 
1751 			/* Only the initial fragment is time stamped */
1752 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1753 			cork->tx_flags = 0;
1754 			skb_shinfo(skb)->tskey = tskey;
1755 			tskey = 0;
1756 			skb_zcopy_set(skb, uarg, &extra_uref);
1757 
1758 			if ((flags & MSG_CONFIRM) && !skb_prev)
1759 				skb_set_dst_pending_confirm(skb, 1);
1760 
1761 			/*
1762 			 * Put the packet on the pending queue
1763 			 */
			/* An skb without a destructor is tied to the socket
			 * here; its truesize is collected in
			 * wmem_alloc_delta and added to sk_wmem_alloc in one
			 * step when the function exits.
			 */
1764 			if (!skb->destructor) {
1765 				skb->destructor = sock_wfree;
1766 				skb->sk = sk;
1767 				wmem_alloc_delta += skb->truesize;
1768 			}
1769 			__skb_queue_tail(queue, skb);
1770 			continue;
1771 		}
1772 
1773 		if (copy > length)
1774 			copy = length;
1775 
		/* Four ways to append to the current skb: into linear
		 * tailroom (device without scatter-gather), by splicing
		 * pages from the msghdr iterator, through the socket page
		 * fragment, or as zerocopy data.
		 */
1776 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1777 		    skb_tailroom(skb) >= copy) {
1778 			unsigned int off;
1779 
1780 			off = skb->len;
1781 			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1782 					    from, skb_put(skb, copy),
1783 					    offset, copy, off, skb) < 0) {
				/* Undo the skb_put() done for the failed copy. */
1784 				__skb_trim(skb, off);
1785 				err = -EFAULT;
1786 				goto error;
1787 			}
1788 		} else if (flags & MSG_SPLICE_PAGES) {
1789 			struct msghdr *msg = from;
1790 
1791 			err = -EIO;
1792 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1793 				goto error;
1794 
1795 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
1796 			if (err < 0)
1797 				goto error;
			/* Continue with the count actually spliced. */
1798 			copy = err;
1799 			if (!(flags & MSG_NO_SHARED_FRAGS))
1800 				skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
1801 			wmem_alloc_delta += copy;
1802 		} else if (!zc) {
1803 			int i = skb_shinfo(skb)->nr_frags;
1804 
1805 			err = -ENOMEM;
1806 			if (!sk_page_frag_refill(sk, pfrag))
1807 				goto error;
1808 
1809 			skb_zcopy_downgrade_managed(skb);
			/* Start a new, initially empty frag unless the data
			 * can be merged into the last one.
			 */
1810 			if (!skb_can_coalesce(skb, i, pfrag->page,
1811 					      pfrag->offset)) {
1812 				err = -EMSGSIZE;
1813 				if (i == MAX_SKB_FRAGS)
1814 					goto error;
1815 
1816 				__skb_fill_page_desc(skb, i, pfrag->page,
1817 						     pfrag->offset, 0);
1818 				skb_shinfo(skb)->nr_frags = ++i;
1819 				get_page(pfrag->page);
1820 			}
			/* Never copy past the end of the page fragment. */
1821 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1822 			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1823 				    from,
1824 				    page_address(pfrag->page) + pfrag->offset,
1825 				    offset, copy, skb->len, skb) < 0)
1826 				goto error_efault;
1827 
1828 			pfrag->offset += copy;
1829 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1830 			skb->len += copy;
1831 			skb->data_len += copy;
1832 			skb->truesize += copy;
1833 			wmem_alloc_delta += copy;
1834 		} else {
1835 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1836 			if (err < 0)
1837 				goto error;
1838 		}
1839 		offset += copy;
1840 		length -= copy;
1841 	}
1842 
1843 	if (wmem_alloc_delta)
1844 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1845 	return 0;
1846 
1847 error_efault:
1848 	err = -EFAULT;
1849 error:
	/* skbs already queued stay on @queue.  Only the unconsumed length
	 * is taken back from the cork, and the write memory collected so
	 * far is still charged to the socket.
	 */
1850 	net_zcopy_put_abort(uarg, extra_uref);
1851 	cork->length -= length;
1852 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1853 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1854 	if (hold_tskey)
1855 		atomic_dec(&sk->sk_tskey);
1856 	return err;
1857 }
1858 
1859 int ip6_append_data(struct sock *sk,
1860 		    int getfrag(void *from, char *to, int offset, int len,
1861 				int odd, struct sk_buff *skb),
1862 		    void *from, size_t length, int transhdrlen,
1863 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1864 		    struct rt6_info *rt, unsigned int flags)
1865 {
1866 	struct inet_sock *inet = inet_sk(sk);
1867 	int exthdrlen;
1868 	int err;
1869 
1870 	if (flags&MSG_PROBE)
1871 		return 0;
1872 	if (skb_queue_empty(&sk->sk_write_queue)) {
1873 		/*
1874 		 * setup for corking
1875 		 */
1876 		dst_hold(&rt->dst);
1877 		err = ip6_setup_cork(sk, &inet->cork,
1878 				     ipc6, rt);
1879 		if (err)
1880 			return err;
1881 
1882 		inet->cork.fl.u.ip6 = *fl6;
1883 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1884 		length += exthdrlen;
1885 		transhdrlen += exthdrlen;
1886 	} else {
1887 		transhdrlen = 0;
1888 	}
1889 
1890 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1891 				 sk_page_frag(sk), getfrag,
1892 				 from, length, transhdrlen, flags);
1893 }
1894 EXPORT_SYMBOL_GPL(ip6_append_data);
1895 
1896 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1897 {
1898 	struct dst_entry *dst = cork->base.dst;
1899 
1900 	cork->base.dst = NULL;
1901 	skb_dst_set(skb, dst);
1902 }
1903 
1904 static void ip6_cork_release(struct inet_cork_full *cork)
1905 {
1906 	struct inet6_cork *v6_cork = &cork->base6;
1907 
1908 	if (unlikely(v6_cork->opt)) {
1909 		struct ipv6_txoptions *opt = v6_cork->opt;
1910 
1911 		kfree(opt->dst0opt);
1912 		kfree(opt->dst1opt);
1913 		kfree(opt->hopopt);
1914 		kfree(opt->srcrt);
1915 		kfree(opt);
1916 		v6_cork->opt = NULL;
1917 	}
1918 
1919 	if (cork->base.dst) {
1920 		dst_release(cork->base.dst);
1921 		cork->base.dst = NULL;
1922 	}
1923 }
1924 
/*
 * Turn the skbs pending on @queue into a single IPv6 packet.
 *
 * The first skb becomes the head; every further skb is chained onto the
 * head's frag_list.  Extension headers from the cork options and the
 * IPv6 header are then pushed in front of the head, priority, mark and
 * delivery time are taken from the cork, the cork's dst is moved to the
 * skb, and output statistics are updated before the cork is released.
 *
 * Returns the head skb, or NULL when @queue is empty.  In the empty
 * case ip6_cork_release() is not called.
 */
1925 struct sk_buff *__ip6_make_skb(struct sock *sk,
1926 			       struct sk_buff_head *queue,
1927 			       struct inet_cork_full *cork)
1928 {
1929 	struct sk_buff *skb, *tmp_skb;
1930 	struct sk_buff **tail_skb;
1931 	struct in6_addr *final_dst;
1932 	struct net *net = sock_net(sk);
1933 	struct ipv6hdr *hdr;
1934 	struct ipv6_txoptions *opt;
1935 	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
1936 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1937 	unsigned char proto = fl6->flowi6_proto;
1938 
1939 	skb = __skb_dequeue(queue);
1940 	if (!skb)
1941 		goto out;
	/* Followers are linked in starting at the head's frag_list. */
1942 	tail_skb = &(skb_shinfo(skb)->frag_list);
1943 
1944 	/* move skb->data to ip header from ext header */
1945 	if (skb->data < skb_network_header(skb))
1946 		__skb_pull(skb, skb_network_offset(skb));
	/* Each follower loses its network header (the length is taken
	 * from the head skb) and its len/truesize are folded into the
	 * head.  Its destructor and sk are cleared.
	 */
1947 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1948 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1949 		*tail_skb = tmp_skb;
1950 		tail_skb = &(tmp_skb->next);
1951 		skb->len += tmp_skb->len;
1952 		skb->data_len += tmp_skb->len;
1953 		skb->truesize += tmp_skb->truesize;
1954 		tmp_skb->destructor = NULL;
1955 		tmp_skb->sk = NULL;
1956 	}
1957 
1958 	/* Allow local fragmentation. */
1959 	skb->ignore_df = ip6_sk_ignore_df(sk);
1960 	__skb_pull(skb, skb_network_header_len(skb));
1961 
	/* The push helpers return the protocol value to use for the header
	 * in front of them; the non-fragmentable one is also handed
	 * &final_dst and may change it.
	 */
1962 	final_dst = &fl6->daddr;
1963 	opt = cork->base6.opt;
1964 	if (unlikely(opt)) {
1965 		if (opt->opt_flen)
1966 			proto = ipv6_push_frag_opts(skb, opt, proto);
1967 		if (opt->opt_nflen)
1968 			proto = ipv6_push_nfrag_opts(skb, opt, proto,
1969 						     &final_dst, &fl6->saddr);
1970 	}
1971 	skb_push(skb, sizeof(struct ipv6hdr));
1972 	skb_reset_network_header(skb);
1973 	hdr = ipv6_hdr(skb);
1974 
1975 	ip6_flow_hdr(hdr, cork->base6.tclass,
1976 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1977 					ip6_autoflowlabel(net, sk), fl6));
1978 	hdr->hop_limit = cork->base6.hop_limit;
1979 	hdr->nexthdr = proto;
1980 	hdr->saddr = fl6->saddr;
1981 	hdr->daddr = *final_dst;
1982 
1983 	skb->priority = cork->base.priority;
1984 	skb->mark = cork->base.mark;
1985 	if (sk_is_tcp(sk))
1986 		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
1987 	else
1988 		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);
1989 
	/* cork->base.dst is NULL from here on; rt was read from it at the
	 * top of the function and is still used for the statistics.
	 */
1990 	ip6_cork_steal_dst(skb, cork);
1991 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1992 	if (unlikely(proto == IPPROTO_ICMPV6)) {
1993 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1994 		u8 icmp6_type;
1995 
		/* For raw sockets without FLOWI_FLAG_KNOWN_NH the ICMPv6
		 * type comes from the flow; otherwise it is read from the
		 * packet.
		 */
1996 		if (sk->sk_socket->type == SOCK_RAW &&
1997 		   !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
1998 			icmp6_type = fl6->fl6_icmp_type;
1999 		else
2000 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
2001 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
2002 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
2003 	}
2004 
2005 	ip6_cork_release(cork);
2006 out:
2007 	return skb;
2008 }
2009 
2010 int ip6_send_skb(struct sk_buff *skb)
2011 {
2012 	struct net *net = sock_net(skb->sk);
2013 	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
2014 	int err;
2015 
2016 	rcu_read_lock();
2017 	err = ip6_local_out(net, skb->sk, skb);
2018 	if (err) {
2019 		if (err > 0)
2020 			err = net_xmit_errno(err);
2021 		if (err)
2022 			IP6_INC_STATS(net, rt->rt6i_idev,
2023 				      IPSTATS_MIB_OUTDISCARDS);
2024 	}
2025 
2026 	rcu_read_unlock();
2027 	return err;
2028 }
2029 
/*
 * Finish the datagram pending on @sk and send it.
 *
 * Returns 0 when ip6_finish_skb() produces no packet, otherwise the
 * result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2041 
2042 static void __ip6_flush_pending_frames(struct sock *sk,
2043 				       struct sk_buff_head *queue,
2044 				       struct inet_cork_full *cork)
2045 {
2046 	struct sk_buff *skb;
2047 
2048 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2049 		if (skb_dst(skb))
2050 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2051 				      IPSTATS_MIB_OUTDISCARDS);
2052 		kfree_skb(skb);
2053 	}
2054 
2055 	ip6_cork_release(cork);
2056 }
2057 
2058 void ip6_flush_pending_frames(struct sock *sk)
2059 {
2060 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2061 				   &inet_sk(sk)->cork);
2062 }
2063 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2064 
2065 struct sk_buff *ip6_make_skb(struct sock *sk,
2066 			     int getfrag(void *from, char *to, int offset,
2067 					 int len, int odd, struct sk_buff *skb),
2068 			     void *from, size_t length, int transhdrlen,
2069 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2070 			     unsigned int flags, struct inet_cork_full *cork)
2071 {
2072 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2073 	struct sk_buff_head queue;
2074 	int err;
2075 
2076 	if (flags & MSG_PROBE) {
2077 		dst_release(&rt->dst);
2078 		return NULL;
2079 	}
2080 
2081 	__skb_queue_head_init(&queue);
2082 
2083 	cork->base.flags = 0;
2084 	cork->base.addr = 0;
2085 	cork->base.opt = NULL;
2086 	cork->base6.opt = NULL;
2087 	err = ip6_setup_cork(sk, cork, ipc6, rt);
2088 	if (err) {
2089 		ip6_cork_release(cork);
2090 		return ERR_PTR(err);
2091 	}
2092 
2093 	err = __ip6_append_data(sk, &queue, cork,
2094 				&current->task_frag, getfrag, from,
2095 				length + exthdrlen, transhdrlen + exthdrlen,
2096 				flags);
2097 	if (err) {
2098 		__ip6_flush_pending_frames(sk, &queue, cork);
2099 		return ERR_PTR(err);
2100 	}
2101 
2102 	return __ip6_make_skb(sk, &queue, cork);
2103 }
2104