xref: /linux/net/ipv6/ip6_output.c (revision dfecb0c5af3b07ebfa84be63a7a21bfc9e29a872)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
/* Last step of the IPv6 output path after NF_INET_POST_ROUTING: resolve
 * the neighbour for the route's nexthop and hand the packet to
 * neigh_output().  Also loops multicast back to local listeners when
 * requested, filters node-local-scoped multicast off the wire, and
 * diverts to a lightweight tunnel's xmit handler when the dst asks for
 * it.
 *
 * Runs under rcu_read_lock() (dst_dev_rcu() and
 * __ipv6_neigh_lookup_noref() rely on it).  Returns 0/NET_XMIT_* on
 * success, negative errno on drop.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev_rcu(dst);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* idev stays alive because we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (unlikely(ipv6_addr_is_multicast(daddr))) {
		/* Loop a copy back to local listeners when the socket
		 * enabled multicast loopback and either a multicast router
		 * socket may want it or we are a member of the destination
		 * group on this device.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0: the packet must not leave this node;
			 * the looped-back clone above was all there is to do.
			 */
			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		/* Node-local scoped multicast never goes on the wire. */
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* Lightweight tunnel may take over transmission entirely. */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	/* Create the neighbour entry on demand if the lookup found none. */
	if (IS_ERR_OR_NULL(neigh)) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	return ret;
}
139 
140 static int
141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142 				    struct sk_buff *skb, unsigned int mtu)
143 {
144 	struct sk_buff *segs, *nskb;
145 	netdev_features_t features;
146 	int ret = 0;
147 
148 	/* Please see corresponding comment in ip_finish_output_gso
149 	 * describing the cases where GSO segment length exceeds the
150 	 * egress MTU.
151 	 */
152 	features = netif_skb_features(skb);
153 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154 	if (IS_ERR_OR_NULL(segs)) {
155 		kfree_skb(skb);
156 		return -ENOMEM;
157 	}
158 
159 	consume_skb(skb);
160 
161 	skb_list_walk_safe(segs, segs, nskb) {
162 		int err;
163 
164 		skb_mark_not_on_list(segs);
165 		/* Last GSO segment can be smaller than gso_size (and MTU).
166 		 * Adding a fragment header would produce an "atomic fragment",
167 		 * which is considered harmful (RFC-8021). Avoid that.
168 		 */
169 		err = segs->len > mtu ?
170 			ip6_fragment(net, sk, segs, ip6_finish_output2) :
171 			ip6_finish_output2(net, sk, segs);
172 		if (err && ret == 0)
173 			ret = err;
174 	}
175 
176 	return ret;
177 }
178 
/* Send a GSO packet: the common case (every segment fits the MTU) goes
 * straight to ip6_finish_output2(); otherwise fall back to software
 * segmentation in ip6_finish_output_gso_slowpath_drop().
 */
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	if (likely(skb_gso_validate_network_len(skb, mtu)))
		return ip6_finish_output2(net, sk, skb);

	return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
}
187 
/* Post-NF_INET_POST_ROUTING processing: re-run dst_output() when SNAT
 * gave the packet a new xfrm policy, otherwise fragment if necessary
 * and continue in ip6_finish_output2().
 */
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		/* IP6SKB_REROUTED makes ip6_output() skip POST_ROUTING
		 * on the second pass.
		 */
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb))
		return ip6_finish_output_gso(net, sk, skb, mtu);

	/* frag_max_size (set by conntrack defrag, see ip6_pkt_too_big())
	 * caps fragments to the size originally received.
	 */
	if (unlikely(skb->len > mtu ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);

	return ip6_finish_output2(net, sk, skb);
}
210 
211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
212 {
213 	int ret;
214 
215 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
216 	switch (ret) {
217 	case NET_XMIT_SUCCESS:
218 	case NET_XMIT_CN:
219 		return __ip6_finish_output(net, sk, skb) ? : ret;
220 	default:
221 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
222 		return ret;
223 	}
224 }
225 
/* Network-layer output entry point for IPv6 (called via dst_output()):
 * drop the packet if IPv6 is disabled on the egress device, otherwise
 * run NF_INET_POST_ROUTING (skipped when the packet was already
 * rerouted there, see __ip6_finish_output()) towards
 * ip6_finish_output().
 */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev, *indev = skb->dev;
	struct inet6_dev *idev;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	/* RCU keeps dev/idev valid across the whole transmit path below. */
	rcu_read_lock();
	dev = dst_dev_rcu(dst);
	idev = ip6_dst_idev(dst);
	skb->dev = dev;

	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		rcu_read_unlock();
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			   net, sk, skb, indev, dev,
			   ip6_finish_output,
			   !(IP6CB(skb)->flags & IP6SKB_REROUTED));
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_output);
254 
255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
256 {
257 	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
258 		return ip6_default_np_autolabel(net);
259 	return inet6_test_bit(AUTOFLOWLABEL, sk);
260 }
261 
262 int ip6_dst_hoplimit(struct dst_entry *dst)
263 {
264 	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
265 
266 	rcu_read_lock();
267 	if (hoplimit == 0) {
268 		struct net_device *dev = dst_dev_rcu(dst);
269 		struct inet6_dev *idev;
270 
271 		idev = __in6_dev_get(dev);
272 		if (idev)
273 			hoplimit = READ_ONCE(idev->cnf.hop_limit);
274 		else
275 			hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit);
276 	}
277 	rcu_read_unlock();
278 
279 	return hoplimit;
280 }
281 EXPORT_SYMBOL(ip6_dst_hoplimit);
282 
283 /*
284  * xmit an sk_buff (used by TCP and SCTP)
285  * Note : socket lock is not held for SYNACK packets, but might be modified
286  * by calls to skb_set_owner_w() and ipv6_local_error(),
287  * which are using proper atomic operations or spinlocks.
288  */
/* Build the IPv6 header (and any extension headers from @opt) onto
 * @skb and send it through NF_INET_LOCAL_OUT / dst_output().  The dst
 * must already be attached to @skb.  Oversized packets that cannot be
 * sent (no ignore_df, not GSO) are reported to the sender with
 * EMSGSIZE.  Returns 0/NET_XMIT_* on success, negative errno on error.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct net *net = sock_net(sk);
	unsigned int head_room;
	struct net_device *dev;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int ret, hlimit = -1;
	u32 mtu;

	rcu_read_lock();

	dev = dst_dev_rcu(dst);
	/* Headroom for the IPv6 header, the link-layer header and any
	 * extension headers pushed below.
	 */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* idev stays alive while we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			ret = -ENOBUFS;
			goto unlock;
		}
	}

	if (unlikely(opt)) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);

		/* Note: passed by reference, so this may rewrite
		 * first_hop (presumably for routing headers -- see
		 * ipv6_push_nfrag_opts()).
		 */
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &first_hop,
						     &fl6->saddr);
	}

	/* payload_len cannot represent more than IPV6_MAXPLEN; larger
	 * payloads get 0 here (presumably jumbogram handling, RFC 2675
	 * -- TODO confirm against callers).
	 */
	if (unlikely(seg_len > IPV6_MAXPLEN))
		seg_len = 0;

	__skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst6_mtu(dst);
	if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb)) {
			/* l3mdev consumed the skb; nothing more to do. */
			ret = 0;
			goto unlock;
		}

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			      net, (struct sock *)sk, skb, NULL, dev,
			      dst_output);
		goto unlock;
	}

	/* Too big and may not be fragmented here: tell the local sender. */
	ret = -EMSGSIZE;
	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_xmit);
399 
/* Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain for alert value @sel.  Earlier matches receive clones;
 * the last match consumes the original skb.
 * Returns 1 when at least one socket took the packet, 0 otherwise
 * (caller keeps ownership of @skb in that case).
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		/* Respect a device-bound socket's binding. */
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {

			/* RTALERT_ISOLATE keeps deliveries within the
			 * socket's own network namespace.
			 */
			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			/* Clone for the previous match; defer consuming the
			 * original until we know it is the last match.
			 */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
433 
/* Classify a packet destined to an address we proxy NDP for.
 * Returns 1 when it is a neighbour-discovery ICMPv6 message that must
 * be handled by local input, -1 when it must be dropped (link-local
 * destination), 0 to forward normally.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Skip extension headers to find the transport protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Make sure at least the ICMPv6 type byte is linear. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
485 
/* Final step of NF_INET_FORWARD: transmit via dst_output().  Packets
 * already forwarded by hardware (offload_l3_fwd_mark) are consumed
 * without a software transmit.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	/* A forwarded packet must not carry its receive timestamp. */
	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}
499 
500 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
501 {
502 	if (skb->len <= mtu)
503 		return false;
504 
505 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
506 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
507 		return true;
508 
509 	if (skb->ignore_df)
510 		return false;
511 
512 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
513 		return false;
514 
515 	return true;
516 }
517 
/* Forward a received IPv6 packet: verify forwarding is permitted,
 * handle Router Alert and proxy-NDP special cases, check the hop
 * limit, possibly emit an NDP redirect, enforce the path MTU and
 * finally run NF_INET_FORWARD towards ip6_forward_finish() (which
 * decrements nothing -- the hop limit is decremented here, after
 * skb_cow()).  Returns 0 when consumed/queued, negative errno on drop.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst_dev(dst));
	struct net_device *dev;
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	/* idev of the *input* interface, used for stats and policy. */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* Locally owned skbs are never forwarded. */
	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	/* xfrm6_route_forward() may have replaced the route: reload. */
	dst = skb_dst(skb);
	dev = dst_dev(dst);
	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		rcu_read_unlock();
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (unlikely(ip6_pkt_too_big(skb, mtu))) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	/* Private copy before we modify the header below. */
	if (skb_cow(skb, dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}
698 
699 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
700 {
701 	to->pkt_type = from->pkt_type;
702 	to->priority = from->priority;
703 	to->protocol = from->protocol;
704 	skb_dst_drop(to);
705 	skb_dst_set(to, dst_clone(skb_dst(from)));
706 	to->dev = from->dev;
707 	to->mark = from->mark;
708 
709 	skb_copy_hash(to, from);
710 
711 #ifdef CONFIG_NET_SCHED
712 	to->tc_index = from->tc_index;
713 #endif
714 	nf_copy(to, from);
715 	skb_ext_copy(to, from);
716 	skb_copy_secmark(to, from);
717 }
718 
/* Fast-path fragmentation setup for an skb carrying a frag_list whose
 * geometry already matches the MTU: detach the frag list into @iter,
 * save a copy of the first @hlen header bytes, and turn @skb itself
 * into the first fragment by inserting a fragment header after the
 * unfragmentable headers.  Returns 0 or -ENOMEM.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	/* Take ownership of the frag list; the iterator walks it. */
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Open a gap for the fragment header between the unfragmentable
	 * headers and the payload, then restore the saved headers.
	 */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);	/* first fragment: offset 0, MF set */
	fh->identification = frag_id;

	/* Trim the first fragment down to its in-place (non-list) data. */
	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
759 
/* Turn the next detached frag-list entry (iter->frag) into a complete
 * fragment: prepend a fragment header plus the saved unfragmentable
 * headers and fill in the running fragment offset.  @skb is the
 * previous fragment; its length determines the offset advance.
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* Advance by the payload carried in the previous fragment. */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	/* More-fragments flag unless this is the last list entry. */
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
784 
785 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
786 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
787 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
788 {
789 	state->prevhdr = prevhdr;
790 	state->nexthdr = nexthdr;
791 	state->frag_id = frag_id;
792 
793 	state->hlen = hlen;
794 	state->mtu = mtu;
795 
796 	state->left = skb->len - hlen;	/* Space per frame */
797 	state->ptr = hlen;		/* Where to start from */
798 
799 	state->hroom = hdr_room;
800 	state->troom = needed_tailroom;
801 
802 	state->offset = 0;
803 }
804 EXPORT_SYMBOL(ip6_frag_init);
805 
/* Slow-path fragmentation: allocate and fill the next fragment of @skb
 * according to @state (prepared by ip6_frag_init()), copying the saved
 * headers, building a fragment header and copying the next payload
 * chunk.  Returns the new fragment or ERR_PTR(-ENOMEM).
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Patch the copied previous header's nexthdr field so it points
	 * at the fragment header we are inserting.
	 */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	/* More fragments follow unless we just consumed the rest. */
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
881 
/* Fragment @skb to the path MTU and send each fragment via @output.
 * Uses the fast path (ip6_fraglist_*) when the skb already carries a
 * well-shaped frag_list, otherwise the copying slow path
 * (ip6_frag_init()/ip6_frag_next()).  Consumes @skb in all cases.
 * Returns 0 on success or a negative errno; sends PKT_TOOBIG to the
 * origin when fragmentation is not permitted.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	/* Only honour per-socket frag_size for genuinely local traffic
	 * (not when re-entered from a tunnel device).
	 */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	if (!ipv6_mod_enabled()) {
		kfree_skb(skb);
		return -EAFNOSUPPORT;
	}

	/* Locate the last unfragmentable header; fragments are inserted
	 * after it.
	 */
	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	/* Remember prevhdr as an offset: skb_checksum_help() below may
	 * reallocate the header.
	 */
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	/* Each fragment must fit the headers plus at least 8 bytes. */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	/* Checksum must be finalized before the payload is split up. */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		/* Fast path only if every piece already has fragment
		 * geometry and enough headroom, and nothing is shared.
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			/* Move socket accounting onto the fragments. */
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		/* Free the fragments we did not get to send. */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		/* Undo the socket-accounting moves done above. */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_fragment);
1075 
1076 static inline int ip6_rt_check(const struct rt6key *rt_key,
1077 			       const struct in6_addr *fl_addr,
1078 			       const struct in6_addr *addr_cache)
1079 {
1080 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1081 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1082 }
1083 
/* Validate a socket's cached dst against flow @fl6.  Returns the dst
 * when it is still usable; otherwise releases it and returns NULL so
 * the caller performs a fresh route lookup.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A dst from a different address family can never be reused. */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
			 np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
			 np->saddr_cache ? &np->saddr : NULL) ||
#endif
	   (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
1131 
/* Core route lookup for an IPv6 flow.
 *
 * Fills *@dst on success and returns 0; on failure releases any partial
 * result, sets *@dst to NULL and returns a negative errno.  Handles
 * source-address selection when fl6->saddr is unspecified, and (with
 * CONFIG_IPV6_OPTIMISTIC_DAD) redirects to the default router's dst when
 * the chosen source address is still optimistic and the nexthop neighbour
 * is not yet valid.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		/* rt->from is RCU-protected; hold the read lock across
		 * the dereference and the saddr selection that uses it.
		 */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Retry with the now-selected source address if the first
	 * lookup failed or was skipped.
	 */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source is only usable toward a v4-mapped (or
	 * unspecified) destination.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1248 
1249 /**
1250  *	ip6_dst_lookup - perform route lookup on flow
1251  *	@net: Network namespace to perform lookup in
1252  *	@sk: socket which provides route info
1253  *	@dst: pointer to dst_entry * for result
1254  *	@fl6: flow to lookup
1255  *
1256  *	This function performs a route lookup on the given flow.
1257  *
1258  *	It returns zero on success, or a standard errno code on error.
1259  */
1260 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1261 		   struct flowi6 *fl6)
1262 {
1263 	*dst = NULL;
1264 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1265 }
1266 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1267 
1268 /**
1269  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1270  *	@net: Network namespace to perform lookup in
1271  *	@sk: socket which provides route info
1272  *	@fl6: flow to lookup
1273  *	@final_dst: final destination address for ipsec lookup
1274  *
1275  *	This function performs a route lookup on the given flow.
1276  *
1277  *	It returns a valid dst pointer on success, or a pointer encoded
1278  *	error code.
1279  */
1280 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1281 				      const struct in6_addr *final_dst)
1282 {
1283 	struct dst_entry *dst = NULL;
1284 	int err;
1285 
1286 	if (!ipv6_mod_enabled())
1287 		return ERR_PTR(-EAFNOSUPPORT);
1288 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1289 	if (err)
1290 		return ERR_PTR(err);
1291 	if (final_dst)
1292 		fl6->daddr = *final_dst;
1293 
1294 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1295 }
1296 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1297 
1298 /**
1299  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1300  *	@sk: socket which provides the dst cache and route info
1301  *	@fl6: flow to lookup
1302  *	@final_dst: final destination address for ipsec lookup
1303  *	@connected: whether @sk is connected or not
1304  *
1305  *	This function performs a route lookup on the given flow with the
1306  *	possibility of using the cached route in the socket if it is valid.
1307  *	It will take the socket dst lock when operating on the dst cache.
1308  *	As a result, this function can only be used in process context.
1309  *
1310  *	In addition, for a connected socket, cache the dst in the socket
1311  *	if the current cache is not valid.
1312  *
1313  *	It returns a valid dst pointer on success, or a pointer encoded
1314  *	error code.
1315  */
1316 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1317 					 const struct in6_addr *final_dst,
1318 					 bool connected)
1319 {
1320 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1321 
1322 	dst = ip6_sk_dst_check(sk, dst, fl6);
1323 	if (dst)
1324 		return dst;
1325 
1326 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1327 	if (connected && !IS_ERR(dst))
1328 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1329 
1330 	return dst;
1331 }
1332 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1333 
1334 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1335 					       gfp_t gfp)
1336 {
1337 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1338 }
1339 
1340 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1341 						gfp_t gfp)
1342 {
1343 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1344 }
1345 
1346 static void ip6_append_data_mtu(unsigned int *mtu,
1347 				int *maxfraglen,
1348 				unsigned int fragheaderlen,
1349 				struct sk_buff *skb,
1350 				struct rt6_info *rt,
1351 				unsigned int orig_mtu)
1352 {
1353 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1354 		if (!skb) {
1355 			/* first fragment, reserve header_len */
1356 			*mtu = orig_mtu - rt->dst.header_len;
1357 
1358 		} else {
1359 			/*
1360 			 * this fragment is not first, the headers
1361 			 * space is regarded as data space.
1362 			 */
1363 			*mtu = orig_mtu;
1364 		}
1365 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1366 			      + fragheaderlen - sizeof(struct frag_hdr);
1367 	}
1368 }
1369 
/* Initialize @cork for a corked transmission: duplicate tx options,
 * compute the path MTU and copy per-packet metadata from @ipc6.
 *
 * Takes ownership of the dst reference carried by @rt even on error, so
 * a failing return can always be cleaned up with ip6_cork_release().
 * Returns 0 or a negative errno.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
	struct inet6_cork *v6_cork = &cork->base6;
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu, frag_size;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (unlikely(opt)) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc_obj(*opt, sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		/* Partially duplicated options are freed later by
		 * ip6_cork_release(), so bail out on first failure.
		 */
		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	v6_cork->dontfrag = ipc6->dontfrag;
	/* With PMTUDISC_PROBE the device MTU is used directly; otherwise
	 * the (possibly xfrm-adjusted) path MTU applies.
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst);
	else
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst));

	/* A user-requested IPV6_MTU can only shrink the effective MTU. */
	frag_size = READ_ONCE(np->frag_size);
	if (frag_size && frag_size < mtu)
		mtu = frag_size;

	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	cork->base.priority = ipc6->sockc.priority;
	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
		cork->base.flags |= IPCORK_TS_OPT_ID;
		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
	}
	cork->base.length = 0;
	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1446 
/* Append @length bytes obtained via @getfrag to the pending packet on
 * @queue, growing the tail skb and allocating new fragment-sized skbs as
 * needed.  Honors MSG_MORE, GSO, MSG_ZEROCOPY and MSG_SPLICE_PAGES.
 *
 * @transhdrlen is non-zero only for the first call of a corked sequence
 * (it reserves room for the transport header).  Returns 0 or a negative
 * errno; on error the queued data is left for the caller to flush.
 */
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags)
{
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct inet6_cork *v6_cork = &cork_full->base6;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	struct sk_buff *skb, *skb_prev = NULL;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey = false, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	/* An empty queue means this is the first fragment: account for
	 * destination options and dst-provided (e.g. ipsec) headers.
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	/* Reject an MTU too small to carry even one 8-octet fragment. */
	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	/* IPV6_DONTFRAG datagram sockets get a local PMTU notification
	 * instead of fragmentation.
	 */
	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	/* Decide whether zerocopy (user-supplied ubuf or SOCK_ZEROCOPY)
	 * or page splicing can be used for this send.
	 */
	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
						    false);
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	/* Pick the timestamp key: either the cork-provided opt id or a
	 * freshly consumed sk_tskey (undone on error via hold_tskey).
	 */
	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
		if (cork->flags & IPCORK_TS_OPT_ID) {
			tskey = cork->ts_opt_id;
		} else {
			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
			hold_tskey = true;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				/* Paged path: allocate only headers here,
				 * the payload lands in page frags later.
				 */
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduces to -fraggap.
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			/* First skb of the message may block on sndbuf;
			 * later ones use a softer 2*sndbuf limit.
			 */
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			/* Move the overhang beyond maxfraglen from the
			 * previous skb into this one, fixing checksums.
			 */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					   from, data + transhdrlen, offset,
					   copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		/* Four ways to fill the current skb: linear tailroom,
		 * spliced pages, page-frag coalescing, or zerocopy frags.
		 */
		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, skb_put(skb, copy),
					    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
			if (err < 0)
				goto error;
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
				    from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	/* Charge all newly queued bytes to the socket in one go. */
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}
1854 
/* Append data to the socket's pending write queue, setting up the cork
 * on the first call of a sequence.  @rt is passed with a dst reference
 * that the cork takes ownership of.  MSG_PROBE sends nothing.
 *
 * Returns 0 or a negative errno.
 */
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		/* ip6_setup_cork() takes over this reference, even on
		 * failure (released later via ip6_cork_release()).
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* Transport header space was reserved on the first call. */
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
1891 
1892 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1893 {
1894 	struct dst_entry *dst = cork->base.dst;
1895 
1896 	cork->base.dst = NULL;
1897 	skb_dst_set(skb, dst);
1898 }
1899 
1900 static void ip6_cork_release(struct inet_cork_full *cork)
1901 {
1902 	struct inet6_cork *v6_cork = &cork->base6;
1903 
1904 	if (unlikely(v6_cork->opt)) {
1905 		struct ipv6_txoptions *opt = v6_cork->opt;
1906 
1907 		kfree(opt->dst0opt);
1908 		kfree(opt->dst1opt);
1909 		kfree(opt->hopopt);
1910 		kfree(opt->srcrt);
1911 		kfree(opt);
1912 		v6_cork->opt = NULL;
1913 	}
1914 
1915 	if (cork->base.dst) {
1916 		dst_release(cork->base.dst);
1917 		cork->base.dst = NULL;
1918 	}
1919 }
1920 
/* Collapse the pending skbs on @queue into one packet: chain trailing
 * skbs as a frag_list, push extension headers and the IPv6 header, and
 * move the cork's dst onto the skb.  Releases the cork before returning.
 *
 * Returns the finished skb, or NULL when the queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Fold the remaining queued skbs into the head skb's frag_list,
	 * transferring their truesize accounting to the head.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	/* Push extension headers; a routing header may rewrite the
	 * on-wire destination (final_dst keeps the real one).
	 */
	final_dst = &fl6->daddr;
	opt = cork->base6.opt;
	if (unlikely(opt)) {
		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &final_dst, &fl6->saddr);
	}
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, cork->base6.tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = cork->base6.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = cork->base.priority;
	skb->mark = cork->base.mark;
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (unlikely(proto == IPPROTO_ICMPV6)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		/* Raw sockets built the ICMPv6 header themselves unless
		 * the nexthop was pinned by the caller.
		 */
		if (sk->sk_socket->type == SOCK_RAW &&
		   !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork);
out:
	return skb;
}
2005 
/* Transmit a fully built skb (as produced by __ip6_make_skb()), mapping
 * positive qdisc return codes to errnos and accounting discards.
 *
 * Consumes @skb.  Returns 0 or a negative errno.
 */
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	int err;

	rcu_read_lock();
	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		/* Positive values are NET_XMIT codes; some map to 0. */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	rcu_read_unlock();
	return err;
}
2025 
/* Finalize the socket's pending write queue into a packet and send it.
 * Returns 0 when nothing was pending, otherwise the transmit result.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2037 
2038 static void __ip6_flush_pending_frames(struct sock *sk,
2039 				       struct sk_buff_head *queue,
2040 				       struct inet_cork_full *cork)
2041 {
2042 	struct sk_buff *skb;
2043 
2044 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2045 		if (skb_dst(skb))
2046 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2047 				      IPSTATS_MIB_OUTDISCARDS);
2048 		kfree_skb(skb);
2049 	}
2050 
2051 	ip6_cork_release(cork);
2052 }
2053 
2054 void ip6_flush_pending_frames(struct sock *sk)
2055 {
2056 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2057 				   &inet_sk(sk)->cork);
2058 }
2059 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2060 
/* Build a complete packet in one shot using a caller-provided private
 * cork and queue (no socket write-queue involvement).  Takes ownership
 * of @rt's dst reference in all cases, including MSG_PROBE and errors.
 *
 * Returns the finished skb, NULL for MSG_PROBE, or an ERR_PTR.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	/* Caller-supplied cork may be uninitialized; clear the fields
	 * ip6_setup_cork()/ip6_cork_release() rely on.
	 */
	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base6.opt = NULL;
	err = ip6_setup_cork(sk, cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork);
		return ERR_PTR(err);
	}

	err = __ip6_append_data(sk, &queue, cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags);
	if (err) {
		/* Flushing also releases the cork (and its dst ref). */
		__ip6_flush_pending_frames(sk, &queue, cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork);
}
2100