xref: /linux/net/ipv6/ip6_output.c (revision d0f4771e2befbe8de3a16a564c6bbd1d5502cec3)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
/* Last step of the IPv6 output path: resolve (or lazily create) the
 * neighbour entry for the route's nexthop and queue the packet via
 * neigh_output().  Multicast packets may additionally be looped back
 * to local listeners, and lwtunnel output redirection is honoured
 * before neighbour resolution.
 *
 * Runs under rcu_read_lock() (see the idev comment below), so dst,
 * dev and idev remain valid throughout.  Returns 0 or a negative
 * errno; the skb is consumed on error paths.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev_rcu(dst);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* idev stays alive because we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (unlikely(ipv6_addr_is_multicast(daddr))) {
		/* Loop a copy back to local listeners when the socket asked
		 * for multicast loopback and either the mroute socket wants
		 * unforwarded packets or this host is a member of the group.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* Hop limit 0 means the packet must not leave the
			 * host: the loopback copy above is all there is.
			 */
			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		/* Node-local scoped multicast must never hit the wire. */
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* A lightweight tunnel may take over transmission entirely. */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	/* Lazily create a neighbour entry if the lockless lookup missed. */
	if (IS_ERR_OR_NULL(neigh)) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	return ret;
}
139 
/* An oversized GSO packet could not be sent as-is: segment it in
 * software and transmit each resulting segment individually, IPv6-
 * fragmenting any segment that still exceeds @mtu.  Returns 0 on
 * success or the first error encountered (remaining segments are
 * still attempted).
 */
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	/* Mask out GSO features to force software segmentation. */
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	/* The original skb has been fully replaced by the segment list. */
	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		/* Remember only the first failure. */
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}
178 
179 static int ip6_finish_output_gso(struct net *net, struct sock *sk,
180 				 struct sk_buff *skb, unsigned int mtu)
181 {
182 	if (unlikely(!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
183 	    !skb_gso_validate_network_len(skb, mtu)))
184 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
185 
186 	return ip6_finish_output2(net, sk, skb);
187 }
188 
/* Dispatch an skb leaving the IPv6 stack: re-run dst_output() when a
 * post-SNAT policy lookup attached a new xfrm state, take the GSO path
 * for segmented packets, fragment oversized ones, and otherwise hand
 * off directly to ip6_finish_output2().
 */
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb))
		return ip6_finish_output_gso(net, sk, skb, mtu);

	/* frag_max_size caps fragments to what was originally received
	 * (set by conntrack defrag, see ip6_pkt_too_big()).
	 */
	if (unlikely(skb->len > mtu ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);

	return ip6_finish_output2(net, sk, skb);
}
211 
212 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
213 {
214 	int ret;
215 
216 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
217 	switch (ret) {
218 	case NET_XMIT_SUCCESS:
219 	case NET_XMIT_CN:
220 		return __ip6_finish_output(net, sk, skb) ? : ret;
221 	default:
222 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
223 		return ret;
224 	}
225 }
226 
/* Output-side entry point for IPv6 packets: drops the packet when IPv6
 * is disabled on the egress device, then runs the NF_INET_POST_ROUTING
 * hook (skipped for packets already rerouted by xfrm, see
 * IP6SKB_REROUTED) with ip6_finish_output() as the continuation.
 */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev, *indev = skb->dev;
	struct inet6_dev *idev;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	/* RCU protects dev/idev obtained from the dst below. */
	rcu_read_lock();
	dev = dst_dev_rcu(dst);
	idev = ip6_dst_idev(dst);
	skb->dev = dev;

	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		rcu_read_unlock();
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			   net, sk, skb, indev, dev,
			   ip6_finish_output,
			   !(IP6CB(skb)->flags & IP6SKB_REROUTED));
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_output);
255 
256 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
257 {
258 	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
259 		return ip6_default_np_autolabel(net);
260 	return inet6_test_bit(AUTOFLOWLABEL, sk);
261 }
262 
263 /*
264  * xmit an sk_buff (used by TCP and SCTP)
265  * Note : socket lock is not held for SYNACK packets, but might be modified
266  * by calls to skb_set_owner_w() and ipv6_local_error(),
267  * which are using proper atomic operations or spinlocks.
268  */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct hop_jumbo_hdr *hop_jumbo;
	int hoplen = sizeof(*hop_jumbo);
	struct net *net = sock_net(sk);
	unsigned int head_room;
	struct net_device *dev;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int ret, hlimit = -1;
	u32 mtu;

	rcu_read_lock();

	dev = dst_dev_rcu(dst);
	/* Worst-case headroom: IPv6 header, possible jumbo HBH option,
	 * link-layer header, plus any extension headers from @opt.
	 */
	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* idev stays alive while we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			ret = -ENOBUFS;
			goto unlock;
		}
	}

	if (unlikely(opt)) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);

		/* May rewrite first_hop when a routing header is present. */
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &first_hop,
						     &fl6->saddr);
	}

	/* Payload too large for the 16-bit payload_len field: insert a
	 * hop-by-hop jumbo length option and mark the skb; the
	 * IP6SKB_FAKEJUMBO flag is honoured in ip6_finish_output_gso().
	 */
	if (unlikely(seg_len > IPV6_MAXPLEN)) {
		hop_jumbo = __skb_push(skb, hoplen);

		hop_jumbo->nexthdr = proto;
		hop_jumbo->hdrlen = 0;
		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
		hop_jumbo->tlv_len = 4;
		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

		proto = IPPROTO_HOPOPTS;
		seg_len = 0;
		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
	}

	__skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	/* Fall back to the route's hop limit if the socket didn't set one. */
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst6_mtu(dst);
	if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb)) {
			ret = 0;
			goto unlock;
		}

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			      net, (struct sock *)sk, skb, NULL, dev,
			      dst_output);
		goto unlock;
	}

	/* Packet exceeds the MTU and may not be fragmented: report
	 * EMSGSIZE to the socket and drop.
	 */
	ret = -EMSGSIZE;
	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_xmit);
392 
/* Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain with a matching alert value @sel.  All but the last
 * matching socket receive a clone; the last one consumes @skb itself.
 * Returns 1 if the packet was delivered (consumed), 0 otherwise.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {

			/* RTALERT_ISOLATE restricts delivery to sockets in
			 * the packet's own netns.
			 */
			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
426 
/* Decide what to do with a packet destined to a proxied (pneigh)
 * address.  Returns 1 if it is a neighbour-discovery message that must
 * be delivered locally, -1 if it must be discarded (a link-local
 * destination cannot be proxied), and 0 to forward normally.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Skip extension headers to find the transport protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Need at least the ICMPv6 type byte in linear data. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
478 
/* Final step of the forwarding path, invoked by the NF_INET_FORWARD
 * hook: hand the packet to dst_output().  Packets already forwarded in
 * hardware (offload_l3_fwd_mark) are consumed without retransmission.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	/* A stale receive timestamp must not leak into the xmit path. */
	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}
492 
493 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
494 {
495 	if (skb->len <= mtu)
496 		return false;
497 
498 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
499 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
500 		return true;
501 
502 	if (skb->ignore_df)
503 		return false;
504 
505 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
506 		return false;
507 
508 	return true;
509 }
510 
/* Forward an IPv6 packet: check that forwarding is enabled and the
 * packet is forwardable, deliver Router Alert packets to interested
 * raw sockets, handle proxy NDP, send ICMP errors / redirects as
 * appropriate, enforce the path MTU, decrement the hop limit and hand
 * off to the NF_INET_FORWARD hook with ip6_forward_finish() as the
 * continuation.  Consumes @skb; returns 0 or a negative errno.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst_dev(dst));
	struct net_device *dev;
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	/* Forwarding must be enabled globally or forced on the ingress
	 * device; idev here belongs to the *input* interface (iif).
	 */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	/* xfrm6_route_forward() may have replaced the dst: re-read it. */
	dst = skb_dst(skb);
	dev = dst_dev(dst);
	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		rcu_read_unlock();
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (unlikely(ip6_pkt_too_big(skb, mtu))) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	/* Make the header writable before mangling the hop limit. */
	if (skb_cow(skb, dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}
691 
/* Copy per-packet metadata from @from to @to so that fragments built
 * from @from are routed, scheduled and filtered exactly like the
 * original packet.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Replace any dst on @to with a fresh reference to @from's dst. */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}
711 
/* Prepare fast-path fragmentation of @skb over its existing frag_list.
 * Saves a copy of the per-fragment headers (length @hlen) in
 * iter->tmp_hdr, detaches the frag list into @iter, and rewrites @skb
 * itself into the first fragment (offset 0, IP6_MF set).
 * Returns 0 on success or -ENOMEM.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	/* Take ownership of the queued follow-up fragments. */
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Re-push the saved headers with a fragment header inserted
	 * between them and the payload.
	 */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	/* The first fragment carries only the head (linear + page) data. */
	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
752 
/* Turn iter->frag (the next queued fragment) into a complete IPv6
 * fragment: prepend the saved per-fragment headers and a fragment
 * header carrying the running offset, then copy packet metadata from
 * @skb.  MF is set on every fragment except the last.
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* Advance by the payload length of the previous fragment. */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
777 
/* Initialize the slow-path fragmentation cursor @state for @skb.
 * @hlen is the length of the per-fragment (unfragmentable) headers;
 * @mtu is the maximum payload per fragment with those headers and the
 * fragment header already deducted (see ip6_fragment()).  @hdr_room
 * and @needed_tailroom reserve device head/tail space in each
 * allocated fragment.
 */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);
798 
/* Build the next slow-path fragment of @skb as described by @state:
 * allocate a fresh skb, copy in the per-fragment headers and the next
 * chunk of payload, and fill in the fragment header.  Returns the new
 * fragment, or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Patch the copied headers: the byte that used to name the next
	 * extension header now announces the fragment header.
	 */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
874 
/* Split @skb into IPv6 fragments no larger than the path MTU and pass
 * each one to @output.  The fast path reuses an existing, well-formed
 * frag_list; otherwise fragments are built by copying (slow path).
 * Consumes @skb.  Returns 0 on success or a negative errno (the skb
 * and any built fragments are freed on failure).
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	/* Find where the fragment header must be inserted; hlen is the
	 * length of the unfragmentable header part.
	 */
	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	/* Clamp to the socket's requested fragment size, if smaller. */
	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	/* Each fragment must fit the headers plus at least 8 payload bytes. */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	/* A deferred checksum cannot survive the split: resolve it now. */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	/* Re-derive prevhdr: skb_checksum_help() may have reallocated
	 * the header area.
	 */
	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		/* Fast path only when every piece already has fragment-
		 * compatible geometry and nothing is shared/cloned.
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		/* Undo the socket-ownership transfer done above. */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
1062 
1063 static inline int ip6_rt_check(const struct rt6key *rt_key,
1064 			       const struct in6_addr *fl_addr,
1065 			       const struct in6_addr *addr_cache)
1066 {
1067 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1068 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1069 }
1070 
/* Validate a socket's cached @dst against flow @fl6.  Returns @dst if
 * it is still usable for this flow; otherwise releases the reference
 * and returns NULL so the caller performs a fresh route lookup.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A cached dst of another address family cannot be reused here. */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
			 np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
			 np->saddr_cache ? &np->saddr : NULL) ||
#endif
	   (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
1118 
/* Perform the actual route lookup for @fl6, storing the result in *@dst.
 *
 * If no source address was supplied, one is selected first; this may
 * require a preliminary ip6_route_output() call (see comment below).
 * With CONFIG_IPV6_OPTIMISTIC_DAD, a lookup whose nexthop neighbour is
 * not yet valid while the source address is optimistic is redirected to
 * the default router instead.
 *
 * Returns 0 with *@dst holding a reference, or a negative errno with
 * *@dst == NULL.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Second (or first, if a saddr was given) lookup, now with the
	 * selected source address in place.
	 */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* An IPv4-mapped source address is only usable with an
	 * IPv4-mapped (or unspecified) destination.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1235 
1236 /**
1237  *	ip6_dst_lookup - perform route lookup on flow
1238  *	@net: Network namespace to perform lookup in
1239  *	@sk: socket which provides route info
1240  *	@dst: pointer to dst_entry * for result
1241  *	@fl6: flow to lookup
1242  *
1243  *	This function performs a route lookup on the given flow.
1244  *
1245  *	It returns zero on success, or a standard errno code on error.
1246  */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	/* No cached route: force a full lookup in the tail helper. */
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1253 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1254 
1255 /**
1256  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1257  *	@net: Network namespace to perform lookup in
1258  *	@sk: socket which provides route info
1259  *	@fl6: flow to lookup
1260  *	@final_dst: final destination address for ipsec lookup
1261  *
1262  *	This function performs a route lookup on the given flow.
1263  *
1264  *	It returns a valid dst pointer on success, or a pointer encoded
1265  *	error code.
1266  */
1267 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1268 				      const struct in6_addr *final_dst)
1269 {
1270 	struct dst_entry *dst = NULL;
1271 	int err;
1272 
1273 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1274 	if (err)
1275 		return ERR_PTR(err);
1276 	if (final_dst)
1277 		fl6->daddr = *final_dst;
1278 
1279 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1280 }
1281 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1282 
1283 /**
1284  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1285  *	@sk: socket which provides the dst cache and route info
1286  *	@fl6: flow to lookup
1287  *	@final_dst: final destination address for ipsec lookup
1288  *	@connected: whether @sk is connected or not
1289  *
1290  *	This function performs a route lookup on the given flow with the
1291  *	possibility of using the cached route in the socket if it is valid.
1292  *	It will take the socket dst lock when operating on the dst cache.
1293  *	As a result, this function can only be used in process context.
1294  *
1295  *	In addition, for a connected socket, cache the dst in the socket
1296  *	if the current cache is not valid.
1297  *
1298  *	It returns a valid dst pointer on success, or a pointer encoded
1299  *	error code.
1300  */
1301 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1302 					 const struct in6_addr *final_dst,
1303 					 bool connected)
1304 {
1305 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1306 
1307 	dst = ip6_sk_dst_check(sk, dst, fl6);
1308 	if (dst)
1309 		return dst;
1310 
1311 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1312 	if (connected && !IS_ERR(dst))
1313 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1314 
1315 	return dst;
1316 }
1317 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1318 
1319 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1320 					       gfp_t gfp)
1321 {
1322 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1323 }
1324 
1325 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1326 						gfp_t gfp)
1327 {
1328 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1329 }
1330 
1331 static void ip6_append_data_mtu(unsigned int *mtu,
1332 				int *maxfraglen,
1333 				unsigned int fragheaderlen,
1334 				struct sk_buff *skb,
1335 				struct rt6_info *rt,
1336 				unsigned int orig_mtu)
1337 {
1338 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1339 		if (!skb) {
1340 			/* first fragment, reserve header_len */
1341 			*mtu = orig_mtu - rt->dst.header_len;
1342 
1343 		} else {
1344 			/*
1345 			 * this fragment is not first, the headers
1346 			 * space is regarded as data space.
1347 			 */
1348 			*mtu = orig_mtu;
1349 		}
1350 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1351 			      + fragheaderlen - sizeof(struct frag_hdr);
1352 	}
1353 }
1354 
/* Initialise the cork state for a fresh corking cycle.
 *
 * Takes over the caller's reference on @rt (stored first, so
 * ip6_cork_release() can drop it even on error) and deep-copies
 * @ipc6->opt into the cork.  On a partial option copy failure the
 * already-duplicated pieces are freed later by ip6_cork_release().
 *
 * Returns 0 on success or a negative errno.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
	struct inet6_cork *v6_cork = &cork->base6;
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu, frag_size;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (unlikely(opt)) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	v6_cork->dontfrag = ipc6->dontfrag;
	/* With PMTUDISC_PROBE or above, ignore the route MTU and use the
	 * device MTU directly.
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst);
	else
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst));

	/* A socket-configured frag size may shrink the MTU further. */
	frag_size = READ_ONCE(np->frag_size);
	if (frag_size && frag_size < mtu)
		mtu = frag_size;

	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	cork->base.priority = ipc6->sockc.priority;
	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
		cork->base.flags |= IPCORK_TS_OPT_ID;
		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
	}
	cork->base.length = 0;
	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1431 
/* Core append engine shared by ip6_append_data() and ip6_make_skb().
 *
 * Copies @length bytes obtained via @getfrag onto @queue, growing the
 * tail skb or allocating new ones sized against the corked MTU.
 * @transhdrlen is non-zero only for the first chunk of a corking cycle
 * (reserves transport header space); @flags are sendmsg flags
 * (MSG_MORE, MSG_ZEROCOPY, MSG_SPLICE_PAGES, ...).
 *
 * Returns 0 on success or a negative errno; on error, data already
 * placed on @queue is left for the caller to flush.
 */
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags)
{
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct inet6_cork *v6_cork = &cork_full->base6;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	struct sk_buff *skb, *skb_prev = NULL;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey = false, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	/* An empty queue means this is the first chunk of the cycle:
	 * account extension-header and dst header space once.
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	/* GSO packets are built paged and sized against the jumbo limit. */
	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Per-fragment header overhead: IPv6 header, non-fragmentable
	 * extension headers, and any dst-required headers.
	 */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	/* IPV6_DONTFRAG datagram sockets: report the path MTU to the
	 * application instead of fragmenting.
	 */
	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	/* Zerocopy setup: either the caller supplied a ubuf (io_uring
	 * style) or the socket has SOCK_ZEROCOPY set.
	 */
	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
						    false);
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	/* Pick the timestamp key: either a caller-provided opt id or a
	 * fresh per-socket sequence number (released again on error).
	 */
	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
		if (cork->flags & IPCORK_TS_OPT_ID) {
			tskey = cork->ts_opt_id;
		} else {
			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
			hold_tskey = true;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				/* Paged build: only headers live in the
				 * linear area, the rest goes into frags.
				 */
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduces to -fraggap.
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			/* First chunk may sleep against sndbuf; later chunks
			 * are bounded by the 2*sndbuf wmem check instead.
			 */
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			/* Move the 8-byte-alignment overhang from the
			 * previous skb into this one, fixing up checksums.
			 */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					   from, data + transhdrlen, offset,
					   copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		/* Four ways to add the data to the current skb: linear
		 * tailroom, spliced pages, page-frag coalescing, or
		 * zerocopy from the caller's iterator.
		 */
		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, skb_put(skb, copy),
					    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
			if (err < 0)
				goto error;
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
				    from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	/* Charge all new truesize to the socket in one go. */
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}
1839 
1840 int ip6_append_data(struct sock *sk,
1841 		    int getfrag(void *from, char *to, int offset, int len,
1842 				int odd, struct sk_buff *skb),
1843 		    void *from, size_t length, int transhdrlen,
1844 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1845 		    struct rt6_info *rt, unsigned int flags)
1846 {
1847 	struct inet_sock *inet = inet_sk(sk);
1848 	int exthdrlen;
1849 	int err;
1850 
1851 	if (flags&MSG_PROBE)
1852 		return 0;
1853 	if (skb_queue_empty(&sk->sk_write_queue)) {
1854 		/*
1855 		 * setup for corking
1856 		 */
1857 		dst_hold(&rt->dst);
1858 		err = ip6_setup_cork(sk, &inet->cork,
1859 				     ipc6, rt);
1860 		if (err)
1861 			return err;
1862 
1863 		inet->cork.fl.u.ip6 = *fl6;
1864 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1865 		length += exthdrlen;
1866 		transhdrlen += exthdrlen;
1867 	} else {
1868 		transhdrlen = 0;
1869 	}
1870 
1871 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1872 				 sk_page_frag(sk), getfrag,
1873 				 from, length, transhdrlen, flags);
1874 }
1875 EXPORT_SYMBOL_GPL(ip6_append_data);
1876 
1877 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1878 {
1879 	struct dst_entry *dst = cork->base.dst;
1880 
1881 	cork->base.dst = NULL;
1882 	skb_dst_set(skb, dst);
1883 }
1884 
1885 static void ip6_cork_release(struct inet_cork_full *cork)
1886 {
1887 	struct inet6_cork *v6_cork = &cork->base6;
1888 
1889 	if (unlikely(v6_cork->opt)) {
1890 		struct ipv6_txoptions *opt = v6_cork->opt;
1891 
1892 		kfree(opt->dst0opt);
1893 		kfree(opt->dst1opt);
1894 		kfree(opt->hopopt);
1895 		kfree(opt->srcrt);
1896 		kfree(opt);
1897 		v6_cork->opt = NULL;
1898 	}
1899 
1900 	if (cork->base.dst) {
1901 		dst_release(cork->base.dst);
1902 		cork->base.dst = NULL;
1903 	}
1904 }
1905 
/* Assemble all skbs queued on @queue into one packet: chunks after the
 * first become frag_list members, extension headers and the IPv6
 * header are pushed, and the cork's dst reference moves onto the skb.
 * Cork state is released before returning.
 *
 * Returns the finished skb, or NULL if the queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining chunks onto the head skb's frag_list,
	 * transferring their length/truesize accounting.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	/* Push extension headers; pushing non-fragmentable options may
	 * rewrite final_dst (and proto) for the IPv6 header below.
	 */
	final_dst = &fl6->daddr;
	opt = cork->base6.opt;
	if (unlikely(opt)) {
		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &final_dst, &fl6->saddr);
	}
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, cork->base6.tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = cork->base6.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = cork->base.priority;
	skb->mark = cork->base.mark;
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (unlikely(proto == IPPROTO_ICMPV6)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		   !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork);
out:
	return skb;
}
1990 
1991 int ip6_send_skb(struct sk_buff *skb)
1992 {
1993 	struct net *net = sock_net(skb->sk);
1994 	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
1995 	int err;
1996 
1997 	rcu_read_lock();
1998 	err = ip6_local_out(net, skb->sk, skb);
1999 	if (err) {
2000 		if (err > 0)
2001 			err = net_xmit_errno(err);
2002 		if (err)
2003 			IP6_INC_STATS(net, rt->rt6i_idev,
2004 				      IPSTATS_MIB_OUTDISCARDS);
2005 	}
2006 
2007 	rcu_read_unlock();
2008 	return err;
2009 }
2010 
/* Finalise and transmit everything corked on the socket's write queue. */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	/* No pending data produced a packet: nothing to send. */
	return skb ? ip6_send_skb(skb) : 0;
}
2021 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2022 
2023 static void __ip6_flush_pending_frames(struct sock *sk,
2024 				       struct sk_buff_head *queue,
2025 				       struct inet_cork_full *cork)
2026 {
2027 	struct sk_buff *skb;
2028 
2029 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2030 		if (skb_dst(skb))
2031 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2032 				      IPSTATS_MIB_OUTDISCARDS);
2033 		kfree_skb(skb);
2034 	}
2035 
2036 	ip6_cork_release(cork);
2037 }
2038 
/* Discard all data corked on sk->sk_write_queue and release the
 * socket's cork state.
 */
void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork);
}
2044 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2045 
2046 struct sk_buff *ip6_make_skb(struct sock *sk,
2047 			     int getfrag(void *from, char *to, int offset,
2048 					 int len, int odd, struct sk_buff *skb),
2049 			     void *from, size_t length, int transhdrlen,
2050 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2051 			     unsigned int flags, struct inet_cork_full *cork)
2052 {
2053 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2054 	struct sk_buff_head queue;
2055 	int err;
2056 
2057 	if (flags & MSG_PROBE) {
2058 		dst_release(&rt->dst);
2059 		return NULL;
2060 	}
2061 
2062 	__skb_queue_head_init(&queue);
2063 
2064 	cork->base.flags = 0;
2065 	cork->base.addr = 0;
2066 	cork->base.opt = NULL;
2067 	cork->base6.opt = NULL;
2068 	err = ip6_setup_cork(sk, cork, ipc6, rt);
2069 	if (err) {
2070 		ip6_cork_release(cork);
2071 		return ERR_PTR(err);
2072 	}
2073 
2074 	err = __ip6_append_data(sk, &queue, cork,
2075 				&current->task_frag, getfrag, from,
2076 				length + exthdrlen, transhdrlen + exthdrlen,
2077 				flags);
2078 	if (err) {
2079 		__ip6_flush_pending_frames(sk, &queue, cork);
2080 		return ERR_PTR(err);
2081 	}
2082 
2083 	return __ip6_make_skb(sk, &queue, cork);
2084 }
2085