xref: /linux/net/ipv6/ip6_output.c (revision e3cdf6cf5fc6db0643723083e2c70fffe098e249)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
/*
 * Last IPv6 step before the packet is handed to the neighbour layer.
 *
 * In order, this function: grows the skb head when the link-layer headroom is
 * too small, loops multicast packets back to local receivers when required,
 * lets a lightweight tunnel take over transmission, and finally resolves (or
 * creates) the next-hop neighbour entry and calls neigh_output().
 *
 * Returns -ENOMEM if the head cannot be expanded, -EINVAL if no neighbour
 * entry can be created, 0 when the packet is intentionally dropped, the
 * lwtunnel_xmit() result when it is not LWTUNNEL_XMIT_CONTINUE, and otherwise
 * the value returned by neigh_output().
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev_rcu(dst);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* idev stays alive because we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			/* NOTE(review): the skb is not freed here, so
			 * skb_expand_head() presumably consumes it on
			 * failure - confirm.
			 */
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	/* Read the header only now: skb may have been replaced above. */
	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (unlikely(ipv6_addr_is_multicast(daddr))) {
		/* Loop a copy back when the device is not a loopback device,
		 * the socket allows multicast loopback, and either a multicast
		 * routing socket wants a not-yet-forwarded packet or the
		 * device has joined the destination group.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* With a hop limit of zero only the looped-back copy
			 * is delivered; the original is discarded.
			 */
			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		/* Node-local (or narrower) scope is only transmitted on a
		 * loopback device; anywhere else the packet is freed silently.
		 */
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* A lightweight tunnel attached to the route may transmit the packet
	 * itself; only LWTUNNEL_XMIT_CONTINUE falls through to the code below.
	 */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (IS_ERR_OR_NULL(neigh)) {
		/* No cached entry: try to create one for this next hop. */
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	return ret;
}
139 
140 static int
141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142 				    struct sk_buff *skb, unsigned int mtu)
143 {
144 	struct sk_buff *segs, *nskb;
145 	netdev_features_t features;
146 	int ret = 0;
147 
148 	/* Please see corresponding comment in ip_finish_output_gso
149 	 * describing the cases where GSO segment length exceeds the
150 	 * egress MTU.
151 	 */
152 	features = netif_skb_features(skb);
153 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154 	if (IS_ERR_OR_NULL(segs)) {
155 		kfree_skb(skb);
156 		return -ENOMEM;
157 	}
158 
159 	consume_skb(skb);
160 
161 	skb_list_walk_safe(segs, segs, nskb) {
162 		int err;
163 
164 		skb_mark_not_on_list(segs);
165 		/* Last GSO segment can be smaller than gso_size (and MTU).
166 		 * Adding a fragment header would produce an "atomic fragment",
167 		 * which is considered harmful (RFC-8021). Avoid that.
168 		 */
169 		err = segs->len > mtu ?
170 			ip6_fragment(net, sk, segs, ip6_finish_output2) :
171 			ip6_finish_output2(net, sk, segs);
172 		if (err && ret == 0)
173 			ret = err;
174 	}
175 
176 	return ret;
177 }
178 
/*
 * Transmit a GSO skb: when its segments pass the network-length validation
 * for @mtu it is sent as is, otherwise it goes through the software
 * segmentation slow path.
 */
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	if (likely(skb_gso_validate_network_len(skb, mtu)))
		return ip6_finish_output2(net, sk, skb);

	return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
}
187 
188 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
189 {
190 	unsigned int mtu;
191 
192 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
193 	/* Policy lookup after SNAT yielded a new policy */
194 	if (skb_dst(skb)->xfrm) {
195 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
196 		return dst_output(net, sk, skb);
197 	}
198 #endif
199 
200 	mtu = ip6_skb_dst_mtu(skb);
201 	if (skb_is_gso(skb))
202 		return ip6_finish_output_gso(net, sk, skb, mtu);
203 
204 	if (unlikely(skb->len > mtu ||
205 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)))
206 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
207 
208 	return ip6_finish_output2(net, sk, skb);
209 }
210 
211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
212 {
213 	int ret;
214 
215 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
216 	switch (ret) {
217 	case NET_XMIT_SUCCESS:
218 	case NET_XMIT_CN:
219 		return __ip6_finish_output(net, sk, skb) ? : ret;
220 	default:
221 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
222 		return ret;
223 	}
224 }
225 
226 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
227 {
228 	struct dst_entry *dst = skb_dst(skb);
229 	struct net_device *dev, *indev = skb->dev;
230 	struct inet6_dev *idev;
231 	int ret;
232 
233 	skb->protocol = htons(ETH_P_IPV6);
234 	rcu_read_lock();
235 	dev = dst_dev_rcu(dst);
236 	idev = ip6_dst_idev(dst);
237 	skb->dev = dev;
238 
239 	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
240 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
241 		rcu_read_unlock();
242 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
243 		return 0;
244 	}
245 
246 	ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
247 			   net, sk, skb, indev, dev,
248 			   ip6_finish_output,
249 			   !(IP6CB(skb)->flags & IP6SKB_REROUTED));
250 	rcu_read_unlock();
251 	return ret;
252 }
253 EXPORT_SYMBOL(ip6_output);
254 
255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
256 {
257 	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
258 		return ip6_default_np_autolabel(net);
259 	return inet6_test_bit(AUTOFLOWLABEL, sk);
260 }
261 
/*
 * xmit an sk_buff (used by TCP and SCTP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Pushes the extension headers from @opt (if any) and the IPv6 header in
 * front of the payload, fills the header from @fl6, @tclass and the socket's
 * hop limit, sets skb->priority and skb->mark, and passes the packet to the
 * LOCAL_OUT netfilter hook followed by dst_output().
 *
 * Returns the hook/dst_output() result, 0 if l3mdev_ip6_out() took the skb,
 * -ENOBUFS if the head cannot be expanded, or -EMSGSIZE (after reporting the
 * error to the socket) when the packet exceeds the route MTU and is neither
 * GSO nor marked ignore_df.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct net *net = sock_net(sk);
	unsigned int head_room;
	struct net_device *dev;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int ret, hlimit = -1;
	u32 mtu;

	rcu_read_lock();

	/* Headroom needed: link-layer reserve + IPv6 header + all options. */
	dev = dst_dev_rcu(dst);
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* idev stays alive while we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			ret = -ENOBUFS;
			goto unlock;
		}
	}

	if (unlikely(opt)) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		/* Each push returns the protocol value for the header that
		 * precedes it, so proto ends up naming the outermost one.
		 */
		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);

		/* first_hop is passed by reference and may be redirected to a
		 * different address by the push.
		 */
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &first_hop,
						     &fl6->saddr);
	}

	/* A payload longer than IPV6_MAXPLEN is written as length 0. */
	if (unlikely(seg_len > IPV6_MAXPLEN))
		seg_len = 0;

	__skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	/* Hop limit: the socket's value if non-negative, else the route's. */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst6_mtu(dst);
	if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb)) {
			ret = 0;
			goto unlock;
		}

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			      net, (struct sock *)sk, skb, NULL, dev,
			      dst_output);
		goto unlock;
	}

	/* Too big and not allowed to be fragmented: report and drop. */
	ret = -EMSGSIZE;
	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_xmit);
378 
379 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
380 {
381 	struct ip6_ra_chain *ra;
382 	struct sock *last = NULL;
383 
384 	read_lock(&ip6_ra_lock);
385 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
386 		struct sock *sk = ra->sk;
387 		if (sk && ra->sel == sel &&
388 		    (!sk->sk_bound_dev_if ||
389 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
390 
391 			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
392 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
393 				continue;
394 			}
395 			if (last) {
396 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
397 				if (skb2)
398 					rawv6_rcv(last, skb2);
399 			}
400 			last = sk;
401 		}
402 	}
403 
404 	if (last) {
405 		rawv6_rcv(last, skb);
406 		read_unlock(&ip6_ra_lock);
407 		return 1;
408 	}
409 	read_unlock(&ip6_ra_lock);
410 	return 0;
411 }
412 
413 static int ip6_forward_proxy_check(struct sk_buff *skb)
414 {
415 	struct ipv6hdr *hdr = ipv6_hdr(skb);
416 	u8 nexthdr = hdr->nexthdr;
417 	__be16 frag_off;
418 	int offset;
419 
420 	if (ipv6_ext_hdr(nexthdr)) {
421 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
422 		if (offset < 0)
423 			return 0;
424 	} else
425 		offset = sizeof(struct ipv6hdr);
426 
427 	if (nexthdr == IPPROTO_ICMPV6) {
428 		struct icmp6hdr *icmp6;
429 
430 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
431 					 offset + 1 - skb->data)))
432 			return 0;
433 
434 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
435 
436 		switch (icmp6->icmp6_type) {
437 		case NDISC_ROUTER_SOLICITATION:
438 		case NDISC_ROUTER_ADVERTISEMENT:
439 		case NDISC_NEIGHBOUR_SOLICITATION:
440 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
441 		case NDISC_REDIRECT:
442 			/* For reaction involving unicast neighbor discovery
443 			 * message destined to the proxied address, pass it to
444 			 * input function.
445 			 */
446 			return 1;
447 		default:
448 			break;
449 		}
450 	}
451 
452 	/*
453 	 * The proxying router can't forward traffic sent to a link-local
454 	 * address, so signal the sender and discard the packet. This
455 	 * behavior is clarified by the MIPv6 specification.
456 	 */
457 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
458 		dst_link_failure(skb);
459 		return -1;
460 	}
461 
462 	return 0;
463 }
464 
465 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
466 				     struct sk_buff *skb)
467 {
468 #ifdef CONFIG_NET_SWITCHDEV
469 	if (skb->offload_l3_fwd_mark) {
470 		consume_skb(skb);
471 		return 0;
472 	}
473 #endif
474 
475 	skb_clear_tstamp(skb);
476 	return dst_output(net, sk, skb);
477 }
478 
479 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
480 {
481 	if (skb->len <= mtu)
482 		return false;
483 
484 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
485 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
486 		return true;
487 
488 	if (skb->ignore_df)
489 		return false;
490 
491 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
492 		return false;
493 
494 	return true;
495 }
496 
497 int ip6_forward(struct sk_buff *skb)
498 {
499 	struct dst_entry *dst = skb_dst(skb);
500 	struct ipv6hdr *hdr = ipv6_hdr(skb);
501 	struct inet6_skb_parm *opt = IP6CB(skb);
502 	struct net *net = dev_net(dst_dev(dst));
503 	struct net_device *dev;
504 	struct inet6_dev *idev;
505 	SKB_DR(reason);
506 	u32 mtu;
507 
508 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
509 	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
510 	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
511 		goto error;
512 
513 	if (skb->pkt_type != PACKET_HOST)
514 		goto drop;
515 
516 	if (unlikely(skb->sk))
517 		goto drop;
518 
519 	if (skb_warn_if_lro(skb))
520 		goto drop;
521 
522 	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
523 	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
524 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
525 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
526 		goto drop;
527 	}
528 
529 	skb_forward_csum(skb);
530 
531 	/*
532 	 *	We DO NOT make any processing on
533 	 *	RA packets, pushing them to user level AS IS
534 	 *	without ane WARRANTY that application will be able
535 	 *	to interpret them. The reason is that we
536 	 *	cannot make anything clever here.
537 	 *
538 	 *	We are not end-node, so that if packet contains
539 	 *	AH/ESP, we cannot make anything.
540 	 *	Defragmentation also would be mistake, RA packets
541 	 *	cannot be fragmented, because there is no warranty
542 	 *	that different fragments will go along one path. --ANK
543 	 */
544 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
545 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
546 			return 0;
547 	}
548 
549 	/*
550 	 *	check and decrement ttl
551 	 */
552 	if (hdr->hop_limit <= 1) {
553 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
554 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
555 
556 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
557 		return -ETIMEDOUT;
558 	}
559 
560 	/* XXX: idev->cnf.proxy_ndp? */
561 	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
562 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
563 		int proxied = ip6_forward_proxy_check(skb);
564 		if (proxied > 0) {
565 			/* It's tempting to decrease the hop limit
566 			 * here by 1, as we do at the end of the
567 			 * function too.
568 			 *
569 			 * But that would be incorrect, as proxying is
570 			 * not forwarding.  The ip6_input function
571 			 * will handle this packet locally, and it
572 			 * depends on the hop limit being unchanged.
573 			 *
574 			 * One example is the NDP hop limit, that
575 			 * always has to stay 255, but other would be
576 			 * similar checks around RA packets, where the
577 			 * user can even change the desired limit.
578 			 */
579 			return ip6_input(skb);
580 		} else if (proxied < 0) {
581 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
582 			goto drop;
583 		}
584 	}
585 
586 	if (!xfrm6_route_forward(skb)) {
587 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
588 		SKB_DR_SET(reason, XFRM_POLICY);
589 		goto drop;
590 	}
591 	dst = skb_dst(skb);
592 	dev = dst_dev(dst);
593 	/* IPv6 specs say nothing about it, but it is clear that we cannot
594 	   send redirects to source routed frames.
595 	   We don't send redirects to frames decapsulated from IPsec.
596 	 */
597 	if (IP6CB(skb)->iif == dev->ifindex &&
598 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
599 		struct in6_addr *target = NULL;
600 		struct inet_peer *peer;
601 		struct rt6_info *rt;
602 
603 		/*
604 		 *	incoming and outgoing devices are the same
605 		 *	send a redirect.
606 		 */
607 
608 		rt = dst_rt6_info(dst);
609 		if (rt->rt6i_flags & RTF_GATEWAY)
610 			target = &rt->rt6i_gateway;
611 		else
612 			target = &hdr->daddr;
613 
614 		rcu_read_lock();
615 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);
616 
617 		/* Limit redirects both by destination (here)
618 		   and by source (inside ndisc_send_redirect)
619 		 */
620 		if (inet_peer_xrlim_allow(peer, 1*HZ))
621 			ndisc_send_redirect(skb, target);
622 		rcu_read_unlock();
623 	} else {
624 		int addrtype = ipv6_addr_type(&hdr->saddr);
625 
626 		/* This check is security critical. */
627 		if (addrtype == IPV6_ADDR_ANY ||
628 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
629 			goto error;
630 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
631 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
632 				    ICMPV6_NOT_NEIGHBOUR, 0);
633 			goto error;
634 		}
635 	}
636 
637 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
638 
639 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
640 	if (mtu < IPV6_MIN_MTU)
641 		mtu = IPV6_MIN_MTU;
642 
643 	if (unlikely(ip6_pkt_too_big(skb, mtu))) {
644 		/* Again, force OUTPUT device used as source address */
645 		skb->dev = dev;
646 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
647 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
648 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
649 				IPSTATS_MIB_FRAGFAILS);
650 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
651 		return -EMSGSIZE;
652 	}
653 
654 	if (skb_cow(skb, dev->hard_header_len)) {
655 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
656 				IPSTATS_MIB_OUTDISCARDS);
657 		goto drop;
658 	}
659 
660 	hdr = ipv6_hdr(skb);
661 
662 	/* Mangling hops number delayed to point after skb COW */
663 
664 	hdr->hop_limit--;
665 
666 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
667 		       net, NULL, skb, skb->dev, dev,
668 		       ip6_forward_finish);
669 
670 error:
671 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
672 	SKB_DR_SET(reason, IP_INADDRERRORS);
673 drop:
674 	kfree_skb_reason(skb, reason);
675 	return -EINVAL;
676 }
677 
678 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
679 {
680 	to->pkt_type = from->pkt_type;
681 	to->priority = from->priority;
682 	to->protocol = from->protocol;
683 	skb_dst_drop(to);
684 	skb_dst_set(to, dst_clone(skb_dst(from)));
685 	to->dev = from->dev;
686 	to->mark = from->mark;
687 
688 	skb_copy_hash(to, from);
689 
690 #ifdef CONFIG_NET_SCHED
691 	to->tc_index = from->tc_index;
692 #endif
693 	nf_copy(to, from);
694 	skb_ext_copy(to, from);
695 	skb_copy_secmark(to, from);
696 }
697 
/*
 * Turn @skb, the head of a frag_list chain, into the first IPv6 fragment and
 * set up @iter for walking the remaining skbs of the chain.
 *
 * @skb:     head skb; its frag_list is detached and stored in @iter
 * @hlen:    number of bytes, starting at the network header, that are
 *           replicated in front of every fragment
 * @prevhdr: next-header byte that is overwritten with NEXTHDR_FRAGMENT
 * @nexthdr: value written into the fragment header's nexthdr field
 * @frag_id: identification written into the fragment header
 * @iter:    iterator state initialised here
 *
 * Returns 0 on success or -ENOMEM if the header copy cannot be allocated.
 * On success iter->tmp_hdr is a kmemdup() allocation; ip6_fragment()
 * releases it with kfree().
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	/* Patch the next-header byte first so that the copy taken below
	 * already announces a fragment header.
	 */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	/* Detach the chain; the iterator owns it from here on. */
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Open a gap for the fragment header between the first hlen bytes
	 * and the payload: strip hlen bytes, push the fragment header, push
	 * hlen bytes again and restore them from the saved copy.
	 */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	/* First fragment: offset 0 with the "more fragments" flag set. */
	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	/* With the frag_list detached, the lengths cover only the head skb's
	 * own linear and paged data.
	 */
	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
738 
/*
 * Prepare the next skb of the chain (iter->frag) for transmission as a
 * fragment: prepend the fragment header and the saved copy of the first
 * iter->hlen bytes, fill in the fragment offset and flags, and copy the
 * metadata of @skb, the fragment handled before it.
 *
 * The caller must make sure iter->frag is not NULL.
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	/* The current data start of frag becomes its transport header. */
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* Advance the running offset by the payload bytes carried in @skb. */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	/* Every fragment except the last one carries the MF flag. */
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
763 
764 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
765 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
766 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
767 {
768 	state->prevhdr = prevhdr;
769 	state->nexthdr = nexthdr;
770 	state->frag_id = frag_id;
771 
772 	state->hlen = hlen;
773 	state->mtu = mtu;
774 
775 	state->left = skb->len - hlen;	/* Space per frame */
776 	state->ptr = hlen;		/* Where to start from */
777 
778 	state->hroom = hdr_room;
779 	state->troom = needed_tailroom;
780 
781 	state->offset = 0;
782 }
783 EXPORT_SYMBOL(ip6_frag_init);
784 
/*
 * Allocate and build the next fragment of @skb according to @state.
 *
 * The new skb holds a copy of the first state->hlen bytes of @skb, a fragment
 * header, and up to state->mtu bytes of payload copied from position
 * state->ptr.  @state is advanced by the number of payload bytes consumed.
 *
 * Returns the new fragment, or ERR_PTR(-ENOMEM) if allocation fails.
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	/* Layout: [hlen bytes of headers][fragment header][payload] */
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Rewrite, in the copy, the next-header byte found at the same
	 * offset as prevhdr has in the original.
	 */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	/* MF is set while payload remains after this fragment. */
	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
860 
/*
 * Fragment @skb and send every fragment through @output.
 *
 * Two strategies are used.  When @skb already carries a frag_list whose
 * geometry fits (sizes, 8-byte alignment, headroom, nothing cloned or
 * shared), the existing skbs are turned into fragments in place.  Otherwise
 * each fragment is freshly allocated and the data is copied into it.
 *
 * @skb is always consumed.  Returns 0 on success, -EAFNOSUPPORT when the
 * IPv6 module is disabled, -EMSGSIZE (after sending an ICMPv6 Packet Too Big)
 * when fragmentation is not allowed or not possible, or the error reported by
 * a helper or by @output.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	/* Socket settings are only consulted outside of device recursion. */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	if (!ipv6_mod_enabled()) {
		kfree_skb(skb);
		return -EAFNOSUPPORT;
	}

	/* hlen: length of the part that is replicated in every fragment. */
	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	/* Keep an offset as well: prevhdr is recomputed from it further down,
	 * after skb_checksum_help() may have run.
	 */
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	/* A per-socket fragment size may lower the limit further. */
	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	/* Each fragment needs room for at least 8 bytes of payload. */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	/* From here on mtu is the payload capacity of one fragment. */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	/* Resolve a pending partial checksum before the data is split. */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		/* The head skb must fit, be 8-byte aligned, be unshared and
		 * have room for the fragment header.
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			/* Give the fragment the head's socket and move its
			 * truesize out of the head's accounting.
			 */
			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		/* Free the fragments that were not passed to @output. */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		/* Undo the ownership and truesize changes made to the
		 * fragments visited before the one that failed the checks.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	/* All data was copied into the fragments; release the original. */
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_fragment);
1054 
1055 static inline int ip6_rt_check(const struct rt6key *rt_key,
1056 			       const struct in6_addr *fl_addr,
1057 			       const struct in6_addr *addr_cache)
1058 {
1059 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1060 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1061 }
1062 
1063 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1064 					  struct dst_entry *dst,
1065 					  const struct flowi6 *fl6)
1066 {
1067 	struct ipv6_pinfo *np = inet6_sk(sk);
1068 	struct rt6_info *rt;
1069 
1070 	if (!dst)
1071 		goto out;
1072 
1073 	if (dst->ops->family != AF_INET6) {
1074 		dst_release(dst);
1075 		return NULL;
1076 	}
1077 
1078 	rt = dst_rt6_info(dst);
1079 	/* Yes, checking route validity in not connected
1080 	 * case is not very simple. Take into account,
1081 	 * that we do not support routing by source, TOS,
1082 	 * and MSG_DONTROUTE		--ANK (980726)
1083 	 *
1084 	 * 1. ip6_rt_check(): If route was host route,
1085 	 *    check that cached destination is current.
1086 	 *    If it is network route, we still may
1087 	 *    check its validity using saved pointer
1088 	 *    to the last used address: daddr_cache.
1089 	 *    We do not want to save whole address now,
1090 	 *    (because main consumer of this service
1091 	 *    is tcp, which has not this problem),
1092 	 *    so that the last trick works only on connected
1093 	 *    sockets.
1094 	 * 2. oif also should be the same.
1095 	 */
1096 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
1097 			 np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
1098 #ifdef CONFIG_IPV6_SUBTREES
1099 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
1100 			 np->saddr_cache ? &np->saddr : NULL) ||
1101 #endif
1102 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
1103 		dst_release(dst);
1104 		dst = NULL;
1105 	}
1106 
1107 out:
1108 	return dst;
1109 }
1110 
1111 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1112 			       struct dst_entry **dst, struct flowi6 *fl6)
1113 {
1114 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1115 	struct neighbour *n;
1116 	struct rt6_info *rt;
1117 #endif
1118 	int err;
1119 	int flags = 0;
1120 
1121 	/* The correct way to handle this would be to do
1122 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1123 	 * the route-specific preferred source forces the
1124 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1125 	 *
1126 	 * In source specific routing (no src=any default route),
1127 	 * ip6_route_output will fail given src=any saddr, though, so
1128 	 * that's why we try it again later.
1129 	 */
1130 	if (ipv6_addr_any(&fl6->saddr)) {
1131 		struct fib6_info *from;
1132 		struct rt6_info *rt;
1133 
1134 		*dst = ip6_route_output(net, sk, fl6);
1135 		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);
1136 
1137 		rcu_read_lock();
1138 		from = rt ? rcu_dereference(rt->from) : NULL;
1139 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1140 					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
1141 					  fl6->flowi6_l3mdev,
1142 					  &fl6->saddr);
1143 		rcu_read_unlock();
1144 
1145 		if (err)
1146 			goto out_err_release;
1147 
1148 		/* If we had an erroneous initial result, pretend it
1149 		 * never existed and let the SA-enabled version take
1150 		 * over.
1151 		 */
1152 		if ((*dst)->error) {
1153 			dst_release(*dst);
1154 			*dst = NULL;
1155 		}
1156 
1157 		if (fl6->flowi6_oif)
1158 			flags |= RT6_LOOKUP_F_IFACE;
1159 	}
1160 
1161 	if (!*dst)
1162 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1163 
1164 	err = (*dst)->error;
1165 	if (err)
1166 		goto out_err_release;
1167 
1168 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1169 	/*
1170 	 * Here if the dst entry we've looked up
1171 	 * has a neighbour entry that is in the INCOMPLETE
1172 	 * state and the src address from the flow is
1173 	 * marked as OPTIMISTIC, we release the found
1174 	 * dst entry and replace it instead with the
1175 	 * dst entry of the nexthop router
1176 	 */
1177 	rt = dst_rt6_info(*dst);
1178 	rcu_read_lock();
1179 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1180 				      rt6_nexthop(rt, &fl6->daddr));
1181 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1182 	rcu_read_unlock();
1183 
1184 	if (err) {
1185 		struct inet6_ifaddr *ifp;
1186 		struct flowi6 fl_gw6;
1187 		int redirect;
1188 
1189 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1190 				      (*dst)->dev, 1);
1191 
1192 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1193 		if (ifp)
1194 			in6_ifa_put(ifp);
1195 
1196 		if (redirect) {
1197 			/*
1198 			 * We need to get the dst entry for the
1199 			 * default router instead
1200 			 */
1201 			dst_release(*dst);
1202 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1203 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1204 			*dst = ip6_route_output(net, sk, &fl_gw6);
1205 			err = (*dst)->error;
1206 			if (err)
1207 				goto out_err_release;
1208 		}
1209 	}
1210 #endif
1211 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1212 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1213 		err = -EAFNOSUPPORT;
1214 		goto out_err_release;
1215 	}
1216 
1217 	return 0;
1218 
1219 out_err_release:
1220 	dst_release(*dst);
1221 	*dst = NULL;
1222 
1223 	if (err == -ENETUNREACH)
1224 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1225 	return err;
1226 }
1227 
1228 /**
1229  *	ip6_dst_lookup - perform route lookup on flow
1230  *	@net: Network namespace to perform lookup in
1231  *	@sk: socket which provides route info
1232  *	@dst: pointer to dst_entry * for result
1233  *	@fl6: flow to lookup
1234  *
1235  *	This function performs a route lookup on the given flow.
1236  *
1237  *	It returns zero on success, or a standard errno code on error.
1238  */
1239 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1240 		   struct flowi6 *fl6)
1241 {
1242 	*dst = NULL;
1243 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1244 }
1245 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1246 
1247 /**
1248  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1249  *	@net: Network namespace to perform lookup in
1250  *	@sk: socket which provides route info
1251  *	@fl6: flow to lookup
1252  *	@final_dst: final destination address for ipsec lookup
1253  *
1254  *	This function performs a route lookup on the given flow.
1255  *
1256  *	It returns a valid dst pointer on success, or a pointer encoded
1257  *	error code.
1258  */
1259 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1260 				      const struct in6_addr *final_dst)
1261 {
1262 	struct dst_entry *dst = NULL;
1263 	int err;
1264 
1265 	if (!ipv6_mod_enabled())
1266 		return ERR_PTR(-EAFNOSUPPORT);
1267 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1268 	if (err)
1269 		return ERR_PTR(err);
1270 	if (final_dst)
1271 		fl6->daddr = *final_dst;
1272 
1273 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1274 }
1275 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1276 
1277 /**
1278  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1279  *	@sk: socket which provides the dst cache and route info
1280  *	@fl6: flow to lookup
1281  *	@final_dst: final destination address for ipsec lookup
1282  *	@connected: whether @sk is connected or not
1283  *
1284  *	This function performs a route lookup on the given flow with the
1285  *	possibility of using the cached route in the socket if it is valid.
1286  *	It will take the socket dst lock when operating on the dst cache.
1287  *	As a result, this function can only be used in process context.
1288  *
1289  *	In addition, for a connected socket, cache the dst in the socket
1290  *	if the current cache is not valid.
1291  *
1292  *	It returns a valid dst pointer on success, or a pointer encoded
1293  *	error code.
1294  */
1295 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1296 					 const struct in6_addr *final_dst,
1297 					 bool connected)
1298 {
1299 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1300 
1301 	dst = ip6_sk_dst_check(sk, dst, fl6);
1302 	if (dst)
1303 		return dst;
1304 
1305 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1306 	if (connected && !IS_ERR(dst))
1307 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1308 
1309 	return dst;
1310 }
1311 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1312 
1313 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1314 					       gfp_t gfp)
1315 {
1316 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1317 }
1318 
1319 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1320 						gfp_t gfp)
1321 {
1322 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1323 }
1324 
1325 static void ip6_append_data_mtu(unsigned int *mtu,
1326 				int *maxfraglen,
1327 				unsigned int fragheaderlen,
1328 				struct sk_buff *skb,
1329 				struct rt6_info *rt,
1330 				unsigned int orig_mtu)
1331 {
1332 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1333 		if (!skb) {
1334 			/* first fragment, reserve header_len */
1335 			*mtu = orig_mtu - rt->dst.header_len;
1336 
1337 		} else {
1338 			/*
1339 			 * this fragment is not first, the headers
1340 			 * space is regarded as data space.
1341 			 */
1342 			*mtu = orig_mtu;
1343 		}
1344 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1345 			      + fragheaderlen - sizeof(struct frag_hdr);
1346 	}
1347 }
1348 
1349 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1350 			  struct ipcm6_cookie *ipc6,
1351 			  struct rt6_info *rt)
1352 {
1353 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1354 	struct inet6_cork *v6_cork = &cork->base6;
1355 	struct ipv6_pinfo *np = inet6_sk(sk);
1356 	unsigned int mtu, frag_size;
1357 
1358 	/* callers pass dst together with a reference, set it first so
1359 	 * ip6_cork_release() can put it down even in case of an error.
1360 	 */
1361 	cork->base.dst = &rt->dst;
1362 
1363 	/*
1364 	 * setup for corking
1365 	 */
1366 	if (unlikely(opt)) {
1367 		if (WARN_ON(v6_cork->opt))
1368 			return -EINVAL;
1369 
1370 		nopt = v6_cork->opt = kzalloc_obj(*opt, sk->sk_allocation);
1371 		if (unlikely(!nopt))
1372 			return -ENOBUFS;
1373 
1374 		nopt->tot_len = sizeof(*opt);
1375 		nopt->opt_flen = opt->opt_flen;
1376 		nopt->opt_nflen = opt->opt_nflen;
1377 
1378 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1379 		if (opt->dst0opt && !nopt->dst0opt)
1380 			return -ENOBUFS;
1381 
1382 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1383 		if (opt->dst1opt && !nopt->dst1opt)
1384 			return -ENOBUFS;
1385 
1386 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1387 		if (opt->hopopt && !nopt->hopopt)
1388 			return -ENOBUFS;
1389 
1390 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1391 		if (opt->srcrt && !nopt->srcrt)
1392 			return -ENOBUFS;
1393 
1394 		/* need source address above miyazawa*/
1395 	}
1396 	v6_cork->hop_limit = ipc6->hlimit;
1397 	v6_cork->tclass = ipc6->tclass;
1398 	v6_cork->dontfrag = ipc6->dontfrag;
1399 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1400 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1401 		      READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst);
1402 	else
1403 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1404 			READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst));
1405 
1406 	frag_size = READ_ONCE(np->frag_size);
1407 	if (frag_size && frag_size < mtu)
1408 		mtu = frag_size;
1409 
1410 	cork->base.fragsize = mtu;
1411 	cork->base.gso_size = ipc6->gso_size;
1412 	cork->base.tx_flags = 0;
1413 	cork->base.mark = ipc6->sockc.mark;
1414 	cork->base.priority = ipc6->sockc.priority;
1415 	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
1416 	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
1417 		cork->base.flags |= IPCORK_TS_OPT_ID;
1418 		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
1419 	}
1420 	cork->base.length = 0;
1421 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1422 
1423 	return 0;
1424 }
1425 
/*
 * Append @length bytes to the skbs queued on @queue, allocating new skbs
 * whenever the tail skb has no room left.
 *
 * @sk:          socket the skbs are charged to
 * @queue:       list of pending skbs; the tail one is filled first
 * @cork_full:   cork state; provides the route (base.dst), fragsize,
 *               gso_size, the running total length, the corked tx
 *               options and the flow
 * @pfrag:       page fragment used when data is copied into page frags
 * @getfrag:     callback that copies payload bytes into the skb
 * @from:        opaque source handed to @getfrag; it is used as a
 *               struct msghdr in the MSG_ZEROCOPY and MSG_SPLICE_PAGES
 *               branches
 * @length:      number of bytes to append, @transhdrlen included
 * @transhdrlen: bytes at the start of the first skb's payload that are
 *               counted in @length but not copied by @getfrag; reset to
 *               0 once the first skb has been built
 * @flags:       MSG_* flags of the send call
 *
 * Returns 0 on success or a negative errno. On the paths that reach the
 * error label, cork->length is reduced by the bytes not appended,
 * IPSTATS_MIB_OUTDISCARDS is incremented and a timestamp key taken from
 * sk->sk_tskey is given back.
 */
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags)
{
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct inet6_cork *v6_cork = &cork_full->base6;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	struct sk_buff *skb, *skb_prev = NULL;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey = false, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	/* An empty queue means the first skb is about to be built; only
	 * then are exthdrlen and dst_exthdrlen non-zero.
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	/* With a gso_size set, data goes to page frags and the size limit
	 * is IP6_MAX_MTU instead of the corked fragsize.
	 */
	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Header bytes placed at the front of every skb built here. */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	/* fragheaderlen plus opt_flen when options are present. */
	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	/* Reject an mtu for which the unsigned maxfraglen computation
	 * below would not yield a positive value.
	 */
	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	/* Payload after fragheaderlen is rounded down to a multiple of 8
	 * and room for a fragment header is taken off.
	 */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	/* With dontfrag set, UDP, ICMPv6 and raw sockets get a path mtu
	 * notification and an error instead of a fragmented send.
	 */
	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		/* max_t(int, ...) clamps a negative result to 0. */
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			/* A ubuf supplied by the caller must be the one
			 * already attached to the tail skb, if any.
			 */
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
						    false);
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				/* Data will be copied: clear the zerocopy
				 * mark on the uarg and attach it to the
				 * current tail skb.
				 */
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	/* Pick the timestamp key: the one supplied through the cork, or
	 * the next value of sk->sk_tskey. The latter is remembered in
	 * hold_tskey so that the error path can undo the increment.
	 */
	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
		if (cork->flags & IPCORK_TS_OPT_ID) {
			tskey = cork->ts_opt_id;
		} else {
			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
			hold_tskey = true;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			/* fraggap: bytes of the current skb beyond
			 * maxfraglen; they are moved to the new skb below.
			 */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			/* Choose the linear allocation size: a full mtu
			 * when more data is expected and the device lacks
			 * NETIF_F_SG, the whole fragment when it is to be
			 * linear, or only the headers when the payload
			 * goes to page frags (pagedlen).
			 */
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduce to -fraggap.
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			/* The skb that carries transhdrlen is allocated
			 * with sock_alloc_send_skb(); others use a plain
			 * alloc_skb() bounded by twice sk_sndbuf.
			 */
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the excess bytes from the previous
				 * skb into this one, adjust both checksums
				 * and trim the previous skb to maxfraglen.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					   from, data + transhdrlen, offset,
					   copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				/* Nothing is accounted as copied here; the
				 * splice branch of the loop below attaches
				 * the data.
				 */
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			/* An skb that has no destructor yet is charged to
			 * the socket here; the charge is accumulated in
			 * wmem_alloc_delta and applied on return.
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		/* Four ways to add data to the tail skb: linear copy into
		 * tailroom (device without NETIF_F_SG), page splicing,
		 * copy into the socket page frag, or zerocopy.
		 */
		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, skb_put(skb, copy),
					    offset, copy, off, skb) < 0) {
				/* Undo the skb_put() on a failed copy. */
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
			if (err < 0)
				goto error;
			/* A non-negative result is used as the byte count. */
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			/* Start a new, empty frag unless the data can be
			 * coalesced with the last one.
			 */
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
				    from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	/* Take back the part of the length that was not appended. */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	/* skbs queued before the failure stay charged to the socket. */
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}
1833 
1834 int ip6_append_data(struct sock *sk,
1835 		    int getfrag(void *from, char *to, int offset, int len,
1836 				int odd, struct sk_buff *skb),
1837 		    void *from, size_t length, int transhdrlen,
1838 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1839 		    struct rt6_info *rt, unsigned int flags)
1840 {
1841 	struct inet_sock *inet = inet_sk(sk);
1842 	int exthdrlen;
1843 	int err;
1844 
1845 	if (flags&MSG_PROBE)
1846 		return 0;
1847 	if (skb_queue_empty(&sk->sk_write_queue)) {
1848 		/*
1849 		 * setup for corking
1850 		 */
1851 		dst_hold(&rt->dst);
1852 		err = ip6_setup_cork(sk, &inet->cork,
1853 				     ipc6, rt);
1854 		if (err)
1855 			return err;
1856 
1857 		inet->cork.fl.u.ip6 = *fl6;
1858 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1859 		length += exthdrlen;
1860 		transhdrlen += exthdrlen;
1861 	} else {
1862 		transhdrlen = 0;
1863 	}
1864 
1865 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1866 				 sk_page_frag(sk), getfrag,
1867 				 from, length, transhdrlen, flags);
1868 }
1869 EXPORT_SYMBOL_GPL(ip6_append_data);
1870 
1871 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1872 {
1873 	struct dst_entry *dst = cork->base.dst;
1874 
1875 	cork->base.dst = NULL;
1876 	skb_dst_set(skb, dst);
1877 }
1878 
1879 static void ip6_cork_release(struct inet_cork_full *cork)
1880 {
1881 	struct inet6_cork *v6_cork = &cork->base6;
1882 
1883 	if (unlikely(v6_cork->opt)) {
1884 		struct ipv6_txoptions *opt = v6_cork->opt;
1885 
1886 		kfree(opt->dst0opt);
1887 		kfree(opt->dst1opt);
1888 		kfree(opt->hopopt);
1889 		kfree(opt->srcrt);
1890 		kfree(opt);
1891 		v6_cork->opt = NULL;
1892 	}
1893 
1894 	if (cork->base.dst) {
1895 		dst_release(cork->base.dst);
1896 		cork->base.dst = NULL;
1897 	}
1898 }
1899 
1900 struct sk_buff *__ip6_make_skb(struct sock *sk,
1901 			       struct sk_buff_head *queue,
1902 			       struct inet_cork_full *cork)
1903 {
1904 	struct sk_buff *skb, *tmp_skb;
1905 	struct sk_buff **tail_skb;
1906 	struct in6_addr *final_dst;
1907 	struct net *net = sock_net(sk);
1908 	struct ipv6hdr *hdr;
1909 	struct ipv6_txoptions *opt;
1910 	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
1911 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1912 	unsigned char proto = fl6->flowi6_proto;
1913 
1914 	skb = __skb_dequeue(queue);
1915 	if (!skb)
1916 		goto out;
1917 	tail_skb = &(skb_shinfo(skb)->frag_list);
1918 
1919 	/* move skb->data to ip header from ext header */
1920 	if (skb->data < skb_network_header(skb))
1921 		__skb_pull(skb, skb_network_offset(skb));
1922 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1923 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1924 		*tail_skb = tmp_skb;
1925 		tail_skb = &(tmp_skb->next);
1926 		skb->len += tmp_skb->len;
1927 		skb->data_len += tmp_skb->len;
1928 		skb->truesize += tmp_skb->truesize;
1929 		tmp_skb->destructor = NULL;
1930 		tmp_skb->sk = NULL;
1931 	}
1932 
1933 	/* Allow local fragmentation. */
1934 	skb->ignore_df = ip6_sk_ignore_df(sk);
1935 	__skb_pull(skb, skb_network_header_len(skb));
1936 
1937 	final_dst = &fl6->daddr;
1938 	opt = cork->base6.opt;
1939 	if (unlikely(opt)) {
1940 		if (opt->opt_flen)
1941 			proto = ipv6_push_frag_opts(skb, opt, proto);
1942 		if (opt->opt_nflen)
1943 			proto = ipv6_push_nfrag_opts(skb, opt, proto,
1944 						     &final_dst, &fl6->saddr);
1945 	}
1946 	skb_push(skb, sizeof(struct ipv6hdr));
1947 	skb_reset_network_header(skb);
1948 	hdr = ipv6_hdr(skb);
1949 
1950 	ip6_flow_hdr(hdr, cork->base6.tclass,
1951 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1952 					ip6_autoflowlabel(net, sk), fl6));
1953 	hdr->hop_limit = cork->base6.hop_limit;
1954 	hdr->nexthdr = proto;
1955 	hdr->saddr = fl6->saddr;
1956 	hdr->daddr = *final_dst;
1957 
1958 	skb->priority = cork->base.priority;
1959 	skb->mark = cork->base.mark;
1960 	if (sk_is_tcp(sk))
1961 		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
1962 	else
1963 		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);
1964 
1965 	ip6_cork_steal_dst(skb, cork);
1966 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1967 	if (unlikely(proto == IPPROTO_ICMPV6)) {
1968 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1969 		u8 icmp6_type;
1970 
1971 		if (sk->sk_socket->type == SOCK_RAW &&
1972 		   !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
1973 			icmp6_type = fl6->fl6_icmp_type;
1974 		else
1975 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
1976 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1977 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1978 	}
1979 
1980 	ip6_cork_release(cork);
1981 out:
1982 	return skb;
1983 }
1984 
1985 int ip6_send_skb(struct sk_buff *skb)
1986 {
1987 	struct net *net = sock_net(skb->sk);
1988 	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
1989 	int err;
1990 
1991 	rcu_read_lock();
1992 	err = ip6_local_out(net, skb->sk, skb);
1993 	if (err) {
1994 		if (err > 0)
1995 			err = net_xmit_errno(err);
1996 		if (err)
1997 			IP6_INC_STATS(net, rt->rt6i_idev,
1998 				      IPSTATS_MIB_OUTDISCARDS);
1999 	}
2000 
2001 	rcu_read_unlock();
2002 	return err;
2003 }
2004 
/*
 * Build the packet from the socket's pending data with ip6_finish_skb()
 * and send it. Returns 0 when there was nothing to send, else the result
 * of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2016 
2017 static void __ip6_flush_pending_frames(struct sock *sk,
2018 				       struct sk_buff_head *queue,
2019 				       struct inet_cork_full *cork)
2020 {
2021 	struct sk_buff *skb;
2022 
2023 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2024 		if (skb_dst(skb))
2025 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2026 				      IPSTATS_MIB_OUTDISCARDS);
2027 		kfree_skb(skb);
2028 	}
2029 
2030 	ip6_cork_release(cork);
2031 }
2032 
2033 void ip6_flush_pending_frames(struct sock *sk)
2034 {
2035 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2036 				   &inet_sk(sk)->cork);
2037 }
2038 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2039 
2040 struct sk_buff *ip6_make_skb(struct sock *sk,
2041 			     int getfrag(void *from, char *to, int offset,
2042 					 int len, int odd, struct sk_buff *skb),
2043 			     void *from, size_t length, int transhdrlen,
2044 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2045 			     unsigned int flags, struct inet_cork_full *cork)
2046 {
2047 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2048 	struct sk_buff_head queue;
2049 	int err;
2050 
2051 	if (flags & MSG_PROBE) {
2052 		dst_release(&rt->dst);
2053 		return NULL;
2054 	}
2055 
2056 	__skb_queue_head_init(&queue);
2057 
2058 	cork->base.flags = 0;
2059 	cork->base.addr = 0;
2060 	cork->base.opt = NULL;
2061 	cork->base6.opt = NULL;
2062 	err = ip6_setup_cork(sk, cork, ipc6, rt);
2063 	if (err) {
2064 		ip6_cork_release(cork);
2065 		return ERR_PTR(err);
2066 	}
2067 
2068 	err = __ip6_append_data(sk, &queue, cork,
2069 				&current->task_frag, getfrag, from,
2070 				length + exthdrlen, transhdrlen + exthdrlen,
2071 				flags);
2072 	if (err) {
2073 		__ip6_flush_pending_frames(sk, &queue, cork);
2074 		return ERR_PTR(err);
2075 	}
2076 
2077 	return __ip6_make_skb(sk, &queue, cork);
2078 }
2079