xref: /linux/net/ipv6/ip6_output.c (revision f09fc24dd9a5ec989dfdde7090624924ede6ddc7)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
/*
 * Last step of the IPv6 output path for a single packet.
 *
 * Makes sure there is enough headroom for the link-layer header, loops a
 * copy of multicast packets back to the local stack when required, lets a
 * lightweight tunnel attached to the route take over the transmit, and
 * finally resolves (or creates) the neighbour entry for the next hop and
 * passes the skb to neigh_output().
 *
 * Returns the result of neigh_output() or lwtunnel_xmit(), 0 when the
 * packet is deliberately dropped here, -ENOMEM when the head cannot be
 * expanded and -EINVAL when no neighbour entry could be created.
 */
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst_dev(dst);
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		/* Make sure idev stays alive */
74 		rcu_read_lock();
75 		skb = skb_expand_head(skb, hh_len);
76 		if (!skb) {
77 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
78 			rcu_read_unlock();
79 			return -ENOMEM;
80 		}
81 		rcu_read_unlock();
82 	}
83 
	/* Fetch the header only now: skb may have been replaced above. */
84 	hdr = ipv6_hdr(skb);
85 	daddr = &hdr->daddr;
86 	if (ipv6_addr_is_multicast(daddr)) {
		/*
		 * Loop a clone back to the local stack when the socket allows
		 * multicast loopback and either a multicast routing socket
		 * wants this (not yet forwarded) packet or the device has
		 * joined the destination group.
		 */
87 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
88 		    ((mroute6_is_socket(net, skb) &&
89 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
90 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
91 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
92 
93 			/* Do not check for IFF_ALLMULTI; multicast routing
94 			   is not supported in any case.
95 			 */
96 			if (newskb)
97 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
98 					net, sk, newskb, NULL, newskb->dev,
99 					dev_loopback_xmit);
100 
			/* With hop limit 0 only the looped-back copy is delivered. */
101 			if (hdr->hop_limit == 0) {
102 				IP6_INC_STATS(net, idev,
103 					      IPSTATS_MIB_OUTDISCARDS);
104 				kfree_skb(skb);
105 				return 0;
106 			}
107 		}
108 
109 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		/* Node-local (or narrower) scope is only sent on loopback devices. */
110 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
111 		    !(dev->flags & IFF_LOOPBACK)) {
112 			kfree_skb(skb);
113 			return 0;
114 		}
115 	}
116 
	/* A lightweight tunnel may take over; anything but CONTINUE ends here. */
117 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
118 		int res = lwtunnel_xmit(skb);
119 
120 		if (res != LWTUNNEL_XMIT_CONTINUE)
121 			return res;
122 	}
123 
124 	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
125 
	/* The neighbour is looked up without taking a reference (noref). */
126 	rcu_read_lock();
127 	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
128 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
129 
130 	if (IS_ERR_OR_NULL(neigh)) {
		/* No entry yet: try to create one for this next hop. */
131 		if (unlikely(!neigh))
132 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
133 		if (IS_ERR(neigh)) {
134 			rcu_read_unlock();
135 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
136 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
137 			return -EINVAL;
138 		}
139 	}
140 	sock_confirm_neigh(skb, neigh);
141 	ret = neigh_output(neigh, skb, false);
142 	rcu_read_unlock();
143 	return ret;
144 }
145 
/*
 * Slow path for a GSO skb whose segments do not fit the egress MTU.
 *
 * The skb is segmented in software (all GSO features masked off), the
 * original skb is released, and every resulting segment is sent on its own:
 * segments longer than @mtu go through ip6_fragment(), the others straight
 * to ip6_finish_output2().
 *
 * Returns 0 on success, -ENOMEM when segmentation yields no segments (the
 * skb is freed), otherwise the first non-zero error reported for a segment.
 * The remaining segments are still sent after an error.
 */
146 static int
147 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
148 				    struct sk_buff *skb, unsigned int mtu)
149 {
150 	struct sk_buff *segs, *nskb;
151 	netdev_features_t features;
152 	int ret = 0;
153 
154 	/* Please see corresponding comment in ip_finish_output_gso
155 	 * describing the cases where GSO segment length exceeds the
156 	 * egress MTU.
157 	 */
158 	features = netif_skb_features(skb);
159 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
160 	if (IS_ERR_OR_NULL(segs)) {
161 		kfree_skb(skb);
162 		return -ENOMEM;
163 	}
164 
	/* The segments now carry the data; the original is no longer needed. */
165 	consume_skb(skb);
166 
167 	skb_list_walk_safe(segs, segs, nskb) {
168 		int err;
169 
		/* Detach the segment from the list before handing it on. */
170 		skb_mark_not_on_list(segs);
171 		/* Last GSO segment can be smaller than gso_size (and MTU).
172 		 * Adding a fragment header would produce an "atomic fragment",
173 		 * which is considered harmful (RFC-8021). Avoid that.
174 		 */
175 		err = segs->len > mtu ?
176 			ip6_fragment(net, sk, segs, ip6_finish_output2) :
177 			ip6_finish_output2(net, sk, segs);
		/* Remember only the first failure. */
178 		if (err && ret == 0)
179 			ret = err;
180 	}
181 
182 	return ret;
183 }
184 
185 static int ip6_finish_output_gso(struct net *net, struct sock *sk,
186 				 struct sk_buff *skb, unsigned int mtu)
187 {
188 	if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
189 	    !skb_gso_validate_network_len(skb, mtu))
190 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
191 
192 	return ip6_finish_output2(net, sk, skb);
193 }
194 
195 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
196 {
197 	unsigned int mtu;
198 
199 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
200 	/* Policy lookup after SNAT yielded a new policy */
201 	if (skb_dst(skb)->xfrm) {
202 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
203 		return dst_output(net, sk, skb);
204 	}
205 #endif
206 
207 	mtu = ip6_skb_dst_mtu(skb);
208 	if (skb_is_gso(skb))
209 		return ip6_finish_output_gso(net, sk, skb, mtu);
210 
211 	if (skb->len > mtu ||
212 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
213 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
214 
215 	return ip6_finish_output2(net, sk, skb);
216 }
217 
218 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
219 {
220 	int ret;
221 
222 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
223 	switch (ret) {
224 	case NET_XMIT_SUCCESS:
225 	case NET_XMIT_CN:
226 		return __ip6_finish_output(net, sk, skb) ? : ret;
227 	default:
228 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
229 		return ret;
230 	}
231 }
232 
233 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
234 {
235 	struct dst_entry *dst = skb_dst(skb);
236 	struct net_device *dev = dst_dev(dst), *indev = skb->dev;
237 	struct inet6_dev *idev = ip6_dst_idev(dst);
238 
239 	skb->protocol = htons(ETH_P_IPV6);
240 	skb->dev = dev;
241 
242 	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
243 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
244 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
245 		return 0;
246 	}
247 
248 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
249 			    net, sk, skb, indev, dev,
250 			    ip6_finish_output,
251 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
252 }
253 EXPORT_SYMBOL(ip6_output);
254 
255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
256 {
257 	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
258 		return ip6_default_np_autolabel(net);
259 	return inet6_test_bit(AUTOFLOWLABEL, sk);
260 }
261 
262 /*
263  * xmit an sk_buff (used by TCP and SCTP)
264  * Note : socket lock is not held for SYNACK packets, but might be modified
265  * by calls to skb_set_owner_w() and ipv6_local_error(),
266  * which are using proper atomic operations or spinlocks.
 *
 * Pushes the extension headers from @opt (if any), a hop-by-hop jumbo
 * option when the payload exceeds IPV6_MAXPLEN, and the IPv6 header in
 * front of the data, then sends the packet through the LOCAL_OUT
 * netfilter hook to dst_output().
 *
 * Returns the NF_HOOK() result, 0 when l3mdev_ip6_out() returned no skb,
 * -ENOBUFS when the headroom cannot be expanded, and -EMSGSIZE when the
 * packet exceeds the route MTU and is neither GSO nor marked ignore_df.
267  */
268 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
269 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
270 {
271 	struct net *net = sock_net(sk);
272 	const struct ipv6_pinfo *np = inet6_sk(sk);
273 	struct in6_addr *first_hop = &fl6->daddr;
274 	struct dst_entry *dst = skb_dst(skb);
275 	struct net_device *dev = dst_dev(dst);
276 	struct inet6_dev *idev = ip6_dst_idev(dst);
277 	struct hop_jumbo_hdr *hop_jumbo;
278 	int hoplen = sizeof(*hop_jumbo);
279 	unsigned int head_room;
280 	struct ipv6hdr *hdr;
281 	u8  proto = fl6->flowi6_proto;
282 	int seg_len = skb->len;
283 	int hlimit = -1;
284 	u32 mtu;
285 
	/* Worst case: IPv6 header + jumbo option + link layer (+ options). */
286 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
287 	if (opt)
288 		head_room += opt->opt_nflen + opt->opt_flen;
289 
290 	if (unlikely(head_room > skb_headroom(skb))) {
291 		/* Make sure idev stays alive */
292 		rcu_read_lock();
293 		skb = skb_expand_head(skb, head_room);
294 		if (!skb) {
295 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
296 			rcu_read_unlock();
297 			return -ENOBUFS;
298 		}
299 		rcu_read_unlock();
300 	}
301 
	/*
	 * Headers are pushed innermost first; each push helper updates
	 * proto so the header pushed next can chain to it.
	 */
302 	if (opt) {
303 		seg_len += opt->opt_nflen + opt->opt_flen;
304 
305 		if (opt->opt_flen)
306 			ipv6_push_frag_opts(skb, opt, &proto);
307 
308 		if (opt->opt_nflen)
309 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
310 					     &fl6->saddr);
311 	}
312 
	/*
	 * Payload too large for the 16-bit payload_len field: carry the real
	 * length in a hop-by-hop jumbo option and put 0 in the IPv6 header.
	 */
313 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
314 		hop_jumbo = skb_push(skb, hoplen);
315 
316 		hop_jumbo->nexthdr = proto;
317 		hop_jumbo->hdrlen = 0;
318 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
319 		hop_jumbo->tlv_len = 4;
320 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
321 
322 		proto = IPPROTO_HOPOPTS;
323 		seg_len = 0;
324 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
325 	}
326 
327 	skb_push(skb, sizeof(struct ipv6hdr));
328 	skb_reset_network_header(skb);
329 	hdr = ipv6_hdr(skb);
330 
331 	/*
332 	 *	Fill in the IPv6 header
333 	 */
	/* A negative socket hop limit means "use the route's value". */
334 	if (np)
335 		hlimit = READ_ONCE(np->hop_limit);
336 	if (hlimit < 0)
337 		hlimit = ip6_dst_hoplimit(dst);
338 
339 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
340 				ip6_autoflowlabel(net, sk), fl6));
341 
342 	hdr->payload_len = htons(seg_len);
343 	hdr->nexthdr = proto;
344 	hdr->hop_limit = hlimit;
345 
346 	hdr->saddr = fl6->saddr;
347 	hdr->daddr = *first_hop;
348 
349 	skb->protocol = htons(ETH_P_IPV6);
350 	skb->priority = priority;
351 	skb->mark = mark;
352 
353 	mtu = dst_mtu(dst);
354 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
355 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
356 
357 		/* if egress device is enslaved to an L3 master device pass the
358 		 * skb to its handler for processing
359 		 */
360 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
361 		if (unlikely(!skb))
362 			return 0;
363 
364 		/* hooks should never assume socket lock is held.
365 		 * we promote our socket to non const
366 		 */
367 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
368 			       net, (struct sock *)sk, skb, NULL, dev,
369 			       dst_output);
370 	}
371 
	/* Too big: report EMSGSIZE to the socket and drop the packet. */
372 	skb->dev = dev;
373 	/* ipv6_local_error() does not require socket lock,
374 	 * we promote our socket to non const
375 	 */
376 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
377 
378 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
379 	kfree_skb(skb);
380 	return -EMSGSIZE;
381 }
382 EXPORT_SYMBOL(ip6_xmit);
383 
384 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
385 {
386 	struct ip6_ra_chain *ra;
387 	struct sock *last = NULL;
388 
389 	read_lock(&ip6_ra_lock);
390 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
391 		struct sock *sk = ra->sk;
392 		if (sk && ra->sel == sel &&
393 		    (!sk->sk_bound_dev_if ||
394 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
395 
396 			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
397 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
398 				continue;
399 			}
400 			if (last) {
401 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
402 				if (skb2)
403 					rawv6_rcv(last, skb2);
404 			}
405 			last = sk;
406 		}
407 	}
408 
409 	if (last) {
410 		rawv6_rcv(last, skb);
411 		read_unlock(&ip6_ra_lock);
412 		return 1;
413 	}
414 	read_unlock(&ip6_ra_lock);
415 	return 0;
416 }
417 
418 static int ip6_forward_proxy_check(struct sk_buff *skb)
419 {
420 	struct ipv6hdr *hdr = ipv6_hdr(skb);
421 	u8 nexthdr = hdr->nexthdr;
422 	__be16 frag_off;
423 	int offset;
424 
425 	if (ipv6_ext_hdr(nexthdr)) {
426 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
427 		if (offset < 0)
428 			return 0;
429 	} else
430 		offset = sizeof(struct ipv6hdr);
431 
432 	if (nexthdr == IPPROTO_ICMPV6) {
433 		struct icmp6hdr *icmp6;
434 
435 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
436 					 offset + 1 - skb->data)))
437 			return 0;
438 
439 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
440 
441 		switch (icmp6->icmp6_type) {
442 		case NDISC_ROUTER_SOLICITATION:
443 		case NDISC_ROUTER_ADVERTISEMENT:
444 		case NDISC_NEIGHBOUR_SOLICITATION:
445 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
446 		case NDISC_REDIRECT:
447 			/* For reaction involving unicast neighbor discovery
448 			 * message destined to the proxied address, pass it to
449 			 * input function.
450 			 */
451 			return 1;
452 		default:
453 			break;
454 		}
455 	}
456 
457 	/*
458 	 * The proxying router can't forward traffic sent to a link-local
459 	 * address, so signal the sender and discard the packet. This
460 	 * behavior is clarified by the MIPv6 specification.
461 	 */
462 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
463 		dst_link_failure(skb);
464 		return -1;
465 	}
466 
467 	return 0;
468 }
469 
/*
 * Completion of the FORWARD netfilter hook: clear the skb timestamp and
 * pass the packet to dst_output().  With CONFIG_NET_SWITCHDEV, an skb that
 * carries offload_l3_fwd_mark is consumed here instead and 0 is returned
 * (presumably because the hardware already forwarded it; confirm against
 * the switchdev documentation).
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	bool l3_fwd_marked = skb->offload_l3_fwd_mark;

	if (l3_fwd_marked) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);

	return dst_output(net, sk, skb);
}
483 
484 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
485 {
486 	if (skb->len <= mtu)
487 		return false;
488 
489 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
490 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
491 		return true;
492 
493 	if (skb->ignore_df)
494 		return false;
495 
496 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
497 		return false;
498 
499 	return true;
500 }
501 
/*
 * Forward a received IPv6 packet towards the route attached to the skb.
 *
 * Checks that forwarding is enabled and that the packet is eligible, hands
 * Router Alert packets to interested raw sockets, enforces the hop limit,
 * handles proxied neighbour discovery, applies xfrm policy and routing,
 * optionally sends a redirect, enforces the path MTU and finally
 * decrements the hop limit and runs the FORWARD netfilter hook.
 *
 * Returns 0 when the RA chain consumed the packet, the result of
 * ip6_input() for proxied neighbour discovery, the NF_HOOK() result on the
 * forwarding path, -ETIMEDOUT on hop limit expiry, -EMSGSIZE when the
 * packet is too big, and -EINVAL for every other drop.
 */
502 int ip6_forward(struct sk_buff *skb)
503 {
504 	struct dst_entry *dst = skb_dst(skb);
505 	struct ipv6hdr *hdr = ipv6_hdr(skb);
506 	struct inet6_skb_parm *opt = IP6CB(skb);
507 	struct net *net = dev_net(dst_dev(dst));
508 	struct net_device *dev;
509 	struct inet6_dev *idev;
510 	SKB_DR(reason);
511 	u32 mtu;
512 
	/* idev is the inet6 device of the interface the packet arrived on. */
513 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	/* Forward only if enabled globally or forced on the ingress device. */
514 	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
515 	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
516 		goto error;
517 
518 	if (skb->pkt_type != PACKET_HOST)
519 		goto drop;
520 
521 	if (unlikely(skb->sk))
522 		goto drop;
523 
524 	if (skb_warn_if_lro(skb))
525 		goto drop;
526 
527 	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
528 	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
529 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
530 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
531 		goto drop;
532 	}
533 
534 	skb_forward_csum(skb);
535 
536 	/*
537 	 *	We DO NOT make any processing on
538 	 *	RA packets, pushing them to user level AS IS
539 	 *	without any WARRANTY that application will be able
540 	 *	to interpret them. The reason is that we
541 	 *	cannot make anything clever here.
542 	 *
543 	 *	We are not end-node, so that if packet contains
544 	 *	AH/ESP, we cannot make anything.
545 	 *	Defragmentation also would be mistake, RA packets
546 	 *	cannot be fragmented, because there is no warranty
547 	 *	that different fragments will go along one path. --ANK
548 	 */
549 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
550 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
551 			return 0;
552 	}
553 
554 	/*
555 	 *	check and decrement ttl
556 	 */
557 	if (hdr->hop_limit <= 1) {
558 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
559 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
560 
561 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
562 		return -ETIMEDOUT;
563 	}
564 
565 	/* XXX: idev->cnf.proxy_ndp? */
566 	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
567 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
		/*
		 * NOTE(review): ip6_forward_proxy_check() may call
		 * pskb_may_pull(); if that can move the header, hdr should be
		 * reloaded before it is used further down - confirm.
		 */
568 		int proxied = ip6_forward_proxy_check(skb);
569 		if (proxied > 0) {
570 			/* It's tempting to decrease the hop limit
571 			 * here by 1, as we do at the end of the
572 			 * function too.
573 			 *
574 			 * But that would be incorrect, as proxying is
575 			 * not forwarding.  The ip6_input function
576 			 * will handle this packet locally, and it
577 			 * depends on the hop limit being unchanged.
578 			 *
579 			 * One example is the NDP hop limit, that
580 			 * always has to stay 255, but other would be
581 			 * similar checks around RA packets, where the
582 			 * user can even change the desired limit.
583 			 */
584 			return ip6_input(skb);
585 		} else if (proxied < 0) {
586 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
587 			goto drop;
588 		}
589 	}
590 
591 	if (!xfrm6_route_forward(skb)) {
592 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
593 		SKB_DR_SET(reason, XFRM_POLICY);
594 		goto drop;
595 	}
	/* Re-read the route after xfrm6_route_forward(). */
596 	dst = skb_dst(skb);
597 	dev = dst_dev(dst);
598 	/* IPv6 specs say nothing about it, but it is clear that we cannot
599 	   send redirects to source routed frames.
600 	   We don't send redirects to frames decapsulated from IPsec.
601 	 */
602 	if (IP6CB(skb)->iif == dev->ifindex &&
603 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
604 		struct in6_addr *target = NULL;
605 		struct inet_peer *peer;
606 		struct rt6_info *rt;
607 
608 		/*
609 		 *	incoming and outgoing devices are the same
610 		 *	send a redirect.
611 		 */
612 
		/* Redirect target: the gateway if there is one, else the destination. */
613 		rt = dst_rt6_info(dst);
614 		if (rt->rt6i_flags & RTF_GATEWAY)
615 			target = &rt->rt6i_gateway;
616 		else
617 			target = &hdr->daddr;
618 
619 		rcu_read_lock();
620 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);
621 
622 		/* Limit redirects both by destination (here)
623 		   and by source (inside ndisc_send_redirect)
624 		 */
625 		if (inet_peer_xrlim_allow(peer, 1*HZ))
626 			ndisc_send_redirect(skb, target);
627 		rcu_read_unlock();
628 	} else {
629 		int addrtype = ipv6_addr_type(&hdr->saddr);
630 
631 		/* This check is security critical. */
632 		if (addrtype == IPV6_ADDR_ANY ||
633 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
634 			goto error;
635 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
636 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
637 				    ICMPV6_NOT_NEIGHBOUR, 0);
638 			goto error;
639 		}
640 	}
641 
642 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
643 
	/* Never use less than the IPv6 minimum MTU for the size check. */
644 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
645 	if (mtu < IPV6_MIN_MTU)
646 		mtu = IPV6_MIN_MTU;
647 
648 	if (ip6_pkt_too_big(skb, mtu)) {
649 		/* Again, force OUTPUT device used as source address */
650 		skb->dev = dev;
651 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
652 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
653 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
654 				IPSTATS_MIB_FRAGFAILS);
655 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
656 		return -EMSGSIZE;
657 	}
658 
659 	if (skb_cow(skb, dev->hard_header_len)) {
660 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
661 				IPSTATS_MIB_OUTDISCARDS);
662 		goto drop;
663 	}
664 
	/* Reload the header pointer after skb_cow(). */
665 	hdr = ipv6_hdr(skb);
666 
667 	/* Mangling hops number delayed to point after skb COW */
668 
669 	hdr->hop_limit--;
670 
671 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
672 		       net, NULL, skb, skb->dev, dev,
673 		       ip6_forward_finish);
674 
	/* "error" counts an address error and then falls through to "drop". */
675 error:
676 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
677 	SKB_DR_SET(reason, IP_INADDRERRORS);
678 drop:
679 	kfree_skb_reason(skb, reason);
680 	return -EINVAL;
681 }
682 
683 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
684 {
685 	to->pkt_type = from->pkt_type;
686 	to->priority = from->priority;
687 	to->protocol = from->protocol;
688 	skb_dst_drop(to);
689 	skb_dst_set(to, dst_clone(skb_dst(from)));
690 	to->dev = from->dev;
691 	to->mark = from->mark;
692 
693 	skb_copy_hash(to, from);
694 
695 #ifdef CONFIG_NET_SCHED
696 	to->tc_index = from->tc_index;
697 #endif
698 	nf_copy(to, from);
699 	skb_ext_copy(to, from);
700 	skb_copy_secmark(to, from);
701 }
702 
/*
 * Prepare fast-path fragmentation of an skb that already carries its
 * pieces on a frag_list.
 *
 * Saves a copy of the first @hlen bytes of the network header in
 * iter->tmp_hdr (the caller is responsible for freeing it; ip6_fragment()
 * does so with kfree()), detaches the frag_list into @iter, and turns @skb
 * itself into the first fragment by inserting a fragment header after
 * those @hlen bytes.
 *
 * @prevhdr points at the "next header" byte that must now announce a
 * fragment header; @nexthdr is the value it held before.
 *
 * Returns 0 on success or -ENOMEM if the header copy cannot be allocated.
 */
703 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
704 		      u8 nexthdr, __be32 frag_id,
705 		      struct ip6_fraglist_iter *iter)
706 {
707 	unsigned int first_len;
708 	struct frag_hdr *fh;
709 
710 	/* BUILD HEADER */
711 	*prevhdr = NEXTHDR_FRAGMENT;
	/* The copy is taken after the patch above, so it already chains to
	 * the fragment header.
	 */
712 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
713 	if (!iter->tmp_hdr)
714 		return -ENOMEM;
715 
	/* Take the chained skbs off @skb; they are now owned by the iterator. */
716 	iter->frag = skb_shinfo(skb)->frag_list;
717 	skb_frag_list_init(skb);
718 
719 	iter->offset = 0;
720 	iter->hlen = hlen;
721 	iter->frag_id = frag_id;
722 	iter->nexthdr = nexthdr;
723 
	/*
	 * Open a gap for the fragment header: drop the old headers, push the
	 * fragment header in front of the payload, then put the saved
	 * headers back in front of it.
	 */
724 	__skb_pull(skb, hlen);
725 	fh = __skb_push(skb, sizeof(struct frag_hdr));
726 	__skb_push(skb, hlen);
727 	skb_reset_network_header(skb);
728 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
729 
	/* First fragment: offset 0, "more fragments" set. */
730 	fh->nexthdr = nexthdr;
731 	fh->reserved = 0;
732 	fh->frag_off = htons(IP6_MF);
733 	fh->identification = frag_id;
734 
	/* With the frag_list detached, @skb's length covers only its own data. */
735 	first_len = skb_pagelen(skb);
736 	skb->data_len = first_len - skb_headlen(skb);
737 	skb->len = first_len;
738 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
739 
740 	return 0;
741 }
742 EXPORT_SYMBOL(ip6_fraglist_init);
743 
/*
 * Turn the next skb of the detached frag_list (iter->frag) into a complete
 * fragment: prepend the saved headers and a fragment header, set the
 * fragment offset from the length of the fragment currently in @skb, fix
 * up payload_len and copy the metadata from @skb.
 *
 * Must be called while @skb is still the previously built fragment,
 * because its length determines the new fragment's offset.
 */
744 void ip6_fraglist_prepare(struct sk_buff *skb,
745 			  struct ip6_fraglist_iter *iter)
746 {
747 	struct sk_buff *frag = iter->frag;
748 	unsigned int hlen = iter->hlen;
749 	struct frag_hdr *fh;
750 
751 	frag->ip_summed = CHECKSUM_NONE;
	/* The current data start becomes the transport header. */
752 	skb_reset_transport_header(frag);
753 	fh = __skb_push(frag, sizeof(struct frag_hdr));
754 	__skb_push(frag, hlen);
755 	skb_reset_network_header(frag);
756 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* Advance by the payload carried in the previous fragment. */
757 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
758 	fh->nexthdr = iter->nexthdr;
759 	fh->reserved = 0;
760 	fh->frag_off = htons(iter->offset);
	/* Every fragment but the last carries the "more fragments" flag. */
761 	if (frag->next)
762 		fh->frag_off |= htons(IP6_MF);
763 	fh->identification = iter->frag_id;
764 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
765 	ip6_copy_metadata(frag, skb);
766 }
767 EXPORT_SYMBOL(ip6_fraglist_prepare);
768 
769 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
770 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
771 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
772 {
773 	state->prevhdr = prevhdr;
774 	state->nexthdr = nexthdr;
775 	state->frag_id = frag_id;
776 
777 	state->hlen = hlen;
778 	state->mtu = mtu;
779 
780 	state->left = skb->len - hlen;	/* Space per frame */
781 	state->ptr = hlen;		/* Where to start from */
782 
783 	state->hroom = hdr_room;
784 	state->troom = needed_tailroom;
785 
786 	state->offset = 0;
787 }
788 EXPORT_SYMBOL(ip6_frag_init);
789 
/*
 * Build the next fragment of @skb on the slow path.
 *
 * Allocates a new skb, copies the first state->hlen bytes of the original
 * headers into it, inserts a fragment header and copies up to state->mtu
 * bytes of payload starting at state->ptr.  @state is advanced so that the
 * following call produces the next fragment.
 *
 * Returns the new fragment, or ERR_PTR(-ENOMEM) if allocation fails.
 */
790 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
791 {
792 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
793 	struct sk_buff *frag;
794 	struct frag_hdr *fh;
795 	unsigned int len;
796 
797 	len = state->left;
798 	/* IF: it doesn't fit, use 'mtu' - the data space left */
799 	if (len > state->mtu)
800 		len = state->mtu;
801 	/* IF: we are not sending up to and including the packet end
802 	   then align the next start on an eight byte boundary */
803 	if (len < state->left)
804 		len &= ~7;
805 
806 	/* Allocate buffer */
807 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
808 			 state->hroom + state->troom, GFP_ATOMIC);
809 	if (!frag)
810 		return ERR_PTR(-ENOMEM);
811 
812 	/*
813 	 *	Set up data on packet
814 	 */
815 
816 	ip6_copy_metadata(frag, skb);
817 	skb_reserve(frag, state->hroom);
818 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
819 	skb_reset_network_header(frag);
	/* The fragment header sits right after the copied headers. */
820 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
821 	frag->transport_header = (frag->network_header + state->hlen +
822 				  sizeof(struct frag_hdr));
823 
824 	/*
825 	 *	Charge the memory for the fragment to any owner
826 	 *	it might possess
827 	 */
828 	if (skb->sk)
829 		skb_set_owner_w(frag, skb->sk);
830 
831 	/*
832 	 *	Copy the packet header into the new buffer.
833 	 */
834 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
835 
	/*
	 * Patch, in the copy only, the "next header" byte at the same
	 * offset as prevhdr in the original so that it announces the
	 * fragment header.
	 */
836 	fragnexthdr_offset = skb_network_header(frag);
837 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
838 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
839 
840 	/*
841 	 *	Build fragment header.
842 	 */
843 	fh->nexthdr = state->nexthdr;
844 	fh->reserved = 0;
845 	fh->identification = state->frag_id;
846 
847 	/*
848 	 *	Copy a block of the IP datagram.
849 	 */
850 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
851 			     len));
852 	state->left -= len;
853 
854 	fh->frag_off = htons(state->offset);
	/* More data to come: set the "more fragments" flag. */
855 	if (state->left > 0)
856 		fh->frag_off |= htons(IP6_MF);
857 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
858 
859 	state->ptr += len;
860 	state->offset += len;
861 
862 	return frag;
863 }
864 EXPORT_SYMBOL(ip6_frag_next);
865 
/*
 * Fragment @skb and send every fragment through @output.
 *
 * Two strategies are used:
 *  - fast path: if the skb already has a frag_list whose geometry fits
 *    (sizes, 8-byte alignment, headroom, nothing cloned or shared), each
 *    list member is turned into a fragment in place;
 *  - slow path: otherwise new skbs are allocated and the data is copied
 *    into them piece by piece.
 *
 * Returns 0 on success.  On failure a negative error is returned:
 * -EMSGSIZE (after sending ICMPV6_PKT_TOOBIG) when fragmenting is not
 * allowed or impossible, or the error from a helper or from @output.
 */
866 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
867 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
868 {
869 	struct sk_buff *frag;
870 	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
871 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
872 				inet6_sk(skb->sk) : NULL;
873 	u8 tstamp_type = skb->tstamp_type;
874 	struct ip6_frag_state state;
875 	unsigned int mtu, hlen, nexthdr_offset;
876 	ktime_t tstamp = skb->tstamp;
877 	int hroom, err = 0;
878 	__be32 frag_id;
879 	u8 *prevhdr, nexthdr = 0;
880 
	/* hlen: length of the headers that are repeated in every fragment. */
881 	err = ip6_find_1stfragopt(skb, &prevhdr);
882 	if (err < 0)
883 		goto fail;
884 	hlen = err;
885 	nexthdr = *prevhdr;
	/* Keep an offset: prevhdr is recomputed after the checksum helper. */
886 	nexthdr_offset = prevhdr - skb_network_header(skb);
887 
888 	mtu = ip6_skb_dst_mtu(skb);
889 
890 	/* We must not fragment if the socket is set to force MTU discovery
891 	 * or if the skb is not generated by a local socket.
892 	 */
893 	if (unlikely(!skb->ignore_df && skb->len > mtu))
894 		goto fail_toobig;
895 
896 	if (IP6CB(skb)->frag_max_size) {
897 		if (IP6CB(skb)->frag_max_size > mtu)
898 			goto fail_toobig;
899 
900 		/* don't send fragments larger than what we received */
901 		mtu = IP6CB(skb)->frag_max_size;
902 		if (mtu < IPV6_MIN_MTU)
903 			mtu = IPV6_MIN_MTU;
904 	}
905 
	/* A smaller fragment size configured on the socket takes precedence. */
906 	if (np) {
907 		u32 frag_size = READ_ONCE(np->frag_size);
908 
909 		if (frag_size && frag_size < mtu)
910 			mtu = frag_size;
911 	}
	/* Each fragment must hold the headers plus at least 8 payload bytes. */
912 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
913 		goto fail_toobig;
	/* From here on mtu is the payload budget of a single fragment. */
914 	mtu -= hlen + sizeof(struct frag_hdr);
915 
916 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
917 				    &ipv6_hdr(skb)->saddr);
918 
	/* A pending partial checksum is resolved before the data is split. */
919 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
920 	    (err = skb_checksum_help(skb)))
921 		goto fail;
922 
923 	prevhdr = skb_network_header(skb) + nexthdr_offset;
924 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
925 	if (skb_has_frag_list(skb)) {
926 		unsigned int first_len = skb_pagelen(skb);
927 		struct ip6_fraglist_iter iter;
928 		struct sk_buff *frag2;
929 
		/* Check that the head skb can become the first fragment as is. */
930 		if (first_len - hlen > mtu ||
931 		    ((first_len - hlen) & 7) ||
932 		    skb_cloned(skb) ||
933 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
934 			goto slow_path;
935 
936 		skb_walk_frags(skb, frag) {
937 			/* Correct geometry. */
938 			if (frag->len > mtu ||
939 			    ((frag->len & 7) && frag->next) ||
940 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
941 				goto slow_path_clean;
942 
943 			/* Partially cloned skb? */
944 			if (skb_shared(frag))
945 				goto slow_path_clean;
946 
			/* Hand ownership and memory accounting to the fragment. */
947 			BUG_ON(frag->sk);
948 			if (skb->sk) {
949 				frag->sk = skb->sk;
950 				frag->destructor = sock_wfree;
951 			}
952 			skb->truesize -= frag->truesize;
953 		}
954 
955 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
956 					&iter);
957 		if (err < 0)
958 			goto fail;
959 
960 		/* We prevent @rt from being freed. */
961 		rcu_read_lock();
962 
963 		for (;;) {
964 			/* Prepare header of the next frame,
965 			 * before previous one went down. */
966 			if (iter.frag)
967 				ip6_fraglist_prepare(skb, &iter);
968 
969 			skb_set_delivery_time(skb, tstamp, tstamp_type);
970 			err = output(net, sk, skb);
971 			if (!err)
972 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
973 					      IPSTATS_MIB_FRAGCREATES);
974 
975 			if (err || !iter.frag)
976 				break;
977 
978 			skb = ip6_fraglist_next(&iter);
979 		}
980 
981 		kfree(iter.tmp_hdr);
982 
983 		if (err == 0) {
984 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
985 				      IPSTATS_MIB_FRAGOKS);
986 			rcu_read_unlock();
987 			return 0;
988 		}
989 
		/* Free the fragments that were never passed to output(). */
990 		kfree_skb_list(iter.frag);
991 
992 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
993 			      IPSTATS_MIB_FRAGFAILS);
994 		rcu_read_unlock();
995 		return err;
996 
		/*
		 * Undo the ownership/truesize changes made to the frags
		 * visited before the one that failed the checks.
		 */
997 slow_path_clean:
998 		skb_walk_frags(skb, frag2) {
999 			if (frag2 == frag)
1000 				break;
1001 			frag2->sk = NULL;
1002 			frag2->destructor = NULL;
1003 			skb->truesize += frag2->truesize;
1004 		}
1005 	}
1006 
1007 slow_path:
1008 	/*
1009 	 *	Fragment the datagram.
1010 	 */
1011 
1012 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
1013 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
1014 		      &state);
1015 
1016 	/*
1017 	 *	Keep copying data until we run out.
1018 	 */
1019 
1020 	while (state.left > 0) {
1021 		frag = ip6_frag_next(skb, &state);
1022 		if (IS_ERR(frag)) {
1023 			err = PTR_ERR(frag);
1024 			goto fail;
1025 		}
1026 
1027 		/*
1028 		 *	Put this fragment into the sending queue.
1029 		 */
1030 		skb_set_delivery_time(frag, tstamp, tstamp_type);
1031 		err = output(net, sk, frag);
1032 		if (err)
1033 			goto fail;
1034 
1035 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1036 			      IPSTATS_MIB_FRAGCREATES);
1037 	}
1038 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1039 		      IPSTATS_MIB_FRAGOKS);
	/* All data has been copied into the fragments; release the original. */
1040 	consume_skb(skb);
1041 	return err;
1042 
1043 fail_toobig:
1044 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1045 	err = -EMSGSIZE;
1046 
1047 fail:
1048 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1049 		      IPSTATS_MIB_FRAGFAILS);
1050 	kfree_skb(skb);
1051 	return err;
1052 }
1053 
1054 static inline int ip6_rt_check(const struct rt6key *rt_key,
1055 			       const struct in6_addr *fl_addr,
1056 			       const struct in6_addr *addr_cache)
1057 {
1058 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1059 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1060 }
1061 
/*
 * Validate a dst cached for a socket against the flow about to be sent.
 *
 * Returns @dst unchanged if it can still be used for @fl6.  Otherwise the
 * reference on @dst is released and NULL is returned.  A NULL @dst is
 * returned as is.
 */
1062 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1063 					  struct dst_entry *dst,
1064 					  const struct flowi6 *fl6)
1065 {
1066 	struct ipv6_pinfo *np = inet6_sk(sk);
1067 	struct rt6_info *rt;
1068 
1069 	if (!dst)
1070 		goto out;
1071 
	/* A cached dst of another address family is never reusable here. */
1072 	if (dst->ops->family != AF_INET6) {
1073 		dst_release(dst);
1074 		return NULL;
1075 	}
1076 
1077 	rt = dst_rt6_info(dst);
1078 	/* Yes, checking route validity in not connected
1079 	 * case is not very simple. Take into account,
1080 	 * that we do not support routing by source, TOS,
1081 	 * and MSG_DONTROUTE		--ANK (980726)
1082 	 *
1083 	 * 1. ip6_rt_check(): If route was host route,
1084 	 *    check that cached destination is current.
1085 	 *    If it is network route, we still may
1086 	 *    check its validity using saved pointer
1087 	 *    to the last used address: daddr_cache.
1088 	 *    We do not want to save whole address now,
1089 	 *    (because main consumer of this service
1090 	 *    is tcp, which has not this problem),
1091 	 *    so that the last trick works only on connected
1092 	 *    sockets.
1093 	 * 2. oif also should be the same.
1094 	 */
1095 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1096 #ifdef CONFIG_IPV6_SUBTREES
1097 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1098 #endif
1099 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
1100 		dst_release(dst);
1101 		dst = NULL;
1102 	}
1103 
1104 out:
1105 	return dst;
1106 }
1107 
1108 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1109 			       struct dst_entry **dst, struct flowi6 *fl6)
1110 {
1111 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1112 	struct neighbour *n;
1113 	struct rt6_info *rt;
1114 #endif
1115 	int err;
1116 	int flags = 0;
1117 
1118 	/* The correct way to handle this would be to do
1119 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1120 	 * the route-specific preferred source forces the
1121 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1122 	 *
1123 	 * In source specific routing (no src=any default route),
1124 	 * ip6_route_output will fail given src=any saddr, though, so
1125 	 * that's why we try it again later.
1126 	 */
1127 	if (ipv6_addr_any(&fl6->saddr)) {
1128 		struct fib6_info *from;
1129 		struct rt6_info *rt;
1130 
1131 		*dst = ip6_route_output(net, sk, fl6);
1132 		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);
1133 
1134 		rcu_read_lock();
1135 		from = rt ? rcu_dereference(rt->from) : NULL;
1136 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1137 					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
1138 					  fl6->flowi6_l3mdev,
1139 					  &fl6->saddr);
1140 		rcu_read_unlock();
1141 
1142 		if (err)
1143 			goto out_err_release;
1144 
1145 		/* If we had an erroneous initial result, pretend it
1146 		 * never existed and let the SA-enabled version take
1147 		 * over.
1148 		 */
1149 		if ((*dst)->error) {
1150 			dst_release(*dst);
1151 			*dst = NULL;
1152 		}
1153 
1154 		if (fl6->flowi6_oif)
1155 			flags |= RT6_LOOKUP_F_IFACE;
1156 	}
1157 
1158 	if (!*dst)
1159 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1160 
1161 	err = (*dst)->error;
1162 	if (err)
1163 		goto out_err_release;
1164 
1165 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1166 	/*
1167 	 * Here if the dst entry we've looked up
1168 	 * has a neighbour entry that is in the INCOMPLETE
1169 	 * state and the src address from the flow is
1170 	 * marked as OPTIMISTIC, we release the found
1171 	 * dst entry and replace it instead with the
1172 	 * dst entry of the nexthop router
1173 	 */
1174 	rt = dst_rt6_info(*dst);
1175 	rcu_read_lock();
1176 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1177 				      rt6_nexthop(rt, &fl6->daddr));
1178 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1179 	rcu_read_unlock();
1180 
1181 	if (err) {
1182 		struct inet6_ifaddr *ifp;
1183 		struct flowi6 fl_gw6;
1184 		int redirect;
1185 
1186 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1187 				      (*dst)->dev, 1);
1188 
1189 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1190 		if (ifp)
1191 			in6_ifa_put(ifp);
1192 
1193 		if (redirect) {
1194 			/*
1195 			 * We need to get the dst entry for the
1196 			 * default router instead
1197 			 */
1198 			dst_release(*dst);
1199 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1200 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1201 			*dst = ip6_route_output(net, sk, &fl_gw6);
1202 			err = (*dst)->error;
1203 			if (err)
1204 				goto out_err_release;
1205 		}
1206 	}
1207 #endif
1208 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1209 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1210 		err = -EAFNOSUPPORT;
1211 		goto out_err_release;
1212 	}
1213 
1214 	return 0;
1215 
1216 out_err_release:
1217 	dst_release(*dst);
1218 	*dst = NULL;
1219 
1220 	if (err == -ENETUNREACH)
1221 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1222 	return err;
1223 }
1224 
1225 /**
1226  *	ip6_dst_lookup - perform route lookup on flow
1227  *	@net: Network namespace to perform lookup in
1228  *	@sk: socket which provides route info
1229  *	@dst: pointer to dst_entry * for result
1230  *	@fl6: flow to lookup
1231  *
1232  *	This function performs a route lookup on the given flow.
1233  *
1234  *	It returns zero on success, or a standard errno code on error.
1235  */
1236 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1237 		   struct flowi6 *fl6)
1238 {
1239 	*dst = NULL;
1240 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1241 }
1242 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1243 
1244 /**
1245  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1246  *	@net: Network namespace to perform lookup in
1247  *	@sk: socket which provides route info
1248  *	@fl6: flow to lookup
1249  *	@final_dst: final destination address for ipsec lookup
1250  *
1251  *	This function performs a route lookup on the given flow.
1252  *
1253  *	It returns a valid dst pointer on success, or a pointer encoded
1254  *	error code.
1255  */
1256 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1257 				      const struct in6_addr *final_dst)
1258 {
1259 	struct dst_entry *dst = NULL;
1260 	int err;
1261 
1262 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1263 	if (err)
1264 		return ERR_PTR(err);
1265 	if (final_dst)
1266 		fl6->daddr = *final_dst;
1267 
1268 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1269 }
1270 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1271 
1272 /**
1273  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1274  *	@sk: socket which provides the dst cache and route info
1275  *	@fl6: flow to lookup
1276  *	@final_dst: final destination address for ipsec lookup
1277  *	@connected: whether @sk is connected or not
1278  *
1279  *	This function performs a route lookup on the given flow with the
1280  *	possibility of using the cached route in the socket if it is valid.
1281  *	It will take the socket dst lock when operating on the dst cache.
1282  *	As a result, this function can only be used in process context.
1283  *
1284  *	In addition, for a connected socket, cache the dst in the socket
1285  *	if the current cache is not valid.
1286  *
1287  *	It returns a valid dst pointer on success, or a pointer encoded
1288  *	error code.
1289  */
1290 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1291 					 const struct in6_addr *final_dst,
1292 					 bool connected)
1293 {
1294 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1295 
1296 	dst = ip6_sk_dst_check(sk, dst, fl6);
1297 	if (dst)
1298 		return dst;
1299 
1300 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1301 	if (connected && !IS_ERR(dst))
1302 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1303 
1304 	return dst;
1305 }
1306 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1307 
1308 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1309 					       gfp_t gfp)
1310 {
1311 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1312 }
1313 
1314 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1315 						gfp_t gfp)
1316 {
1317 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1318 }
1319 
1320 static void ip6_append_data_mtu(unsigned int *mtu,
1321 				int *maxfraglen,
1322 				unsigned int fragheaderlen,
1323 				struct sk_buff *skb,
1324 				struct rt6_info *rt,
1325 				unsigned int orig_mtu)
1326 {
1327 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1328 		if (!skb) {
1329 			/* first fragment, reserve header_len */
1330 			*mtu = orig_mtu - rt->dst.header_len;
1331 
1332 		} else {
1333 			/*
1334 			 * this fragment is not first, the headers
1335 			 * space is regarded as data space.
1336 			 */
1337 			*mtu = orig_mtu;
1338 		}
1339 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1340 			      + fragheaderlen - sizeof(struct frag_hdr);
1341 	}
1342 }
1343 
1344 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1345 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1346 			  struct rt6_info *rt)
1347 {
1348 	struct ipv6_pinfo *np = inet6_sk(sk);
1349 	unsigned int mtu, frag_size;
1350 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1351 
1352 	/* callers pass dst together with a reference, set it first so
1353 	 * ip6_cork_release() can put it down even in case of an error.
1354 	 */
1355 	cork->base.dst = &rt->dst;
1356 
1357 	/*
1358 	 * setup for corking
1359 	 */
1360 	if (opt) {
1361 		if (WARN_ON(v6_cork->opt))
1362 			return -EINVAL;
1363 
1364 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1365 		if (unlikely(!nopt))
1366 			return -ENOBUFS;
1367 
1368 		nopt->tot_len = sizeof(*opt);
1369 		nopt->opt_flen = opt->opt_flen;
1370 		nopt->opt_nflen = opt->opt_nflen;
1371 
1372 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1373 		if (opt->dst0opt && !nopt->dst0opt)
1374 			return -ENOBUFS;
1375 
1376 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1377 		if (opt->dst1opt && !nopt->dst1opt)
1378 			return -ENOBUFS;
1379 
1380 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1381 		if (opt->hopopt && !nopt->hopopt)
1382 			return -ENOBUFS;
1383 
1384 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1385 		if (opt->srcrt && !nopt->srcrt)
1386 			return -ENOBUFS;
1387 
1388 		/* need source address above miyazawa*/
1389 	}
1390 	v6_cork->hop_limit = ipc6->hlimit;
1391 	v6_cork->tclass = ipc6->tclass;
1392 	v6_cork->dontfrag = ipc6->dontfrag;
1393 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1394 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1395 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1396 	else
1397 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1398 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1399 
1400 	frag_size = READ_ONCE(np->frag_size);
1401 	if (frag_size && frag_size < mtu)
1402 		mtu = frag_size;
1403 
1404 	cork->base.fragsize = mtu;
1405 	cork->base.gso_size = ipc6->gso_size;
1406 	cork->base.tx_flags = 0;
1407 	cork->base.mark = ipc6->sockc.mark;
1408 	cork->base.priority = ipc6->sockc.priority;
1409 	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
1410 	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
1411 		cork->base.flags |= IPCORK_TS_OPT_ID;
1412 		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
1413 	}
1414 	cork->base.length = 0;
1415 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1416 
1417 	return 0;
1418 }
1419 
/*
 * __ip6_append_data - append data obtained through @getfrag to @queue.
 *
 * Adds @length bytes to the skb at the tail of @queue, allocating and
 * queueing further skbs whenever the current one has no room left.  When a
 * new skb is built, @transhdrlen bytes are left uncopied in front of the
 * payload (getfrag writes at data + transhdrlen) and are counted against
 * @length; @transhdrlen is zero for every skb after that.
 *
 * The size checks at the top return -EMSGSIZE after reporting through
 * ipv6_local_error() (and ipv6_local_rxpmtu() in the dontfrag case).  The
 * zerocopy/splice setup may return -EINVAL, -ENOBUFS or -EPERM directly.
 * Failures after cork->length has been increased go through the error
 * label, which subtracts the not yet appended length from cork->length
 * again, bumps IPSTATS_MIB_OUTDISCARDS and undoes the tskey increment.
 *
 * Returns 0 on success or a negative errno.
 */
1420 static int __ip6_append_data(struct sock *sk,
1421 			     struct sk_buff_head *queue,
1422 			     struct inet_cork_full *cork_full,
1423 			     struct inet6_cork *v6_cork,
1424 			     struct page_frag *pfrag,
1425 			     int getfrag(void *from, char *to, int offset,
1426 					 int len, int odd, struct sk_buff *skb),
1427 			     void *from, size_t length, int transhdrlen,
1428 			     unsigned int flags)
1429 {
1430 	struct sk_buff *skb, *skb_prev = NULL;
1431 	struct inet_cork *cork = &cork_full->base;
1432 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1433 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1434 	struct ubuf_info *uarg = NULL;
1435 	int exthdrlen = 0;
1436 	int dst_exthdrlen = 0;
1437 	int hh_len;
1438 	int copy;
1439 	int err;
1440 	int offset = 0;
1441 	bool zc = false;
1442 	u32 tskey = 0;
1443 	struct rt6_info *rt = dst_rt6_info(cork->dst);
1444 	bool paged, hold_tskey = false, extra_uref = false;
1445 	struct ipv6_txoptions *opt = v6_cork->opt;
1446 	int csummode = CHECKSUM_NONE;
1447 	unsigned int maxnonfragsize, headersize;
	/* Bytes to charge to sk->sk_wmem_alloc; added in one go on exit. */
1448 	unsigned int wmem_alloc_delta = 0;
1449 
	/* Empty queue: exthdrlen/dst_exthdrlen are only non-zero for the
	 * first skb that gets built.
	 */
1450 	skb = skb_peek_tail(queue);
1451 	if (!skb) {
1452 		exthdrlen = opt ? opt->opt_flen : 0;
1453 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1454 	}
1455 
	/* With a GSO size set, size against IP6_MAX_MTU rather than the
	 * corked fragment size and start out in paged mode.
	 */
1456 	paged = !!cork->gso_size;
1457 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1458 	orig_mtu = mtu;
1459 
1460 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1461 
	/* Header bytes accounted for in every skb built below. */
1462 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1463 			(opt ? opt->opt_nflen : 0);
1464 
	/* Full header size, including the opt_flen extension headers. */
1465 	headersize = sizeof(struct ipv6hdr) +
1466 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1467 		     rt->rt6i_nfheader_len;
1468 
	/* Reject an mtu too small for the headers; this also keeps the
	 * unsigned maxfraglen computation below from wrapping.
	 */
1469 	if (mtu <= fragheaderlen ||
1470 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1471 		goto emsgsize;
1472 
1473 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1474 		     sizeof(struct frag_hdr);
1475 
1476 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1477 	 * the first fragment
1478 	 */
1479 	if (headersize + transhdrlen > mtu)
1480 		goto emsgsize;
1481 
	/* dontfrag on UDP/ICMPv6/RAW sockets: report the usable size via
	 * ipv6_local_rxpmtu() and fail instead of fragmenting.
	 */
1482 	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
1483 	    (sk->sk_protocol == IPPROTO_UDP ||
1484 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1485 	     sk->sk_protocol == IPPROTO_RAW)) {
1486 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1487 				sizeof(struct ipv6hdr));
1488 		goto emsgsize;
1489 	}
1490 
1491 	if (ip6_sk_ignore_df(sk))
1492 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1493 	else
1494 		maxnonfragsize = mtu;
1495 
1496 	if (cork->length + length > maxnonfragsize - headersize) {
1497 emsgsize:
1498 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1499 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1500 		return -EMSGSIZE;
1501 	}
1502 
1503 	/* CHECKSUM_PARTIAL only with no extension headers and when
1504 	 * we are not going to fragment
1505 	 */
1506 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1507 	    headersize == sizeof(struct ipv6hdr) &&
1508 	    length <= mtu - headersize &&
1509 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1510 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1511 		csummode = CHECKSUM_PARTIAL;
1512 
1513 	if ((flags & MSG_ZEROCOPY) && length) {
1514 		struct msghdr *msg = from;
1515 
1516 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1517 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1518 				return -EINVAL;
1519 
1520 			/* Leave uarg NULL if can't zerocopy, callers should
1521 			 * be able to handle it.
1522 			 */
1523 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1524 			    csummode == CHECKSUM_PARTIAL) {
1525 				paged = true;
1526 				zc = true;
1527 				uarg = msg->msg_ubuf;
1528 			}
1529 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1530 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
1531 						    false);
1532 			if (!uarg)
1533 				return -ENOBUFS;
1534 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1535 			if (rt->dst.dev->features & NETIF_F_SG &&
1536 			    csummode == CHECKSUM_PARTIAL) {
1537 				paged = true;
1538 				zc = true;
1539 			} else {
				/* Data will be copied: mark the uarg as
				 * not zerocopy.
				 */
1540 				uarg_to_msgzc(uarg)->zerocopy = 0;
1541 				skb_zcopy_set(skb, uarg, &extra_uref);
1542 			}
1543 		}
1544 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1545 		if (inet_test_bit(HDRINCL, sk))
1546 			return -EPERM;
1547 		if (rt->dst.dev->features & NETIF_F_SG &&
1548 		    getfrag == ip_generic_getfrag)
1549 			/* We need an empty buffer to attach stuff to */
1550 			paged = true;
1551 		else
1552 			flags &= ~MSG_SPLICE_PAGES;
1553 	}
1554 
	/* Pick the timestamp key: either the one supplied in the cork or
	 * the next value of sk->sk_tskey (given back on error, see
	 * hold_tskey).
	 */
1555 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1556 	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
1557 		if (cork->flags & IPCORK_TS_OPT_ID) {
1558 			tskey = cork->ts_opt_id;
1559 		} else {
1560 			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1561 			hold_tskey = true;
1562 		}
1563 	}
1564 
1565 	/*
1566 	 * Let's try using as much space as possible.
1567 	 * Use MTU if total length of the message fits into the MTU.
1568 	 * Otherwise, we need to reserve fragment header and
1569 	 * fragment alignment (= 8-15 octets, in total).
1570 	 *
1571 	 * Note that we may need to "move" the data from the tail
1572 	 * of the buffer to the new fragment when we split
1573 	 * the message.
1574 	 *
1575 	 * FIXME: It may be fragmented into multiple chunks
1576 	 *        at once if non-fragmentable extension headers
1577 	 *        are too large.
1578 	 * --yoshfuji
1579 	 */
1580 
	/* Account the whole request up front; the error label subtracts
	 * whatever was not appended.
	 */
1581 	cork->length += length;
1582 	if (!skb)
1583 		goto alloc_new_skb;
1584 
1585 	while (length > 0) {
1586 		/* Check if the remaining data fits into current packet. */
1587 		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
1588 		if (copy < length)
1589 			copy = maxfraglen - skb->len;
1590 
1591 		if (copy <= 0) {
1592 			char *data;
1593 			unsigned int datalen;
1594 			unsigned int fraglen;
1595 			unsigned int fraggap;
1596 			unsigned int alloclen, alloc_extra;
1597 			unsigned int pagedlen;
1598 alloc_new_skb:
1599 			/* There's no room in the current skb */
			/* fraggap: bytes of the current skb beyond
			 * maxfraglen; they are moved into the new skb below.
			 */
1600 			if (skb)
1601 				fraggap = skb->len - maxfraglen;
1602 			else
1603 				fraggap = 0;
1604 			/* update mtu and maxfraglen if necessary */
1605 			if (!skb || !skb_prev)
1606 				ip6_append_data_mtu(&mtu, &maxfraglen,
1607 						    fragheaderlen, skb, rt,
1608 						    orig_mtu);
1609 
1610 			skb_prev = skb;
1611 
1612 			/*
1613 			 * If remaining data exceeds the mtu,
1614 			 * we know we need more fragment(s).
1615 			 */
1616 			datalen = length + fraggap;
1617 
1618 			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
1619 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1620 			fraglen = datalen + fragheaderlen;
1621 			pagedlen = 0;
1622 
1623 			alloc_extra = hh_len;
1624 			alloc_extra += dst_exthdrlen;
1625 			alloc_extra += rt->dst.trailer_len;
1626 
1627 			/* We just reserve space for fragment header.
1628 			 * Note: this may be overallocation if the message
1629 			 * (without MSG_MORE) fits into the MTU.
1630 			 */
1631 			alloc_extra += sizeof(struct frag_hdr);
1632 
			/* Linear size: a full mtu when more data is
			 * announced and the device lacks SG, the whole
			 * fragment when not paged, else only the headers
			 * with the rest counted as pagedlen.
			 */
1633 			if ((flags & MSG_MORE) &&
1634 			    !(rt->dst.dev->features&NETIF_F_SG))
1635 				alloclen = mtu;
1636 			else if (!paged &&
1637 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1638 				  !(rt->dst.dev->features & NETIF_F_SG)))
1639 				alloclen = fraglen;
1640 			else {
1641 				alloclen = fragheaderlen + transhdrlen;
1642 				pagedlen = datalen - transhdrlen;
1643 			}
1644 			alloclen += alloc_extra;
1645 
1646 			if (datalen != length + fraggap) {
1647 				/*
1648 				 * this is not the last fragment, the trailer
1649 				 * space is regarded as data space.
1650 				 */
1651 				datalen += rt->dst.trailer_len;
1652 			}
1653 
1654 			fraglen = datalen + fragheaderlen;
1655 
1656 			copy = datalen - transhdrlen - fraggap - pagedlen;
1657 			/* [!] NOTE: copy may be negative if pagedlen>0
1658 			 * because then the equation may reduce to -fraggap.
1659 			 */
1660 			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1661 				err = -EINVAL;
1662 				goto error;
1663 			}
			/* While a transport header is still pending, use
			 * sock_alloc_send_skb().  Otherwise allocate
			 * directly, but only while sk_wmem_alloc plus the
			 * pending delta stays within twice sk_sndbuf.
			 */
1664 			if (transhdrlen) {
1665 				skb = sock_alloc_send_skb(sk, alloclen,
1666 						(flags & MSG_DONTWAIT), &err);
1667 			} else {
1668 				skb = NULL;
1669 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1670 				    2 * sk->sk_sndbuf)
1671 					skb = alloc_skb(alloclen,
1672 							sk->sk_allocation);
1673 				if (unlikely(!skb))
1674 					err = -ENOBUFS;
1675 			}
1676 			if (!skb)
1677 				goto error;
1678 			/*
1679 			 *	Fill in the control structures
1680 			 */
1681 			skb->protocol = htons(ETH_P_IPV6);
1682 			skb->ip_summed = csummode;
1683 			skb->csum = 0;
1684 			/* reserve for fragmentation and ipsec header */
1685 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1686 				    dst_exthdrlen);
1687 
1688 			/*
1689 			 *	Find where to start putting bytes
1690 			 */
1691 			data = skb_put(skb, fraglen - pagedlen);
1692 			skb_set_network_header(skb, exthdrlen);
1693 			data += fragheaderlen;
1694 			skb->transport_header = (skb->network_header +
1695 						 fragheaderlen);
			/* Move the overhang of the previous skb into this
			 * one, fix up both checksums and trim the previous
			 * skb back to maxfraglen.
			 */
1696 			if (fraggap) {
1697 				skb->csum = skb_copy_and_csum_bits(
1698 					skb_prev, maxfraglen,
1699 					data + transhdrlen, fraggap);
1700 				skb_prev->csum = csum_sub(skb_prev->csum,
1701 							  skb->csum);
1702 				data += fraggap;
1703 				pskb_trim_unique(skb_prev, maxfraglen);
1704 			}
1705 			if (copy > 0 &&
1706 			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1707 					   from, data + transhdrlen, offset,
1708 					   copy, fraggap, skb) < 0) {
1709 				err = -EFAULT;
1710 				kfree_skb(skb);
1711 				goto error;
1712 			} else if (flags & MSG_SPLICE_PAGES) {
				/* Nothing consumed from the source here;
				 * the splice branch of the loop adds the
				 * pages.
				 */
1713 				copy = 0;
1714 			}
1715 
1716 			offset += copy;
1717 			length -= copy + transhdrlen;
			/* The header lengths only apply to the first skb
			 * built by this call.
			 */
1718 			transhdrlen = 0;
1719 			exthdrlen = 0;
1720 			dst_exthdrlen = 0;
1721 
1722 			/* Only the initial fragment is time stamped */
1723 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1724 			cork->tx_flags = 0;
1725 			skb_shinfo(skb)->tskey = tskey;
1726 			tskey = 0;
1727 			skb_zcopy_set(skb, uarg, &extra_uref);
1728 
1729 			if ((flags & MSG_CONFIRM) && !skb_prev)
1730 				skb_set_dst_pending_confirm(skb, 1);
1731 
1732 			/*
1733 			 * Put the packet on the pending queue
1734 			 */
			/* NOTE(review): presumably only skbs from the
			 * alloc_skb() branch arrive here without a
			 * destructor - confirm against
			 * sock_alloc_send_skb().
			 */
1735 			if (!skb->destructor) {
1736 				skb->destructor = sock_wfree;
1737 				skb->sk = sk;
1738 				wmem_alloc_delta += skb->truesize;
1739 			}
1740 			__skb_queue_tail(queue, skb);
1741 			continue;
1742 		}
1743 
1744 		if (copy > length)
1745 			copy = length;
1746 
		/* Four ways to add the data to the current skb: linear
		 * copy into the tailroom (no SG), page splicing, copy into
		 * the page frag, or zerocopy.
		 */
1747 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1748 		    skb_tailroom(skb) >= copy) {
1749 			unsigned int off;
1750 
1751 			off = skb->len;
1752 			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1753 					    from, skb_put(skb, copy),
1754 					    offset, copy, off, skb) < 0) {
				/* Undo the skb_put() above. */
1755 				__skb_trim(skb, off);
1756 				err = -EFAULT;
1757 				goto error;
1758 			}
1759 		} else if (flags & MSG_SPLICE_PAGES) {
1760 			struct msghdr *msg = from;
1761 
1762 			err = -EIO;
1763 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1764 				goto error;
1765 
1766 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
1767 			if (err < 0)
1768 				goto error;
			/* Continue with the amount actually spliced. */
1769 			copy = err;
1770 			wmem_alloc_delta += copy;
1771 		} else if (!zc) {
1772 			int i = skb_shinfo(skb)->nr_frags;
1773 
1774 			err = -ENOMEM;
1775 			if (!sk_page_frag_refill(sk, pfrag))
1776 				goto error;
1777 
1778 			skb_zcopy_downgrade_managed(skb);
			/* Start a new frag slot unless the data can extend
			 * the last one.
			 */
1779 			if (!skb_can_coalesce(skb, i, pfrag->page,
1780 					      pfrag->offset)) {
1781 				err = -EMSGSIZE;
1782 				if (i == MAX_SKB_FRAGS)
1783 					goto error;
1784 
1785 				__skb_fill_page_desc(skb, i, pfrag->page,
1786 						     pfrag->offset, 0);
1787 				skb_shinfo(skb)->nr_frags = ++i;
1788 				get_page(pfrag->page);
1789 			}
1790 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1791 			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1792 				    from,
1793 				    page_address(pfrag->page) + pfrag->offset,
1794 				    offset, copy, skb->len, skb) < 0)
1795 				goto error_efault;
1796 
1797 			pfrag->offset += copy;
1798 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1799 			skb->len += copy;
1800 			skb->data_len += copy;
1801 			skb->truesize += copy;
1802 			wmem_alloc_delta += copy;
1803 		} else {
1804 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1805 			if (err < 0)
1806 				goto error;
1807 		}
1808 		offset += copy;
1809 		length -= copy;
1810 	}
1811 
1812 	if (wmem_alloc_delta)
1813 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1814 	return 0;
1815 
1816 error_efault:
1817 	err = -EFAULT;
1818 error:
1819 	net_zcopy_put_abort(uarg, extra_uref);
	/* length now holds only what was not appended. */
1820 	cork->length -= length;
1821 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	/* skbs already queued stay queued, so their memory is still
	 * charged to the socket.
	 */
1822 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1823 	if (hold_tskey)
1824 		atomic_dec(&sk->sk_tskey);
1825 	return err;
1826 }
1827 
1828 int ip6_append_data(struct sock *sk,
1829 		    int getfrag(void *from, char *to, int offset, int len,
1830 				int odd, struct sk_buff *skb),
1831 		    void *from, size_t length, int transhdrlen,
1832 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1833 		    struct rt6_info *rt, unsigned int flags)
1834 {
1835 	struct inet_sock *inet = inet_sk(sk);
1836 	struct ipv6_pinfo *np = inet6_sk(sk);
1837 	int exthdrlen;
1838 	int err;
1839 
1840 	if (flags&MSG_PROBE)
1841 		return 0;
1842 	if (skb_queue_empty(&sk->sk_write_queue)) {
1843 		/*
1844 		 * setup for corking
1845 		 */
1846 		dst_hold(&rt->dst);
1847 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1848 				     ipc6, rt);
1849 		if (err)
1850 			return err;
1851 
1852 		inet->cork.fl.u.ip6 = *fl6;
1853 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1854 		length += exthdrlen;
1855 		transhdrlen += exthdrlen;
1856 	} else {
1857 		transhdrlen = 0;
1858 	}
1859 
1860 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1861 				 &np->cork, sk_page_frag(sk), getfrag,
1862 				 from, length, transhdrlen, flags);
1863 }
1864 EXPORT_SYMBOL_GPL(ip6_append_data);
1865 
1866 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1867 {
1868 	struct dst_entry *dst = cork->base.dst;
1869 
1870 	cork->base.dst = NULL;
1871 	skb_dst_set(skb, dst);
1872 }
1873 
1874 static void ip6_cork_release(struct inet_cork_full *cork,
1875 			     struct inet6_cork *v6_cork)
1876 {
1877 	if (v6_cork->opt) {
1878 		struct ipv6_txoptions *opt = v6_cork->opt;
1879 
1880 		kfree(opt->dst0opt);
1881 		kfree(opt->dst1opt);
1882 		kfree(opt->hopopt);
1883 		kfree(opt->srcrt);
1884 		kfree(opt);
1885 		v6_cork->opt = NULL;
1886 	}
1887 
1888 	if (cork->base.dst) {
1889 		dst_release(cork->base.dst);
1890 		cork->base.dst = NULL;
1891 	}
1892 }
1893 
1894 struct sk_buff *__ip6_make_skb(struct sock *sk,
1895 			       struct sk_buff_head *queue,
1896 			       struct inet_cork_full *cork,
1897 			       struct inet6_cork *v6_cork)
1898 {
1899 	struct sk_buff *skb, *tmp_skb;
1900 	struct sk_buff **tail_skb;
1901 	struct in6_addr *final_dst;
1902 	struct net *net = sock_net(sk);
1903 	struct ipv6hdr *hdr;
1904 	struct ipv6_txoptions *opt = v6_cork->opt;
1905 	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
1906 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1907 	unsigned char proto = fl6->flowi6_proto;
1908 
1909 	skb = __skb_dequeue(queue);
1910 	if (!skb)
1911 		goto out;
1912 	tail_skb = &(skb_shinfo(skb)->frag_list);
1913 
1914 	/* move skb->data to ip header from ext header */
1915 	if (skb->data < skb_network_header(skb))
1916 		__skb_pull(skb, skb_network_offset(skb));
1917 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1918 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1919 		*tail_skb = tmp_skb;
1920 		tail_skb = &(tmp_skb->next);
1921 		skb->len += tmp_skb->len;
1922 		skb->data_len += tmp_skb->len;
1923 		skb->truesize += tmp_skb->truesize;
1924 		tmp_skb->destructor = NULL;
1925 		tmp_skb->sk = NULL;
1926 	}
1927 
1928 	/* Allow local fragmentation. */
1929 	skb->ignore_df = ip6_sk_ignore_df(sk);
1930 	__skb_pull(skb, skb_network_header_len(skb));
1931 
1932 	final_dst = &fl6->daddr;
1933 	if (opt && opt->opt_flen)
1934 		ipv6_push_frag_opts(skb, opt, &proto);
1935 	if (opt && opt->opt_nflen)
1936 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1937 
1938 	skb_push(skb, sizeof(struct ipv6hdr));
1939 	skb_reset_network_header(skb);
1940 	hdr = ipv6_hdr(skb);
1941 
1942 	ip6_flow_hdr(hdr, v6_cork->tclass,
1943 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1944 					ip6_autoflowlabel(net, sk), fl6));
1945 	hdr->hop_limit = v6_cork->hop_limit;
1946 	hdr->nexthdr = proto;
1947 	hdr->saddr = fl6->saddr;
1948 	hdr->daddr = *final_dst;
1949 
1950 	skb->priority = cork->base.priority;
1951 	skb->mark = cork->base.mark;
1952 	if (sk_is_tcp(sk))
1953 		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
1954 	else
1955 		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);
1956 
1957 	ip6_cork_steal_dst(skb, cork);
1958 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1959 	if (proto == IPPROTO_ICMPV6) {
1960 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1961 		u8 icmp6_type;
1962 
1963 		if (sk->sk_socket->type == SOCK_RAW &&
1964 		   !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
1965 			icmp6_type = fl6->fl6_icmp_type;
1966 		else
1967 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
1968 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1969 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1970 	}
1971 
1972 	ip6_cork_release(cork, v6_cork);
1973 out:
1974 	return skb;
1975 }
1976 
1977 int ip6_send_skb(struct sk_buff *skb)
1978 {
1979 	struct net *net = sock_net(skb->sk);
1980 	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
1981 	int err;
1982 
1983 	rcu_read_lock();
1984 	err = ip6_local_out(net, skb->sk, skb);
1985 	if (err) {
1986 		if (err > 0)
1987 			err = net_xmit_errno(err);
1988 		if (err)
1989 			IP6_INC_STATS(net, rt->rt6i_idev,
1990 				      IPSTATS_MIB_OUTDISCARDS);
1991 	}
1992 
1993 	rcu_read_unlock();
1994 	return err;
1995 }
1996 
/*
 * Build a packet from the socket's pending frames and send it.
 * Returns 0 when nothing is pending, otherwise the ip6_send_skb() result.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2008 
2009 static void __ip6_flush_pending_frames(struct sock *sk,
2010 				       struct sk_buff_head *queue,
2011 				       struct inet_cork_full *cork,
2012 				       struct inet6_cork *v6_cork)
2013 {
2014 	struct sk_buff *skb;
2015 
2016 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2017 		if (skb_dst(skb))
2018 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2019 				      IPSTATS_MIB_OUTDISCARDS);
2020 		kfree_skb(skb);
2021 	}
2022 
2023 	ip6_cork_release(cork, v6_cork);
2024 }
2025 
2026 void ip6_flush_pending_frames(struct sock *sk)
2027 {
2028 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2029 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2030 }
2031 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2032 
2033 struct sk_buff *ip6_make_skb(struct sock *sk,
2034 			     int getfrag(void *from, char *to, int offset,
2035 					 int len, int odd, struct sk_buff *skb),
2036 			     void *from, size_t length, int transhdrlen,
2037 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2038 			     unsigned int flags, struct inet_cork_full *cork)
2039 {
2040 	struct inet6_cork v6_cork;
2041 	struct sk_buff_head queue;
2042 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2043 	int err;
2044 
2045 	if (flags & MSG_PROBE) {
2046 		dst_release(&rt->dst);
2047 		return NULL;
2048 	}
2049 
2050 	__skb_queue_head_init(&queue);
2051 
2052 	cork->base.flags = 0;
2053 	cork->base.addr = 0;
2054 	cork->base.opt = NULL;
2055 	v6_cork.opt = NULL;
2056 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2057 	if (err) {
2058 		ip6_cork_release(cork, &v6_cork);
2059 		return ERR_PTR(err);
2060 	}
2061 
2062 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2063 				&current->task_frag, getfrag, from,
2064 				length + exthdrlen, transhdrlen + exthdrlen,
2065 				flags);
2066 	if (err) {
2067 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2068 		return ERR_PTR(err);
2069 	}
2070 
2071 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2072 }
2073