xref: /linux/net/ipv6/ip6_output.c (revision eb65f96cb332d577b490ab9c9f5f8de8c0316076)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	airthmetics in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		skb = skb_expand_head(skb, hh_len);
74 		if (!skb) {
75 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76 			return -ENOMEM;
77 		}
78 	}
79 
80 	hdr = ipv6_hdr(skb);
81 	daddr = &hdr->daddr;
82 	if (ipv6_addr_is_multicast(daddr)) {
83 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84 		    ((mroute6_is_socket(net, skb) &&
85 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 
89 			/* Do not check for IFF_ALLMULTI; multicast routing
90 			   is not supported in any case.
91 			 */
92 			if (newskb)
93 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94 					net, sk, newskb, NULL, newskb->dev,
95 					dev_loopback_xmit);
96 
97 			if (hdr->hop_limit == 0) {
98 				IP6_INC_STATS(net, idev,
99 					      IPSTATS_MIB_OUTDISCARDS);
100 				kfree_skb(skb);
101 				return 0;
102 			}
103 		}
104 
105 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107 		    !(dev->flags & IFF_LOOPBACK)) {
108 			kfree_skb(skb);
109 			return 0;
110 		}
111 	}
112 
113 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114 		int res = lwtunnel_xmit(skb);
115 
116 		if (res != LWTUNNEL_XMIT_CONTINUE)
117 			return res;
118 	}
119 
120 	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
121 
122 	rcu_read_lock();
123 	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
124 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
125 
126 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
127 		if (unlikely(!neigh))
128 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
129 		if (IS_ERR(neigh)) {
130 			rcu_read_unlock();
131 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
132 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
133 			return -EINVAL;
134 		}
135 	}
136 	sock_confirm_neigh(skb, neigh);
137 	ret = neigh_output(neigh, skb, false);
138 	rcu_read_unlock();
139 	return ret;
140 }
141 
142 static int
143 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
144 				    struct sk_buff *skb, unsigned int mtu)
145 {
146 	struct sk_buff *segs, *nskb;
147 	netdev_features_t features;
148 	int ret = 0;
149 
150 	/* Please see corresponding comment in ip_finish_output_gso
151 	 * describing the cases where GSO segment length exceeds the
152 	 * egress MTU.
153 	 */
154 	features = netif_skb_features(skb);
155 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
156 	if (IS_ERR_OR_NULL(segs)) {
157 		kfree_skb(skb);
158 		return -ENOMEM;
159 	}
160 
161 	consume_skb(skb);
162 
163 	skb_list_walk_safe(segs, segs, nskb) {
164 		int err;
165 
166 		skb_mark_not_on_list(segs);
167 		/* Last GSO segment can be smaller than gso_size (and MTU).
168 		 * Adding a fragment header would produce an "atomic fragment",
169 		 * which is considered harmful (RFC-8021). Avoid that.
170 		 */
171 		err = segs->len > mtu ?
172 			ip6_fragment(net, sk, segs, ip6_finish_output2) :
173 			ip6_finish_output2(net, sk, segs);
174 		if (err && ret == 0)
175 			ret = err;
176 	}
177 
178 	return ret;
179 }
180 
181 static int ip6_finish_output_gso(struct net *net, struct sock *sk,
182 				 struct sk_buff *skb, unsigned int mtu)
183 {
184 	if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
185 	    !skb_gso_validate_network_len(skb, mtu))
186 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
187 
188 	return ip6_finish_output2(net, sk, skb);
189 }
190 
191 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
192 {
193 	unsigned int mtu;
194 
195 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
196 	/* Policy lookup after SNAT yielded a new policy */
197 	if (skb_dst(skb)->xfrm) {
198 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
199 		return dst_output(net, sk, skb);
200 	}
201 #endif
202 
203 	mtu = ip6_skb_dst_mtu(skb);
204 	if (skb_is_gso(skb))
205 		return ip6_finish_output_gso(net, sk, skb, mtu);
206 
207 	if (skb->len > mtu ||
208 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
209 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
210 
211 	return ip6_finish_output2(net, sk, skb);
212 }
213 
214 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
215 {
216 	int ret;
217 
218 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
219 	switch (ret) {
220 	case NET_XMIT_SUCCESS:
221 	case NET_XMIT_CN:
222 		return __ip6_finish_output(net, sk, skb) ? : ret;
223 	default:
224 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
225 		return ret;
226 	}
227 }
228 
229 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
230 {
231 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
232 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
233 
234 	skb->protocol = htons(ETH_P_IPV6);
235 	skb->dev = dev;
236 
237 	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
238 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
239 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
240 		return 0;
241 	}
242 
243 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
244 			    net, sk, skb, indev, dev,
245 			    ip6_finish_output,
246 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
247 }
248 EXPORT_SYMBOL(ip6_output);
249 
250 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
251 {
252 	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
253 		return ip6_default_np_autolabel(net);
254 	return inet6_test_bit(AUTOFLOWLABEL, sk);
255 }
256 
257 /*
258  * xmit an sk_buff (used by TCP, SCTP and DCCP)
259  * Note : socket lock is not held for SYNACK packets, but might be modified
260  * by calls to skb_set_owner_w() and ipv6_local_error(),
261  * which are using proper atomic operations or spinlocks.
262  */
263 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
264 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
265 {
266 	struct net *net = sock_net(sk);
267 	const struct ipv6_pinfo *np = inet6_sk(sk);
268 	struct in6_addr *first_hop = &fl6->daddr;
269 	struct dst_entry *dst = skb_dst(skb);
270 	struct net_device *dev = dst->dev;
271 	struct inet6_dev *idev = ip6_dst_idev(dst);
272 	struct hop_jumbo_hdr *hop_jumbo;
273 	int hoplen = sizeof(*hop_jumbo);
274 	unsigned int head_room;
275 	struct ipv6hdr *hdr;
276 	u8  proto = fl6->flowi6_proto;
277 	int seg_len = skb->len;
278 	int hlimit = -1;
279 	u32 mtu;
280 
281 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
282 	if (opt)
283 		head_room += opt->opt_nflen + opt->opt_flen;
284 
285 	if (unlikely(head_room > skb_headroom(skb))) {
286 		skb = skb_expand_head(skb, head_room);
287 		if (!skb) {
288 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
289 			return -ENOBUFS;
290 		}
291 	}
292 
293 	if (opt) {
294 		seg_len += opt->opt_nflen + opt->opt_flen;
295 
296 		if (opt->opt_flen)
297 			ipv6_push_frag_opts(skb, opt, &proto);
298 
299 		if (opt->opt_nflen)
300 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
301 					     &fl6->saddr);
302 	}
303 
304 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
305 		hop_jumbo = skb_push(skb, hoplen);
306 
307 		hop_jumbo->nexthdr = proto;
308 		hop_jumbo->hdrlen = 0;
309 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
310 		hop_jumbo->tlv_len = 4;
311 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
312 
313 		proto = IPPROTO_HOPOPTS;
314 		seg_len = 0;
315 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
316 	}
317 
318 	skb_push(skb, sizeof(struct ipv6hdr));
319 	skb_reset_network_header(skb);
320 	hdr = ipv6_hdr(skb);
321 
322 	/*
323 	 *	Fill in the IPv6 header
324 	 */
325 	if (np)
326 		hlimit = READ_ONCE(np->hop_limit);
327 	if (hlimit < 0)
328 		hlimit = ip6_dst_hoplimit(dst);
329 
330 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
331 				ip6_autoflowlabel(net, sk), fl6));
332 
333 	hdr->payload_len = htons(seg_len);
334 	hdr->nexthdr = proto;
335 	hdr->hop_limit = hlimit;
336 
337 	hdr->saddr = fl6->saddr;
338 	hdr->daddr = *first_hop;
339 
340 	skb->protocol = htons(ETH_P_IPV6);
341 	skb->priority = priority;
342 	skb->mark = mark;
343 
344 	mtu = dst_mtu(dst);
345 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
346 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
347 
348 		/* if egress device is enslaved to an L3 master device pass the
349 		 * skb to its handler for processing
350 		 */
351 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
352 		if (unlikely(!skb))
353 			return 0;
354 
355 		/* hooks should never assume socket lock is held.
356 		 * we promote our socket to non const
357 		 */
358 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
359 			       net, (struct sock *)sk, skb, NULL, dev,
360 			       dst_output);
361 	}
362 
363 	skb->dev = dev;
364 	/* ipv6_local_error() does not require socket lock,
365 	 * we promote our socket to non const
366 	 */
367 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
368 
369 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
370 	kfree_skb(skb);
371 	return -EMSGSIZE;
372 }
373 EXPORT_SYMBOL(ip6_xmit);
374 
375 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
376 {
377 	struct ip6_ra_chain *ra;
378 	struct sock *last = NULL;
379 
380 	read_lock(&ip6_ra_lock);
381 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
382 		struct sock *sk = ra->sk;
383 		if (sk && ra->sel == sel &&
384 		    (!sk->sk_bound_dev_if ||
385 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
386 
387 			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
388 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
389 				continue;
390 			}
391 			if (last) {
392 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
393 				if (skb2)
394 					rawv6_rcv(last, skb2);
395 			}
396 			last = sk;
397 		}
398 	}
399 
400 	if (last) {
401 		rawv6_rcv(last, skb);
402 		read_unlock(&ip6_ra_lock);
403 		return 1;
404 	}
405 	read_unlock(&ip6_ra_lock);
406 	return 0;
407 }
408 
409 static int ip6_forward_proxy_check(struct sk_buff *skb)
410 {
411 	struct ipv6hdr *hdr = ipv6_hdr(skb);
412 	u8 nexthdr = hdr->nexthdr;
413 	__be16 frag_off;
414 	int offset;
415 
416 	if (ipv6_ext_hdr(nexthdr)) {
417 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
418 		if (offset < 0)
419 			return 0;
420 	} else
421 		offset = sizeof(struct ipv6hdr);
422 
423 	if (nexthdr == IPPROTO_ICMPV6) {
424 		struct icmp6hdr *icmp6;
425 
426 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
427 					 offset + 1 - skb->data)))
428 			return 0;
429 
430 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
431 
432 		switch (icmp6->icmp6_type) {
433 		case NDISC_ROUTER_SOLICITATION:
434 		case NDISC_ROUTER_ADVERTISEMENT:
435 		case NDISC_NEIGHBOUR_SOLICITATION:
436 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
437 		case NDISC_REDIRECT:
438 			/* For reaction involving unicast neighbor discovery
439 			 * message destined to the proxied address, pass it to
440 			 * input function.
441 			 */
442 			return 1;
443 		default:
444 			break;
445 		}
446 	}
447 
448 	/*
449 	 * The proxying router can't forward traffic sent to a link-local
450 	 * address, so signal the sender and discard the packet. This
451 	 * behavior is clarified by the MIPv6 specification.
452 	 */
453 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
454 		dst_link_failure(skb);
455 		return -1;
456 	}
457 
458 	return 0;
459 }
460 
461 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
462 				     struct sk_buff *skb)
463 {
464 #ifdef CONFIG_NET_SWITCHDEV
465 	if (skb->offload_l3_fwd_mark) {
466 		consume_skb(skb);
467 		return 0;
468 	}
469 #endif
470 
471 	skb_clear_tstamp(skb);
472 	return dst_output(net, sk, skb);
473 }
474 
475 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
476 {
477 	if (skb->len <= mtu)
478 		return false;
479 
480 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
481 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
482 		return true;
483 
484 	if (skb->ignore_df)
485 		return false;
486 
487 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
488 		return false;
489 
490 	return true;
491 }
492 
/*
 * Forward a received IPv6 packet towards the dst already attached to it.
 *
 * Performs the forwarding sanity checks (forwarding enabled, packet
 * addressed to this host at L2, no owning socket, no LRO, xfrm policy),
 * delivers Router Alert packets to interested raw sockets, enforces the
 * hop limit, handles proxy NDP, possibly sends a redirect, checks the
 * MTU, and finally decrements the hop limit and runs the FORWARD hook.
 *
 * Returns the hook result (or the ip6_input() result for proxied NDP
 * packets), 0 when a Router Alert packet was consumed by a socket, or a
 * negative errno after dropping the packet: -ETIMEDOUT (hop limit),
 * -EMSGSIZE (too big), -EINVAL (everything else).
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	/* idev of the interface the packet arrived on; used for the
	 * input-side counters below.
	 */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	/* The xfrm forward policy check is skipped when disable_policy
	 * is set globally or on the input device.
	 */
	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		/* NOTE(review): ip6_forward_proxy_check() calls
		 * pskb_may_pull(); hdr is dereferenced again further
		 * down - confirm it cannot be stale at that point.
		 */
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	/* Re-read the dst: it is fetched again after the xfrm route step. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		/* Redirect target: the gateway for gateway routes,
		 * otherwise the destination itself.
		 */
		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	/* Never use an MTU below the IPv6 minimum. */
	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* The header pointer is re-read after skb_cow(). */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	/* Address errors: count them, then fall through to the drop. */
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}
671 
672 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
673 {
674 	to->pkt_type = from->pkt_type;
675 	to->priority = from->priority;
676 	to->protocol = from->protocol;
677 	skb_dst_drop(to);
678 	skb_dst_set(to, dst_clone(skb_dst(from)));
679 	to->dev = from->dev;
680 	to->mark = from->mark;
681 
682 	skb_copy_hash(to, from);
683 
684 #ifdef CONFIG_NET_SCHED
685 	to->tc_index = from->tc_index;
686 #endif
687 	nf_copy(to, from);
688 	skb_ext_copy(to, from);
689 	skb_copy_secmark(to, from);
690 }
691 
/*
 * Prepare fast-path fragmentation of an skb that carries a frag_list.
 *
 * @skb:     head skb; becomes the first fragment
 * @hlen:    length of the headers that precede the fragment header
 * @prevhdr: points at the "next header" byte to overwrite with
 *           NEXTHDR_FRAGMENT
 * @nexthdr: value that byte held; stored in the fragment header
 * @frag_id: fragment identification
 * @iter:    iterator state filled in for ip6_fraglist_prepare()
 *
 * Saves a copy of the first @hlen header bytes in iter->tmp_hdr (the
 * caller must kfree() it), detaches the frag_list into iter->frag and
 * inserts a fragment header with offset 0 and the MF bit set into @skb.
 *
 * Returns 0 on success or -ENOMEM if the header copy cannot be allocated.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	/* Patch the header chain first so the saved copy already
	 * announces a fragment header.
	 */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	/* Take the fragment chain off the head skb. */
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Open a gap for the fragment header between the headers and the
	 * payload: strip the headers, push the fragment header, then push
	 * the header space again and restore the headers from the copy.
	 */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	/* The head skb now only accounts for its own linear and paged
	 * data, not for the detached fragments.
	 */
	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
732 
/*
 * Turn the next skb of the detached frag_list (iter->frag) into a
 * complete fragment: prepend the saved headers and a fragment header,
 * and copy the metadata from @skb, the fragment that precedes it.
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	/* The transport header is set where the data currently starts,
	 * before any header is pushed.
	 */
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* Advance by the payload carried in the previous fragment. */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	/* Every fragment but the last carries the "more fragments" bit. */
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
757 
758 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
759 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
760 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
761 {
762 	state->prevhdr = prevhdr;
763 	state->nexthdr = nexthdr;
764 	state->frag_id = frag_id;
765 
766 	state->hlen = hlen;
767 	state->mtu = mtu;
768 
769 	state->left = skb->len - hlen;	/* Space per frame */
770 	state->ptr = hlen;		/* Where to start from */
771 
772 	state->hroom = hdr_room;
773 	state->troom = needed_tailroom;
774 
775 	state->offset = 0;
776 }
777 EXPORT_SYMBOL(ip6_frag_init);
778 
/*
 * Build the next fragment of @skb on the slow path.
 *
 * Allocates a new skb, copies the headers and up to state->mtu bytes of
 * payload into it, fills in the fragment header and advances @state.
 *
 * Returns the new fragment, or ERR_PTR(-ENOMEM) if allocation fails.
 * @skb itself is neither modified nor freed here.
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	/* The fragment header follows the copied headers; the payload
	 * (transport header) follows the fragment header.
	 */
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Patch, in the copy, the "next header" byte that prevhdr
	 * designates in the original.
	 */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	/* More data remains, so this is not the last fragment. */
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
854 
/*
 * Fragment @skb and pass every fragment to @output.
 *
 * The fast path is taken when the skb carries a frag_list whose members
 * already have the geometry of valid fragments: headers are prepended
 * in place and no payload is copied.  Otherwise the slow path allocates
 * a new skb per fragment and copies the data.
 *
 * Returns 0 on success.  On failure a negative errno (or the error
 * reported by @output) is returned and IPSTATS_MIB_FRAGFAILS is bumped;
 * -EMSGSIZE additionally triggers an ICMPv6 Packet Too Big.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	/* Socket settings are only consulted outside of a device xmit
	 * recursion.
	 */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	bool mono_delivery_time = skb->mono_delivery_time;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	/* On success the return value is the length of the headers that
	 * precede the fragment header, and prevhdr points at the "next
	 * header" byte to patch.
	 */
	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	/* Remember prevhdr as an offset so it can be recomputed below. */
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	/* A socket-configured fragment size may only lower the MTU. */
	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	/* Each fragment must be able to carry at least 8 payload bytes. */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	/* From here on mtu is the payload capacity of one fragment. */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	/* Resolve a pending partial checksum before splitting the packet. */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	/* Recompute prevhdr from the saved offset. */
	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		/* The head skb must itself qualify as a first fragment. */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			/* Move the socket charge for this fragment from
			 * the head skb to the fragment itself.
			 */
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		/* Output failed: free the fragments not yet handed over. */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		/* Undo the ownership transfer for the fragments visited
		 * before the one that failed the checks.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	/* All data was copied into fragments; release the original. */
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
1042 
1043 static inline int ip6_rt_check(const struct rt6key *rt_key,
1044 			       const struct in6_addr *fl_addr,
1045 			       const struct in6_addr *addr_cache)
1046 {
1047 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1048 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1049 }
1050 
1051 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1052 					  struct dst_entry *dst,
1053 					  const struct flowi6 *fl6)
1054 {
1055 	struct ipv6_pinfo *np = inet6_sk(sk);
1056 	struct rt6_info *rt;
1057 
1058 	if (!dst)
1059 		goto out;
1060 
1061 	if (dst->ops->family != AF_INET6) {
1062 		dst_release(dst);
1063 		return NULL;
1064 	}
1065 
1066 	rt = dst_rt6_info(dst);
1067 	/* Yes, checking route validity in not connected
1068 	 * case is not very simple. Take into account,
1069 	 * that we do not support routing by source, TOS,
1070 	 * and MSG_DONTROUTE		--ANK (980726)
1071 	 *
1072 	 * 1. ip6_rt_check(): If route was host route,
1073 	 *    check that cached destination is current.
1074 	 *    If it is network route, we still may
1075 	 *    check its validity using saved pointer
1076 	 *    to the last used address: daddr_cache.
1077 	 *    We do not want to save whole address now,
1078 	 *    (because main consumer of this service
1079 	 *    is tcp, which has not this problem),
1080 	 *    so that the last trick works only on connected
1081 	 *    sockets.
1082 	 * 2. oif also should be the same.
1083 	 */
1084 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1085 #ifdef CONFIG_IPV6_SUBTREES
1086 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1087 #endif
1088 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1089 		dst_release(dst);
1090 		dst = NULL;
1091 	}
1092 
1093 out:
1094 	return dst;
1095 }
1096 
1097 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1098 			       struct dst_entry **dst, struct flowi6 *fl6)
1099 {
1100 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1101 	struct neighbour *n;
1102 	struct rt6_info *rt;
1103 #endif
1104 	int err;
1105 	int flags = 0;
1106 
1107 	/* The correct way to handle this would be to do
1108 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1109 	 * the route-specific preferred source forces the
1110 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1111 	 *
1112 	 * In source specific routing (no src=any default route),
1113 	 * ip6_route_output will fail given src=any saddr, though, so
1114 	 * that's why we try it again later.
1115 	 */
1116 	if (ipv6_addr_any(&fl6->saddr)) {
1117 		struct fib6_info *from;
1118 		struct rt6_info *rt;
1119 
1120 		*dst = ip6_route_output(net, sk, fl6);
1121 		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);
1122 
1123 		rcu_read_lock();
1124 		from = rt ? rcu_dereference(rt->from) : NULL;
1125 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1126 					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
1127 					  &fl6->saddr);
1128 		rcu_read_unlock();
1129 
1130 		if (err)
1131 			goto out_err_release;
1132 
1133 		/* If we had an erroneous initial result, pretend it
1134 		 * never existed and let the SA-enabled version take
1135 		 * over.
1136 		 */
1137 		if ((*dst)->error) {
1138 			dst_release(*dst);
1139 			*dst = NULL;
1140 		}
1141 
1142 		if (fl6->flowi6_oif)
1143 			flags |= RT6_LOOKUP_F_IFACE;
1144 	}
1145 
1146 	if (!*dst)
1147 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1148 
1149 	err = (*dst)->error;
1150 	if (err)
1151 		goto out_err_release;
1152 
1153 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1154 	/*
1155 	 * Here if the dst entry we've looked up
1156 	 * has a neighbour entry that is in the INCOMPLETE
1157 	 * state and the src address from the flow is
1158 	 * marked as OPTIMISTIC, we release the found
1159 	 * dst entry and replace it instead with the
1160 	 * dst entry of the nexthop router
1161 	 */
1162 	rt = dst_rt6_info(*dst);
1163 	rcu_read_lock();
1164 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1165 				      rt6_nexthop(rt, &fl6->daddr));
1166 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1167 	rcu_read_unlock();
1168 
1169 	if (err) {
1170 		struct inet6_ifaddr *ifp;
1171 		struct flowi6 fl_gw6;
1172 		int redirect;
1173 
1174 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1175 				      (*dst)->dev, 1);
1176 
1177 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1178 		if (ifp)
1179 			in6_ifa_put(ifp);
1180 
1181 		if (redirect) {
1182 			/*
1183 			 * We need to get the dst entry for the
1184 			 * default router instead
1185 			 */
1186 			dst_release(*dst);
1187 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1188 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1189 			*dst = ip6_route_output(net, sk, &fl_gw6);
1190 			err = (*dst)->error;
1191 			if (err)
1192 				goto out_err_release;
1193 		}
1194 	}
1195 #endif
1196 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1197 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1198 		err = -EAFNOSUPPORT;
1199 		goto out_err_release;
1200 	}
1201 
1202 	return 0;
1203 
1204 out_err_release:
1205 	dst_release(*dst);
1206 	*dst = NULL;
1207 
1208 	if (err == -ENETUNREACH)
1209 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1210 	return err;
1211 }
1212 
1213 /**
1214  *	ip6_dst_lookup - perform route lookup on flow
1215  *	@net: Network namespace to perform lookup in
1216  *	@sk: socket which provides route info
1217  *	@dst: pointer to dst_entry * for result
1218  *	@fl6: flow to lookup
1219  *
1220  *	This function performs a route lookup on the given flow.
1221  *
1222  *	It returns zero on success, or a standard errno code on error.
1223  */
1224 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1225 		   struct flowi6 *fl6)
1226 {
1227 	*dst = NULL;
1228 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1229 }
1230 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1231 
1232 /**
1233  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1234  *	@net: Network namespace to perform lookup in
1235  *	@sk: socket which provides route info
1236  *	@fl6: flow to lookup
1237  *	@final_dst: final destination address for ipsec lookup
1238  *
1239  *	This function performs a route lookup on the given flow.
1240  *
1241  *	It returns a valid dst pointer on success, or a pointer encoded
1242  *	error code.
1243  */
1244 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1245 				      const struct in6_addr *final_dst)
1246 {
1247 	struct dst_entry *dst = NULL;
1248 	int err;
1249 
1250 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1251 	if (err)
1252 		return ERR_PTR(err);
1253 	if (final_dst)
1254 		fl6->daddr = *final_dst;
1255 
1256 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1257 }
1258 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1259 
1260 /**
1261  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1262  *	@sk: socket which provides the dst cache and route info
1263  *	@fl6: flow to lookup
1264  *	@final_dst: final destination address for ipsec lookup
1265  *	@connected: whether @sk is connected or not
1266  *
1267  *	This function performs a route lookup on the given flow with the
1268  *	possibility of using the cached route in the socket if it is valid.
1269  *	It will take the socket dst lock when operating on the dst cache.
1270  *	As a result, this function can only be used in process context.
1271  *
1272  *	In addition, for a connected socket, cache the dst in the socket
1273  *	if the current cache is not valid.
1274  *
1275  *	It returns a valid dst pointer on success, or a pointer encoded
1276  *	error code.
1277  */
1278 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1279 					 const struct in6_addr *final_dst,
1280 					 bool connected)
1281 {
1282 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1283 
1284 	dst = ip6_sk_dst_check(sk, dst, fl6);
1285 	if (dst)
1286 		return dst;
1287 
1288 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1289 	if (connected && !IS_ERR(dst))
1290 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1291 
1292 	return dst;
1293 }
1294 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1295 
1296 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1297 					       gfp_t gfp)
1298 {
1299 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1300 }
1301 
1302 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1303 						gfp_t gfp)
1304 {
1305 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1306 }
1307 
1308 static void ip6_append_data_mtu(unsigned int *mtu,
1309 				int *maxfraglen,
1310 				unsigned int fragheaderlen,
1311 				struct sk_buff *skb,
1312 				struct rt6_info *rt,
1313 				unsigned int orig_mtu)
1314 {
1315 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1316 		if (!skb) {
1317 			/* first fragment, reserve header_len */
1318 			*mtu = orig_mtu - rt->dst.header_len;
1319 
1320 		} else {
1321 			/*
1322 			 * this fragment is not first, the headers
1323 			 * space is regarded as data space.
1324 			 */
1325 			*mtu = orig_mtu;
1326 		}
1327 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1328 			      + fragheaderlen - sizeof(struct frag_hdr);
1329 	}
1330 }
1331 
1332 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1333 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1334 			  struct rt6_info *rt)
1335 {
1336 	struct ipv6_pinfo *np = inet6_sk(sk);
1337 	unsigned int mtu, frag_size;
1338 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1339 
1340 	/* callers pass dst together with a reference, set it first so
1341 	 * ip6_cork_release() can put it down even in case of an error.
1342 	 */
1343 	cork->base.dst = &rt->dst;
1344 
1345 	/*
1346 	 * setup for corking
1347 	 */
1348 	if (opt) {
1349 		if (WARN_ON(v6_cork->opt))
1350 			return -EINVAL;
1351 
1352 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1353 		if (unlikely(!nopt))
1354 			return -ENOBUFS;
1355 
1356 		nopt->tot_len = sizeof(*opt);
1357 		nopt->opt_flen = opt->opt_flen;
1358 		nopt->opt_nflen = opt->opt_nflen;
1359 
1360 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1361 		if (opt->dst0opt && !nopt->dst0opt)
1362 			return -ENOBUFS;
1363 
1364 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1365 		if (opt->dst1opt && !nopt->dst1opt)
1366 			return -ENOBUFS;
1367 
1368 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1369 		if (opt->hopopt && !nopt->hopopt)
1370 			return -ENOBUFS;
1371 
1372 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1373 		if (opt->srcrt && !nopt->srcrt)
1374 			return -ENOBUFS;
1375 
1376 		/* need source address above miyazawa*/
1377 	}
1378 	v6_cork->hop_limit = ipc6->hlimit;
1379 	v6_cork->tclass = ipc6->tclass;
1380 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1381 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1382 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1383 	else
1384 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1385 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1386 
1387 	frag_size = READ_ONCE(np->frag_size);
1388 	if (frag_size && frag_size < mtu)
1389 		mtu = frag_size;
1390 
1391 	cork->base.fragsize = mtu;
1392 	cork->base.gso_size = ipc6->gso_size;
1393 	cork->base.tx_flags = 0;
1394 	cork->base.mark = ipc6->sockc.mark;
1395 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1396 
1397 	cork->base.length = 0;
1398 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1399 
1400 	return 0;
1401 }
1402 
1403 static int __ip6_append_data(struct sock *sk,
1404 			     struct sk_buff_head *queue,
1405 			     struct inet_cork_full *cork_full,
1406 			     struct inet6_cork *v6_cork,
1407 			     struct page_frag *pfrag,
1408 			     int getfrag(void *from, char *to, int offset,
1409 					 int len, int odd, struct sk_buff *skb),
1410 			     void *from, size_t length, int transhdrlen,
1411 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1412 {
1413 	struct sk_buff *skb, *skb_prev = NULL;
1414 	struct inet_cork *cork = &cork_full->base;
1415 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1416 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1417 	struct ubuf_info *uarg = NULL;
1418 	int exthdrlen = 0;
1419 	int dst_exthdrlen = 0;
1420 	int hh_len;
1421 	int copy;
1422 	int err;
1423 	int offset = 0;
1424 	bool zc = false;
1425 	u32 tskey = 0;
1426 	struct rt6_info *rt = dst_rt6_info(cork->dst);
1427 	bool paged, hold_tskey, extra_uref = false;
1428 	struct ipv6_txoptions *opt = v6_cork->opt;
1429 	int csummode = CHECKSUM_NONE;
1430 	unsigned int maxnonfragsize, headersize;
1431 	unsigned int wmem_alloc_delta = 0;
1432 
1433 	skb = skb_peek_tail(queue);
1434 	if (!skb) {
1435 		exthdrlen = opt ? opt->opt_flen : 0;
1436 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1437 	}
1438 
1439 	paged = !!cork->gso_size;
1440 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1441 	orig_mtu = mtu;
1442 
1443 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1444 
1445 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1446 			(opt ? opt->opt_nflen : 0);
1447 
1448 	headersize = sizeof(struct ipv6hdr) +
1449 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1450 		     rt->rt6i_nfheader_len;
1451 
1452 	if (mtu <= fragheaderlen ||
1453 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1454 		goto emsgsize;
1455 
1456 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1457 		     sizeof(struct frag_hdr);
1458 
1459 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1460 	 * the first fragment
1461 	 */
1462 	if (headersize + transhdrlen > mtu)
1463 		goto emsgsize;
1464 
1465 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1466 	    (sk->sk_protocol == IPPROTO_UDP ||
1467 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1468 	     sk->sk_protocol == IPPROTO_RAW)) {
1469 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1470 				sizeof(struct ipv6hdr));
1471 		goto emsgsize;
1472 	}
1473 
1474 	if (ip6_sk_ignore_df(sk))
1475 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1476 	else
1477 		maxnonfragsize = mtu;
1478 
1479 	if (cork->length + length > maxnonfragsize - headersize) {
1480 emsgsize:
1481 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1482 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1483 		return -EMSGSIZE;
1484 	}
1485 
1486 	/* CHECKSUM_PARTIAL only with no extension headers and when
1487 	 * we are not going to fragment
1488 	 */
1489 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1490 	    headersize == sizeof(struct ipv6hdr) &&
1491 	    length <= mtu - headersize &&
1492 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1493 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1494 		csummode = CHECKSUM_PARTIAL;
1495 
1496 	if ((flags & MSG_ZEROCOPY) && length) {
1497 		struct msghdr *msg = from;
1498 
1499 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1500 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1501 				return -EINVAL;
1502 
1503 			/* Leave uarg NULL if can't zerocopy, callers should
1504 			 * be able to handle it.
1505 			 */
1506 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1507 			    csummode == CHECKSUM_PARTIAL) {
1508 				paged = true;
1509 				zc = true;
1510 				uarg = msg->msg_ubuf;
1511 			}
1512 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1513 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1514 			if (!uarg)
1515 				return -ENOBUFS;
1516 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1517 			if (rt->dst.dev->features & NETIF_F_SG &&
1518 			    csummode == CHECKSUM_PARTIAL) {
1519 				paged = true;
1520 				zc = true;
1521 			} else {
1522 				uarg_to_msgzc(uarg)->zerocopy = 0;
1523 				skb_zcopy_set(skb, uarg, &extra_uref);
1524 			}
1525 		}
1526 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1527 		if (inet_test_bit(HDRINCL, sk))
1528 			return -EPERM;
1529 		if (rt->dst.dev->features & NETIF_F_SG &&
1530 		    getfrag == ip_generic_getfrag)
1531 			/* We need an empty buffer to attach stuff to */
1532 			paged = true;
1533 		else
1534 			flags &= ~MSG_SPLICE_PAGES;
1535 	}
1536 
1537 	hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP &&
1538 		     READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID;
1539 	if (hold_tskey)
1540 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1541 
1542 	/*
1543 	 * Let's try using as much space as possible.
1544 	 * Use MTU if total length of the message fits into the MTU.
1545 	 * Otherwise, we need to reserve fragment header and
1546 	 * fragment alignment (= 8-15 octects, in total).
1547 	 *
1548 	 * Note that we may need to "move" the data from the tail
1549 	 * of the buffer to the new fragment when we split
1550 	 * the message.
1551 	 *
1552 	 * FIXME: It may be fragmented into multiple chunks
1553 	 *        at once if non-fragmentable extension headers
1554 	 *        are too large.
1555 	 * --yoshfuji
1556 	 */
1557 
1558 	cork->length += length;
1559 	if (!skb)
1560 		goto alloc_new_skb;
1561 
1562 	while (length > 0) {
1563 		/* Check if the remaining data fits into current packet. */
1564 		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
1565 		if (copy < length)
1566 			copy = maxfraglen - skb->len;
1567 
1568 		if (copy <= 0) {
1569 			char *data;
1570 			unsigned int datalen;
1571 			unsigned int fraglen;
1572 			unsigned int fraggap;
1573 			unsigned int alloclen, alloc_extra;
1574 			unsigned int pagedlen;
1575 alloc_new_skb:
1576 			/* There's no room in the current skb */
1577 			if (skb)
1578 				fraggap = skb->len - maxfraglen;
1579 			else
1580 				fraggap = 0;
1581 			/* update mtu and maxfraglen if necessary */
1582 			if (!skb || !skb_prev)
1583 				ip6_append_data_mtu(&mtu, &maxfraglen,
1584 						    fragheaderlen, skb, rt,
1585 						    orig_mtu);
1586 
1587 			skb_prev = skb;
1588 
1589 			/*
1590 			 * If remaining data exceeds the mtu,
1591 			 * we know we need more fragment(s).
1592 			 */
1593 			datalen = length + fraggap;
1594 
1595 			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
1596 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1597 			fraglen = datalen + fragheaderlen;
1598 			pagedlen = 0;
1599 
1600 			alloc_extra = hh_len;
1601 			alloc_extra += dst_exthdrlen;
1602 			alloc_extra += rt->dst.trailer_len;
1603 
1604 			/* We just reserve space for fragment header.
1605 			 * Note: this may be overallocation if the message
1606 			 * (without MSG_MORE) fits into the MTU.
1607 			 */
1608 			alloc_extra += sizeof(struct frag_hdr);
1609 
1610 			if ((flags & MSG_MORE) &&
1611 			    !(rt->dst.dev->features&NETIF_F_SG))
1612 				alloclen = mtu;
1613 			else if (!paged &&
1614 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1615 				  !(rt->dst.dev->features & NETIF_F_SG)))
1616 				alloclen = fraglen;
1617 			else {
1618 				alloclen = fragheaderlen + transhdrlen;
1619 				pagedlen = datalen - transhdrlen;
1620 			}
1621 			alloclen += alloc_extra;
1622 
1623 			if (datalen != length + fraggap) {
1624 				/*
1625 				 * this is not the last fragment, the trailer
1626 				 * space is regarded as data space.
1627 				 */
1628 				datalen += rt->dst.trailer_len;
1629 			}
1630 
1631 			fraglen = datalen + fragheaderlen;
1632 
1633 			copy = datalen - transhdrlen - fraggap - pagedlen;
1634 			/* [!] NOTE: copy may be negative if pagedlen>0
1635 			 * because then the equation may reduces to -fraggap.
1636 			 */
1637 			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1638 				err = -EINVAL;
1639 				goto error;
1640 			}
1641 			if (transhdrlen) {
1642 				skb = sock_alloc_send_skb(sk, alloclen,
1643 						(flags & MSG_DONTWAIT), &err);
1644 			} else {
1645 				skb = NULL;
1646 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1647 				    2 * sk->sk_sndbuf)
1648 					skb = alloc_skb(alloclen,
1649 							sk->sk_allocation);
1650 				if (unlikely(!skb))
1651 					err = -ENOBUFS;
1652 			}
1653 			if (!skb)
1654 				goto error;
1655 			/*
1656 			 *	Fill in the control structures
1657 			 */
1658 			skb->protocol = htons(ETH_P_IPV6);
1659 			skb->ip_summed = csummode;
1660 			skb->csum = 0;
1661 			/* reserve for fragmentation and ipsec header */
1662 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1663 				    dst_exthdrlen);
1664 
1665 			/*
1666 			 *	Find where to start putting bytes
1667 			 */
1668 			data = skb_put(skb, fraglen - pagedlen);
1669 			skb_set_network_header(skb, exthdrlen);
1670 			data += fragheaderlen;
1671 			skb->transport_header = (skb->network_header +
1672 						 fragheaderlen);
1673 			if (fraggap) {
1674 				skb->csum = skb_copy_and_csum_bits(
1675 					skb_prev, maxfraglen,
1676 					data + transhdrlen, fraggap);
1677 				skb_prev->csum = csum_sub(skb_prev->csum,
1678 							  skb->csum);
1679 				data += fraggap;
1680 				pskb_trim_unique(skb_prev, maxfraglen);
1681 			}
1682 			if (copy > 0 &&
1683 			    getfrag(from, data + transhdrlen, offset,
1684 				    copy, fraggap, skb) < 0) {
1685 				err = -EFAULT;
1686 				kfree_skb(skb);
1687 				goto error;
1688 			} else if (flags & MSG_SPLICE_PAGES) {
1689 				copy = 0;
1690 			}
1691 
1692 			offset += copy;
1693 			length -= copy + transhdrlen;
1694 			transhdrlen = 0;
1695 			exthdrlen = 0;
1696 			dst_exthdrlen = 0;
1697 
1698 			/* Only the initial fragment is time stamped */
1699 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1700 			cork->tx_flags = 0;
1701 			skb_shinfo(skb)->tskey = tskey;
1702 			tskey = 0;
1703 			skb_zcopy_set(skb, uarg, &extra_uref);
1704 
1705 			if ((flags & MSG_CONFIRM) && !skb_prev)
1706 				skb_set_dst_pending_confirm(skb, 1);
1707 
1708 			/*
1709 			 * Put the packet on the pending queue
1710 			 */
1711 			if (!skb->destructor) {
1712 				skb->destructor = sock_wfree;
1713 				skb->sk = sk;
1714 				wmem_alloc_delta += skb->truesize;
1715 			}
1716 			__skb_queue_tail(queue, skb);
1717 			continue;
1718 		}
1719 
1720 		if (copy > length)
1721 			copy = length;
1722 
1723 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1724 		    skb_tailroom(skb) >= copy) {
1725 			unsigned int off;
1726 
1727 			off = skb->len;
1728 			if (getfrag(from, skb_put(skb, copy),
1729 						offset, copy, off, skb) < 0) {
1730 				__skb_trim(skb, off);
1731 				err = -EFAULT;
1732 				goto error;
1733 			}
1734 		} else if (flags & MSG_SPLICE_PAGES) {
1735 			struct msghdr *msg = from;
1736 
1737 			err = -EIO;
1738 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1739 				goto error;
1740 
1741 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1742 						   sk->sk_allocation);
1743 			if (err < 0)
1744 				goto error;
1745 			copy = err;
1746 			wmem_alloc_delta += copy;
1747 		} else if (!zc) {
1748 			int i = skb_shinfo(skb)->nr_frags;
1749 
1750 			err = -ENOMEM;
1751 			if (!sk_page_frag_refill(sk, pfrag))
1752 				goto error;
1753 
1754 			skb_zcopy_downgrade_managed(skb);
1755 			if (!skb_can_coalesce(skb, i, pfrag->page,
1756 					      pfrag->offset)) {
1757 				err = -EMSGSIZE;
1758 				if (i == MAX_SKB_FRAGS)
1759 					goto error;
1760 
1761 				__skb_fill_page_desc(skb, i, pfrag->page,
1762 						     pfrag->offset, 0);
1763 				skb_shinfo(skb)->nr_frags = ++i;
1764 				get_page(pfrag->page);
1765 			}
1766 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1767 			if (getfrag(from,
1768 				    page_address(pfrag->page) + pfrag->offset,
1769 				    offset, copy, skb->len, skb) < 0)
1770 				goto error_efault;
1771 
1772 			pfrag->offset += copy;
1773 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1774 			skb->len += copy;
1775 			skb->data_len += copy;
1776 			skb->truesize += copy;
1777 			wmem_alloc_delta += copy;
1778 		} else {
1779 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1780 			if (err < 0)
1781 				goto error;
1782 		}
1783 		offset += copy;
1784 		length -= copy;
1785 	}
1786 
1787 	if (wmem_alloc_delta)
1788 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1789 	return 0;
1790 
1791 error_efault:
1792 	err = -EFAULT;
1793 error:
1794 	net_zcopy_put_abort(uarg, extra_uref);
1795 	cork->length -= length;
1796 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1797 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1798 	if (hold_tskey)
1799 		atomic_dec(&sk->sk_tskey);
1800 	return err;
1801 }
1802 
1803 int ip6_append_data(struct sock *sk,
1804 		    int getfrag(void *from, char *to, int offset, int len,
1805 				int odd, struct sk_buff *skb),
1806 		    void *from, size_t length, int transhdrlen,
1807 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1808 		    struct rt6_info *rt, unsigned int flags)
1809 {
1810 	struct inet_sock *inet = inet_sk(sk);
1811 	struct ipv6_pinfo *np = inet6_sk(sk);
1812 	int exthdrlen;
1813 	int err;
1814 
1815 	if (flags&MSG_PROBE)
1816 		return 0;
1817 	if (skb_queue_empty(&sk->sk_write_queue)) {
1818 		/*
1819 		 * setup for corking
1820 		 */
1821 		dst_hold(&rt->dst);
1822 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1823 				     ipc6, rt);
1824 		if (err)
1825 			return err;
1826 
1827 		inet->cork.fl.u.ip6 = *fl6;
1828 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1829 		length += exthdrlen;
1830 		transhdrlen += exthdrlen;
1831 	} else {
1832 		transhdrlen = 0;
1833 	}
1834 
1835 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1836 				 &np->cork, sk_page_frag(sk), getfrag,
1837 				 from, length, transhdrlen, flags, ipc6);
1838 }
1839 EXPORT_SYMBOL_GPL(ip6_append_data);
1840 
1841 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1842 {
1843 	struct dst_entry *dst = cork->base.dst;
1844 
1845 	cork->base.dst = NULL;
1846 	skb_dst_set(skb, dst);
1847 }
1848 
1849 static void ip6_cork_release(struct inet_cork_full *cork,
1850 			     struct inet6_cork *v6_cork)
1851 {
1852 	if (v6_cork->opt) {
1853 		struct ipv6_txoptions *opt = v6_cork->opt;
1854 
1855 		kfree(opt->dst0opt);
1856 		kfree(opt->dst1opt);
1857 		kfree(opt->hopopt);
1858 		kfree(opt->srcrt);
1859 		kfree(opt);
1860 		v6_cork->opt = NULL;
1861 	}
1862 
1863 	if (cork->base.dst) {
1864 		dst_release(cork->base.dst);
1865 		cork->base.dst = NULL;
1866 	}
1867 }
1868 
1869 struct sk_buff *__ip6_make_skb(struct sock *sk,
1870 			       struct sk_buff_head *queue,
1871 			       struct inet_cork_full *cork,
1872 			       struct inet6_cork *v6_cork)
1873 {
1874 	struct sk_buff *skb, *tmp_skb;
1875 	struct sk_buff **tail_skb;
1876 	struct in6_addr *final_dst;
1877 	struct net *net = sock_net(sk);
1878 	struct ipv6hdr *hdr;
1879 	struct ipv6_txoptions *opt = v6_cork->opt;
1880 	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
1881 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1882 	unsigned char proto = fl6->flowi6_proto;
1883 
1884 	skb = __skb_dequeue(queue);
1885 	if (!skb)
1886 		goto out;
1887 	tail_skb = &(skb_shinfo(skb)->frag_list);
1888 
1889 	/* move skb->data to ip header from ext header */
1890 	if (skb->data < skb_network_header(skb))
1891 		__skb_pull(skb, skb_network_offset(skb));
1892 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1893 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1894 		*tail_skb = tmp_skb;
1895 		tail_skb = &(tmp_skb->next);
1896 		skb->len += tmp_skb->len;
1897 		skb->data_len += tmp_skb->len;
1898 		skb->truesize += tmp_skb->truesize;
1899 		tmp_skb->destructor = NULL;
1900 		tmp_skb->sk = NULL;
1901 	}
1902 
1903 	/* Allow local fragmentation. */
1904 	skb->ignore_df = ip6_sk_ignore_df(sk);
1905 	__skb_pull(skb, skb_network_header_len(skb));
1906 
1907 	final_dst = &fl6->daddr;
1908 	if (opt && opt->opt_flen)
1909 		ipv6_push_frag_opts(skb, opt, &proto);
1910 	if (opt && opt->opt_nflen)
1911 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1912 
1913 	skb_push(skb, sizeof(struct ipv6hdr));
1914 	skb_reset_network_header(skb);
1915 	hdr = ipv6_hdr(skb);
1916 
1917 	ip6_flow_hdr(hdr, v6_cork->tclass,
1918 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1919 					ip6_autoflowlabel(net, sk), fl6));
1920 	hdr->hop_limit = v6_cork->hop_limit;
1921 	hdr->nexthdr = proto;
1922 	hdr->saddr = fl6->saddr;
1923 	hdr->daddr = *final_dst;
1924 
1925 	skb->priority = READ_ONCE(sk->sk_priority);
1926 	skb->mark = cork->base.mark;
1927 	skb->tstamp = cork->base.transmit_time;
1928 
1929 	ip6_cork_steal_dst(skb, cork);
1930 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1931 	if (proto == IPPROTO_ICMPV6) {
1932 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1933 		u8 icmp6_type;
1934 
1935 		if (sk->sk_socket->type == SOCK_RAW &&
1936 		   !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
1937 			icmp6_type = fl6->fl6_icmp_type;
1938 		else
1939 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
1940 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1941 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1942 	}
1943 
1944 	ip6_cork_release(cork, v6_cork);
1945 out:
1946 	return skb;
1947 }
1948 
1949 int ip6_send_skb(struct sk_buff *skb)
1950 {
1951 	struct net *net = sock_net(skb->sk);
1952 	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
1953 	int err;
1954 
1955 	err = ip6_local_out(net, skb->sk, skb);
1956 	if (err) {
1957 		if (err > 0)
1958 			err = net_xmit_errno(err);
1959 		if (err)
1960 			IP6_INC_STATS(net, rt->rt6i_idev,
1961 				      IPSTATS_MIB_OUTDISCARDS);
1962 	}
1963 
1964 	return err;
1965 }
1966 
/*
 * Finish the socket's pending data into one skb and send it.
 * Returns 0 when ip6_finish_skb() yields nothing, otherwise the result
 * of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	if (skb)
		return ip6_send_skb(skb);

	return 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1978 
1979 static void __ip6_flush_pending_frames(struct sock *sk,
1980 				       struct sk_buff_head *queue,
1981 				       struct inet_cork_full *cork,
1982 				       struct inet6_cork *v6_cork)
1983 {
1984 	struct sk_buff *skb;
1985 
1986 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1987 		if (skb_dst(skb))
1988 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1989 				      IPSTATS_MIB_OUTDISCARDS);
1990 		kfree_skb(skb);
1991 	}
1992 
1993 	ip6_cork_release(cork, v6_cork);
1994 }
1995 
1996 void ip6_flush_pending_frames(struct sock *sk)
1997 {
1998 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1999 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2000 }
2001 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2002 
2003 struct sk_buff *ip6_make_skb(struct sock *sk,
2004 			     int getfrag(void *from, char *to, int offset,
2005 					 int len, int odd, struct sk_buff *skb),
2006 			     void *from, size_t length, int transhdrlen,
2007 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2008 			     unsigned int flags, struct inet_cork_full *cork)
2009 {
2010 	struct inet6_cork v6_cork;
2011 	struct sk_buff_head queue;
2012 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2013 	int err;
2014 
2015 	if (flags & MSG_PROBE) {
2016 		dst_release(&rt->dst);
2017 		return NULL;
2018 	}
2019 
2020 	__skb_queue_head_init(&queue);
2021 
2022 	cork->base.flags = 0;
2023 	cork->base.addr = 0;
2024 	cork->base.opt = NULL;
2025 	v6_cork.opt = NULL;
2026 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2027 	if (err) {
2028 		ip6_cork_release(cork, &v6_cork);
2029 		return ERR_PTR(err);
2030 	}
2031 	if (ipc6->dontfrag < 0)
2032 		ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);
2033 
2034 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2035 				&current->task_frag, getfrag, from,
2036 				length + exthdrlen, transhdrlen + exthdrlen,
2037 				flags, ipc6);
2038 	if (err) {
2039 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2040 		return ERR_PTR(err);
2041 	}
2042 
2043 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2044 }
2045