xref: /linux/net/ipv6/ip6_output.c (revision e7d759f31ca295d589f7420719c311870bb3166f)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
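/* Final transmit step for locally generated and forwarded packets:
 * grow the headroom for the link-layer header if needed, loop back a
 * copy of multicast packets that must also be delivered locally, hand
 * the skb to a lightweight tunnel if the route asks for it, resolve
 * the nexthop neighbour and queue the packet via neigh_output().
 */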
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		skb = skb_expand_head(skb, hh_len);
74 		if (!skb) {
75 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76 			return -ENOMEM;
77 		}
78 	}
79 
80 	hdr = ipv6_hdr(skb);
81 	daddr = &hdr->daddr;
82 	if (ipv6_addr_is_multicast(daddr)) {
83 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84 		    ((mroute6_is_socket(net, skb) &&
85 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 
89 			/* Do not check for IFF_ALLMULTI; multicast routing
90 			   is not supported in any case.
91 			 */
92 			if (newskb)
93 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94 					net, sk, newskb, NULL, newskb->dev,
95 					dev_loopback_xmit);
96 
97 			if (hdr->hop_limit == 0) {
98 				IP6_INC_STATS(net, idev,
99 					      IPSTATS_MIB_OUTDISCARDS);
100 				kfree_skb(skb);
101 				return 0;
102 			}
103 		}
104 
105 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107 		    !(dev->flags & IFF_LOOPBACK)) {
108 			kfree_skb(skb);
109 			return 0;
110 		}
111 	}
112 
113 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114 		int res = lwtunnel_xmit(skb);
115 
116 		if (res != LWTUNNEL_XMIT_CONTINUE)
117 			return res;
118 	}
119 
120 	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
121 
122 	rcu_read_lock();
123 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
124 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
125 
126 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
127 		if (unlikely(!neigh))
128 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
129 		if (IS_ERR(neigh)) {
130 			rcu_read_unlock();
131 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
132 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
133 			return -EINVAL;
134 		}
135 	}
136 	sock_confirm_neigh(skb, neigh);
137 	ret = neigh_output(neigh, skb, false);
138 	rcu_read_unlock();
139 	return ret;
140 }
141 
142 static int
143 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
144 				    struct sk_buff *skb, unsigned int mtu)
145 {
146 	struct sk_buff *segs, *nskb;
147 	netdev_features_t features;
148 	int ret = 0;
149 
150 	/* Please see corresponding comment in ip_finish_output_gso
151 	 * describing the cases where GSO segment length exceeds the
152 	 * egress MTU.
153 	 */
154 	features = netif_skb_features(skb);
155 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
156 	if (IS_ERR_OR_NULL(segs)) {
157 		kfree_skb(skb);
158 		return -ENOMEM;
159 	}
160 
161 	consume_skb(skb);
162 
163 	skb_list_walk_safe(segs, segs, nskb) {
164 		int err;
165 
166 		skb_mark_not_on_list(segs);
167 		/* Last GSO segment can be smaller than gso_size (and MTU).
168 		 * Adding a fragment header would produce an "atomic fragment",
169 		 * which is considered harmful (RFC-8021). Avoid that.
170 		 */
171 		err = segs->len > mtu ?
172 			ip6_fragment(net, sk, segs, ip6_finish_output2) :
173 			ip6_finish_output2(net, sk, segs);
174 		if (err && ret == 0)
175 			ret = err;
176 	}
177 
178 	return ret;
179 }
180 
181 static int ip6_finish_output_gso(struct net *net, struct sock *sk,
182 				 struct sk_buff *skb, unsigned int mtu)
183 {
184 	if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
185 	    !skb_gso_validate_network_len(skb, mtu))
186 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
187 
188 	return ip6_finish_output2(net, sk, skb);
189 }
190 
191 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
192 {
193 	unsigned int mtu;
194 
195 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
196 	/* Policy lookup after SNAT yielded a new policy */
197 	if (skb_dst(skb)->xfrm) {
198 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
199 		return dst_output(net, sk, skb);
200 	}
201 #endif
202 
203 	mtu = ip6_skb_dst_mtu(skb);
204 	if (skb_is_gso(skb))
205 		return ip6_finish_output_gso(net, sk, skb, mtu);
206 
207 	if (skb->len > mtu ||
208 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
209 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
210 
211 	return ip6_finish_output2(net, sk, skb);
212 }
213 
214 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
215 {
216 	int ret;
217 
218 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
219 	switch (ret) {
220 	case NET_XMIT_SUCCESS:
221 	case NET_XMIT_CN:
222 		return __ip6_finish_output(net, sk, skb) ? : ret;
223 	default:
224 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
225 		return ret;
226 	}
227 }
228 
229 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
230 {
231 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
232 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
233 
234 	skb->protocol = htons(ETH_P_IPV6);
235 	skb->dev = dev;
236 
237 	if (unlikely(idev->cnf.disable_ipv6)) {
238 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
239 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
240 		return 0;
241 	}
242 
243 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
244 			    net, sk, skb, indev, dev,
245 			    ip6_finish_output,
246 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
247 }
248 EXPORT_SYMBOL(ip6_output);
249 
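/* Whether an automatic flow label should be generated for this socket:
 * the per-socket IPV6_AUTOFLOWLABEL setting wins once it has been set,
 * otherwise fall back to the per-netns auto_flowlabels default.
 */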
250 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
251 {
252 	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
253 		return ip6_default_np_autolabel(net);
254 	return inet6_test_bit(AUTOFLOWLABEL, sk);
255 }
256 
257 /*
258  * xmit an sk_buff (used by TCP, SCTP and DCCP)
259  * Note : socket lock is not held for SYNACK packets, but might be modified
260  * by calls to skb_set_owner_w() and ipv6_local_error(),
261  * which are using proper atomic operations or spinlocks.
262  */
263 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
264 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
265 {
266 	struct net *net = sock_net(sk);
267 	const struct ipv6_pinfo *np = inet6_sk(sk);
268 	struct in6_addr *first_hop = &fl6->daddr;
269 	struct dst_entry *dst = skb_dst(skb);
270 	struct net_device *dev = dst->dev;
271 	struct inet6_dev *idev = ip6_dst_idev(dst);
272 	struct hop_jumbo_hdr *hop_jumbo;
273 	int hoplen = sizeof(*hop_jumbo);
274 	unsigned int head_room;
275 	struct ipv6hdr *hdr;
276 	u8  proto = fl6->flowi6_proto;
277 	int seg_len = skb->len;
278 	int hlimit = -1;
279 	u32 mtu;
280 
281 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
282 	if (opt)
283 		head_room += opt->opt_nflen + opt->opt_flen;
284 
285 	if (unlikely(head_room > skb_headroom(skb))) {
286 		skb = skb_expand_head(skb, head_room);
287 		if (!skb) {
288 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
289 			return -ENOBUFS;
290 		}
291 	}
292 
293 	if (opt) {
294 		seg_len += opt->opt_nflen + opt->opt_flen;
295 
296 		if (opt->opt_flen)
297 			ipv6_push_frag_opts(skb, opt, &proto);
298 
299 		if (opt->opt_nflen)
300 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
301 					     &fl6->saddr);
302 	}
303 
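	/* Payloads larger than IPV6_MAXPLEN cannot be represented in the
	 * 16-bit payload_len field.  Insert a hop-by-hop jumbogram option
	 * carrying the real length and mark the skb IP6SKB_FAKEJUMBO so
	 * the rest of the output path (e.g. ip6_finish_output_gso()) does
	 * not treat the packet as oversized; later stages are expected to
	 * strip the option again for devices that cannot take jumbograms.
	 */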
304 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
305 		hop_jumbo = skb_push(skb, hoplen);
306 
307 		hop_jumbo->nexthdr = proto;
308 		hop_jumbo->hdrlen = 0;
309 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
310 		hop_jumbo->tlv_len = 4;
311 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
312 
313 		proto = IPPROTO_HOPOPTS;
314 		seg_len = 0;
315 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
316 	}
317 
318 	skb_push(skb, sizeof(struct ipv6hdr));
319 	skb_reset_network_header(skb);
320 	hdr = ipv6_hdr(skb);
321 
322 	/*
323 	 *	Fill in the IPv6 header
324 	 */
325 	if (np)
326 		hlimit = READ_ONCE(np->hop_limit);
327 	if (hlimit < 0)
328 		hlimit = ip6_dst_hoplimit(dst);
329 
330 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
331 				ip6_autoflowlabel(net, sk), fl6));
332 
333 	hdr->payload_len = htons(seg_len);
334 	hdr->nexthdr = proto;
335 	hdr->hop_limit = hlimit;
336 
337 	hdr->saddr = fl6->saddr;
338 	hdr->daddr = *first_hop;
339 
340 	skb->protocol = htons(ETH_P_IPV6);
341 	skb->priority = priority;
342 	skb->mark = mark;
343 
344 	mtu = dst_mtu(dst);
345 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
346 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
347 
348 		/* if egress device is enslaved to an L3 master device pass the
349 		 * skb to its handler for processing
350 		 */
351 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
352 		if (unlikely(!skb))
353 			return 0;
354 
355 		/* hooks should never assume socket lock is held.
356 		 * we promote our socket to non const
357 		 */
358 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
359 			       net, (struct sock *)sk, skb, NULL, dev,
360 			       dst_output);
361 	}
362 
363 	skb->dev = dev;
364 	/* ipv6_local_error() does not require socket lock,
365 	 * we promote our socket to non const
366 	 */
367 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
368 
369 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
370 	kfree_skb(skb);
371 	return -EMSGSIZE;
372 }
373 EXPORT_SYMBOL(ip6_xmit);
374 
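/* Deliver a Router Alert packet to every raw socket that registered
 * for this alert value via IPV6_ROUTER_ALERT.  All matching sockets but
 * the last receive a clone; the last one consumes the original skb.
 * Returns 1 if the packet was handed to at least one socket.
 */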
375 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
376 {
377 	struct ip6_ra_chain *ra;
378 	struct sock *last = NULL;
379 
380 	read_lock(&ip6_ra_lock);
381 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
382 		struct sock *sk = ra->sk;
383 		if (sk && ra->sel == sel &&
384 		    (!sk->sk_bound_dev_if ||
385 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
386 
387 			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
388 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
389 				continue;
390 			}
391 			if (last) {
392 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
393 				if (skb2)
394 					rawv6_rcv(last, skb2);
395 			}
396 			last = sk;
397 		}
398 	}
399 
400 	if (last) {
401 		rawv6_rcv(last, skb);
402 		read_unlock(&ip6_ra_lock);
403 		return 1;
404 	}
405 	read_unlock(&ip6_ra_lock);
406 	return 0;
407 }
408 
409 static int ip6_forward_proxy_check(struct sk_buff *skb)
410 {
411 	struct ipv6hdr *hdr = ipv6_hdr(skb);
412 	u8 nexthdr = hdr->nexthdr;
413 	__be16 frag_off;
414 	int offset;
415 
416 	if (ipv6_ext_hdr(nexthdr)) {
417 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
418 		if (offset < 0)
419 			return 0;
420 	} else
421 		offset = sizeof(struct ipv6hdr);
422 
423 	if (nexthdr == IPPROTO_ICMPV6) {
424 		struct icmp6hdr *icmp6;
425 
426 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
427 					 offset + 1 - skb->data)))
428 			return 0;
429 
430 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
431 
432 		switch (icmp6->icmp6_type) {
433 		case NDISC_ROUTER_SOLICITATION:
434 		case NDISC_ROUTER_ADVERTISEMENT:
435 		case NDISC_NEIGHBOUR_SOLICITATION:
436 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
437 		case NDISC_REDIRECT:
438 			/* Unicast neighbour discovery messages destined to
439 			 * the proxied address are passed to the input
440 			 * function.
441 			 */
442 			return 1;
443 		default:
444 			break;
445 		}
446 	}
447 
448 	/*
449 	 * The proxying router can't forward traffic sent to a link-local
450 	 * address, so signal the sender and discard the packet. This
451 	 * behavior is clarified by the MIPv6 specification.
452 	 */
453 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
454 		dst_link_failure(skb);
455 		return -1;
456 	}
457 
458 	return 0;
459 }
460 
461 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
462 				     struct sk_buff *skb)
463 {
464 #ifdef CONFIG_NET_SWITCHDEV
465 	if (skb->offload_l3_fwd_mark) {
466 		consume_skb(skb);
467 		return 0;
468 	}
469 #endif
470 
471 	skb_clear_tstamp(skb);
472 	return dst_output(net, sk, skb);
473 }
474 
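/* Decide whether a packet being forwarded exceeds the path MTU and must
 * trigger an ICMPv6 Packet Too Big instead: honour the conntrack defrag
 * state (frag_max_size), the ignore_df escape hatch, and GSO packets
 * whose individual segments still fit the MTU.
 */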
475 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
476 {
477 	if (skb->len <= mtu)
478 		return false;
479 
480 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
481 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
482 		return true;
483 
484 	if (skb->ignore_df)
485 		return false;
486 
487 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
488 		return false;
489 
490 	return true;
491 }
492 
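/* Forwarding path proper: sanity and policy checks, Router Alert
 * delivery, hop-limit check (ICMPv6 Time Exceeded when it would drop to
 * zero), proxy NDP handling, xfrm forward policy, redirect generation
 * when the packet leaves through the interface it arrived on, and the
 * path-MTU check (ICMPv6 Packet Too Big).  Only then is the hop limit
 * decremented and the packet passed to the NF_INET_FORWARD hook.
 */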
493 int ip6_forward(struct sk_buff *skb)
494 {
495 	struct dst_entry *dst = skb_dst(skb);
496 	struct ipv6hdr *hdr = ipv6_hdr(skb);
497 	struct inet6_skb_parm *opt = IP6CB(skb);
498 	struct net *net = dev_net(dst->dev);
499 	struct inet6_dev *idev;
500 	SKB_DR(reason);
501 	u32 mtu;
502 
503 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
504 	if (net->ipv6.devconf_all->forwarding == 0)
505 		goto error;
506 
507 	if (skb->pkt_type != PACKET_HOST)
508 		goto drop;
509 
510 	if (unlikely(skb->sk))
511 		goto drop;
512 
513 	if (skb_warn_if_lro(skb))
514 		goto drop;
515 
516 	if (!net->ipv6.devconf_all->disable_policy &&
517 	    (!idev || !idev->cnf.disable_policy) &&
518 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
519 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
520 		goto drop;
521 	}
522 
523 	skb_forward_csum(skb);
524 
525 	/*
526 	 *	We do NOT do any processing on RA packets;
527 	 *	they are pushed to user level AS IS,
528 	 *	without any WARRANTY that the application will be
529 	 *	able to interpret them. The reason is that we
530 	 *	cannot do anything clever here.
531 	 *
532 	 *	We are not the end node, so if the packet contains
533 	 *	AH/ESP we cannot do anything with it.
534 	 *	Defragmentation would also be a mistake: RA packets
535 	 *	cannot be fragmented, because there is no guarantee
536 	 *	that different fragments will go along one path. --ANK
537 	 */
538 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
539 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
540 			return 0;
541 	}
542 
543 	/*
544 	 *	check and decrement ttl
545 	 */
546 	if (hdr->hop_limit <= 1) {
547 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
548 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
549 
550 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
551 		return -ETIMEDOUT;
552 	}
553 
554 	/* XXX: idev->cnf.proxy_ndp? */
555 	if (net->ipv6.devconf_all->proxy_ndp &&
556 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
557 		int proxied = ip6_forward_proxy_check(skb);
558 		if (proxied > 0) {
559 			/* It's tempting to decrease the hop limit
560 			 * here by 1, as we do at the end of the
561 			 * function too.
562 			 *
563 			 * But that would be incorrect, as proxying is
564 			 * not forwarding.  The ip6_input function
565 			 * will handle this packet locally, and it
566 			 * depends on the hop limit being unchanged.
567 			 *
568 			 * One example is the NDP hop limit, which
569 			 * always has to stay 255; another would be
570 			 * similar checks around RA packets, where the
571 			 * user can even change the desired limit.
572 			 */
573 			return ip6_input(skb);
574 		} else if (proxied < 0) {
575 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
576 			goto drop;
577 		}
578 	}
579 
580 	if (!xfrm6_route_forward(skb)) {
581 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
582 		SKB_DR_SET(reason, XFRM_POLICY);
583 		goto drop;
584 	}
585 	dst = skb_dst(skb);
586 
587 	/* IPv6 specs say nothing about it, but it is clear that we cannot
588 	   send redirects to source routed frames.
589 	   We don't send redirects to frames decapsulated from IPsec.
590 	 */
591 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
592 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
593 		struct in6_addr *target = NULL;
594 		struct inet_peer *peer;
595 		struct rt6_info *rt;
596 
597 		/*
598 		 *	incoming and outgoing devices are the same;
599 		 *	send a redirect.
600 		 */
601 
602 		rt = (struct rt6_info *) dst;
603 		if (rt->rt6i_flags & RTF_GATEWAY)
604 			target = &rt->rt6i_gateway;
605 		else
606 			target = &hdr->daddr;
607 
608 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
609 
610 		/* Limit redirects both by destination (here)
611 		   and by source (inside ndisc_send_redirect)
612 		 */
613 		if (inet_peer_xrlim_allow(peer, 1*HZ))
614 			ndisc_send_redirect(skb, target);
615 		if (peer)
616 			inet_putpeer(peer);
617 	} else {
618 		int addrtype = ipv6_addr_type(&hdr->saddr);
619 
620 		/* This check is security critical. */
621 		if (addrtype == IPV6_ADDR_ANY ||
622 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
623 			goto error;
624 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
625 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
626 				    ICMPV6_NOT_NEIGHBOUR, 0);
627 			goto error;
628 		}
629 	}
630 
631 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
632 
633 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
634 	if (mtu < IPV6_MIN_MTU)
635 		mtu = IPV6_MIN_MTU;
636 
637 	if (ip6_pkt_too_big(skb, mtu)) {
638 		/* Again, force the OUTPUT device to be used for source address selection */
639 		skb->dev = dst->dev;
640 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
641 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
642 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
643 				IPSTATS_MIB_FRAGFAILS);
644 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
645 		return -EMSGSIZE;
646 	}
647 
648 	if (skb_cow(skb, dst->dev->hard_header_len)) {
649 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
650 				IPSTATS_MIB_OUTDISCARDS);
651 		goto drop;
652 	}
653 
654 	hdr = ipv6_hdr(skb);
655 
656 	/* Mangling hops number delayed to point after skb COW */
657 
658 	hdr->hop_limit--;
659 
660 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
661 		       net, NULL, skb, skb->dev, dst->dev,
662 		       ip6_forward_finish);
663 
664 error:
665 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
666 	SKB_DR_SET(reason, IP_INADDRERRORS);
667 drop:
668 	kfree_skb_reason(skb, reason);
669 	return -EINVAL;
670 }
671 
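/* Copy routing and policy metadata from the original packet to a newly
 * built fragment so that every fragment is treated the same way by
 * qdiscs, netfilter and the rest of the stack.
 */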
672 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
673 {
674 	to->pkt_type = from->pkt_type;
675 	to->priority = from->priority;
676 	to->protocol = from->protocol;
677 	skb_dst_drop(to);
678 	skb_dst_set(to, dst_clone(skb_dst(from)));
679 	to->dev = from->dev;
680 	to->mark = from->mark;
681 
682 	skb_copy_hash(to, from);
683 
684 #ifdef CONFIG_NET_SCHED
685 	to->tc_index = from->tc_index;
686 #endif
687 	nf_copy(to, from);
688 	skb_ext_copy(to, from);
689 	skb_copy_secmark(to, from);
690 }
691 
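/* Fast-path fragmentation setup: keep a copy of the unfragmentable
 * header chain in iter->tmp_hdr, detach the existing frag_list, insert
 * a Fragment header into the head skb and trim it to its own data.
 * ip6_fraglist_prepare() then turns each queued fragment into a
 * standalone packet with the saved headers prepended.
 */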
692 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
693 		      u8 nexthdr, __be32 frag_id,
694 		      struct ip6_fraglist_iter *iter)
695 {
696 	unsigned int first_len;
697 	struct frag_hdr *fh;
698 
699 	/* BUILD HEADER */
700 	*prevhdr = NEXTHDR_FRAGMENT;
701 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
702 	if (!iter->tmp_hdr)
703 		return -ENOMEM;
704 
705 	iter->frag = skb_shinfo(skb)->frag_list;
706 	skb_frag_list_init(skb);
707 
708 	iter->offset = 0;
709 	iter->hlen = hlen;
710 	iter->frag_id = frag_id;
711 	iter->nexthdr = nexthdr;
712 
713 	__skb_pull(skb, hlen);
714 	fh = __skb_push(skb, sizeof(struct frag_hdr));
715 	__skb_push(skb, hlen);
716 	skb_reset_network_header(skb);
717 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
718 
719 	fh->nexthdr = nexthdr;
720 	fh->reserved = 0;
721 	fh->frag_off = htons(IP6_MF);
722 	fh->identification = frag_id;
723 
724 	first_len = skb_pagelen(skb);
725 	skb->data_len = first_len - skb_headlen(skb);
726 	skb->len = first_len;
727 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
728 
729 	return 0;
730 }
731 EXPORT_SYMBOL(ip6_fraglist_init);
732 
733 void ip6_fraglist_prepare(struct sk_buff *skb,
734 			  struct ip6_fraglist_iter *iter)
735 {
736 	struct sk_buff *frag = iter->frag;
737 	unsigned int hlen = iter->hlen;
738 	struct frag_hdr *fh;
739 
740 	frag->ip_summed = CHECKSUM_NONE;
741 	skb_reset_transport_header(frag);
742 	fh = __skb_push(frag, sizeof(struct frag_hdr));
743 	__skb_push(frag, hlen);
744 	skb_reset_network_header(frag);
745 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
746 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
747 	fh->nexthdr = iter->nexthdr;
748 	fh->reserved = 0;
749 	fh->frag_off = htons(iter->offset);
750 	if (frag->next)
751 		fh->frag_off |= htons(IP6_MF);
752 	fh->identification = iter->frag_id;
753 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
754 	ip6_copy_metadata(frag, skb);
755 }
756 EXPORT_SYMBOL(ip6_fraglist_prepare);
757 
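/* Slow-path fragmentation: ip6_frag_init() records how much payload is
 * left, where to read from and how much head/tail room each fragment
 * needs; ip6_frag_next() then allocates one fragment at a time, copying
 * the unfragmentable headers plus up to an 8-byte-aligned chunk of data
 * and filling in the Fragment header.
 */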
758 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
759 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
760 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
761 {
762 	state->prevhdr = prevhdr;
763 	state->nexthdr = nexthdr;
764 	state->frag_id = frag_id;
765 
766 	state->hlen = hlen;
767 	state->mtu = mtu;
768 
769 	state->left = skb->len - hlen;	/* Space per frame */
770 	state->ptr = hlen;		/* Where to start from */
771 
772 	state->hroom = hdr_room;
773 	state->troom = needed_tailroom;
774 
775 	state->offset = 0;
776 }
777 EXPORT_SYMBOL(ip6_frag_init);
778 
779 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
780 {
781 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
782 	struct sk_buff *frag;
783 	struct frag_hdr *fh;
784 	unsigned int len;
785 
786 	len = state->left;
787 	/* IF: it doesn't fit, use 'mtu' - the data space left */
788 	if (len > state->mtu)
789 		len = state->mtu;
790 	/* IF: we are not sending up to and including the packet end
791 	   then align the next start on an eight byte boundary */
792 	if (len < state->left)
793 		len &= ~7;
794 
795 	/* Allocate buffer */
796 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
797 			 state->hroom + state->troom, GFP_ATOMIC);
798 	if (!frag)
799 		return ERR_PTR(-ENOMEM);
800 
801 	/*
802 	 *	Set up data on packet
803 	 */
804 
805 	ip6_copy_metadata(frag, skb);
806 	skb_reserve(frag, state->hroom);
807 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
808 	skb_reset_network_header(frag);
809 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
810 	frag->transport_header = (frag->network_header + state->hlen +
811 				  sizeof(struct frag_hdr));
812 
813 	/*
814 	 *	Charge the memory for the fragment to any owner
815 	 *	it might possess
816 	 */
817 	if (skb->sk)
818 		skb_set_owner_w(frag, skb->sk);
819 
820 	/*
821 	 *	Copy the packet header into the new buffer.
822 	 */
823 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
824 
825 	fragnexthdr_offset = skb_network_header(frag);
826 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
827 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
828 
829 	/*
830 	 *	Build fragment header.
831 	 */
832 	fh->nexthdr = state->nexthdr;
833 	fh->reserved = 0;
834 	fh->identification = state->frag_id;
835 
836 	/*
837 	 *	Copy a block of the IP datagram.
838 	 */
839 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
840 			     len));
841 	state->left -= len;
842 
843 	fh->frag_off = htons(state->offset);
844 	if (state->left > 0)
845 		fh->frag_off |= htons(IP6_MF);
846 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
847 
848 	state->ptr += len;
849 	state->offset += len;
850 
851 	return frag;
852 }
853 EXPORT_SYMBOL(ip6_frag_next);
854 
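/* Fragment a locally generated (or conntrack-defragmented) packet.  The
 * fast path reuses an existing frag_list when its geometry already
 * matches (each fragment fits the MTU, sizes are 8-byte aligned, enough
 * headroom); otherwise the slow path copies the payload into freshly
 * allocated skbs.  @output is called for every resulting fragment.
 */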
855 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
856 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
857 {
858 	struct sk_buff *frag;
859 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
860 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
861 				inet6_sk(skb->sk) : NULL;
862 	bool mono_delivery_time = skb->mono_delivery_time;
863 	struct ip6_frag_state state;
864 	unsigned int mtu, hlen, nexthdr_offset;
865 	ktime_t tstamp = skb->tstamp;
866 	int hroom, err = 0;
867 	__be32 frag_id;
868 	u8 *prevhdr, nexthdr = 0;
869 
870 	err = ip6_find_1stfragopt(skb, &prevhdr);
871 	if (err < 0)
872 		goto fail;
873 	hlen = err;
874 	nexthdr = *prevhdr;
875 	nexthdr_offset = prevhdr - skb_network_header(skb);
876 
877 	mtu = ip6_skb_dst_mtu(skb);
878 
879 	/* We must not fragment if the socket is set to force MTU discovery
880 	 * or if the skb is not generated by a local socket.
881 	 */
882 	if (unlikely(!skb->ignore_df && skb->len > mtu))
883 		goto fail_toobig;
884 
885 	if (IP6CB(skb)->frag_max_size) {
886 		if (IP6CB(skb)->frag_max_size > mtu)
887 			goto fail_toobig;
888 
889 		/* don't send fragments larger than what we received */
890 		mtu = IP6CB(skb)->frag_max_size;
891 		if (mtu < IPV6_MIN_MTU)
892 			mtu = IPV6_MIN_MTU;
893 	}
894 
895 	if (np) {
896 		u32 frag_size = READ_ONCE(np->frag_size);
897 
898 		if (frag_size && frag_size < mtu)
899 			mtu = frag_size;
900 	}
901 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
902 		goto fail_toobig;
903 	mtu -= hlen + sizeof(struct frag_hdr);
904 
905 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
906 				    &ipv6_hdr(skb)->saddr);
907 
908 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
909 	    (err = skb_checksum_help(skb)))
910 		goto fail;
911 
912 	prevhdr = skb_network_header(skb) + nexthdr_offset;
913 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
914 	if (skb_has_frag_list(skb)) {
915 		unsigned int first_len = skb_pagelen(skb);
916 		struct ip6_fraglist_iter iter;
917 		struct sk_buff *frag2;
918 
919 		if (first_len - hlen > mtu ||
920 		    ((first_len - hlen) & 7) ||
921 		    skb_cloned(skb) ||
922 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
923 			goto slow_path;
924 
925 		skb_walk_frags(skb, frag) {
926 			/* Correct geometry. */
927 			if (frag->len > mtu ||
928 			    ((frag->len & 7) && frag->next) ||
929 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
930 				goto slow_path_clean;
931 
932 			/* Partially cloned skb? */
933 			if (skb_shared(frag))
934 				goto slow_path_clean;
935 
936 			BUG_ON(frag->sk);
937 			if (skb->sk) {
938 				frag->sk = skb->sk;
939 				frag->destructor = sock_wfree;
940 			}
941 			skb->truesize -= frag->truesize;
942 		}
943 
944 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
945 					&iter);
946 		if (err < 0)
947 			goto fail;
948 
949 		/* We prevent @rt from being freed. */
950 		rcu_read_lock();
951 
952 		for (;;) {
953 			/* Prepare the header of the next frame
954 			 * before the previous one goes out. */
955 			if (iter.frag)
956 				ip6_fraglist_prepare(skb, &iter);
957 
958 			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
959 			err = output(net, sk, skb);
960 			if (!err)
961 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
962 					      IPSTATS_MIB_FRAGCREATES);
963 
964 			if (err || !iter.frag)
965 				break;
966 
967 			skb = ip6_fraglist_next(&iter);
968 		}
969 
970 		kfree(iter.tmp_hdr);
971 
972 		if (err == 0) {
973 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
974 				      IPSTATS_MIB_FRAGOKS);
975 			rcu_read_unlock();
976 			return 0;
977 		}
978 
979 		kfree_skb_list(iter.frag);
980 
981 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
982 			      IPSTATS_MIB_FRAGFAILS);
983 		rcu_read_unlock();
984 		return err;
985 
986 slow_path_clean:
987 		skb_walk_frags(skb, frag2) {
988 			if (frag2 == frag)
989 				break;
990 			frag2->sk = NULL;
991 			frag2->destructor = NULL;
992 			skb->truesize += frag2->truesize;
993 		}
994 	}
995 
996 slow_path:
997 	/*
998 	 *	Fragment the datagram.
999 	 */
1000 
1001 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
1002 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
1003 		      &state);
1004 
1005 	/*
1006 	 *	Keep copying data until we run out.
1007 	 */
1008 
1009 	while (state.left > 0) {
1010 		frag = ip6_frag_next(skb, &state);
1011 		if (IS_ERR(frag)) {
1012 			err = PTR_ERR(frag);
1013 			goto fail;
1014 		}
1015 
1016 		/*
1017 		 *	Put this fragment into the sending queue.
1018 		 */
1019 		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1020 		err = output(net, sk, frag);
1021 		if (err)
1022 			goto fail;
1023 
1024 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1025 			      IPSTATS_MIB_FRAGCREATES);
1026 	}
1027 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1028 		      IPSTATS_MIB_FRAGOKS);
1029 	consume_skb(skb);
1030 	return err;
1031 
1032 fail_toobig:
1033 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1034 	err = -EMSGSIZE;
1035 
1036 fail:
1037 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1038 		      IPSTATS_MIB_FRAGFAILS);
1039 	kfree_skb(skb);
1040 	return err;
1041 }
1042 
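/* Non-zero means the cached route cannot be trusted for @fl_addr: it is
 * neither an exact /128 match on the route key nor equal to the address
 * cached on the socket.
 */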
1043 static inline int ip6_rt_check(const struct rt6key *rt_key,
1044 			       const struct in6_addr *fl_addr,
1045 			       const struct in6_addr *addr_cache)
1046 {
1047 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1048 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1049 }
1050 
1051 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1052 					  struct dst_entry *dst,
1053 					  const struct flowi6 *fl6)
1054 {
1055 	struct ipv6_pinfo *np = inet6_sk(sk);
1056 	struct rt6_info *rt;
1057 
1058 	if (!dst)
1059 		goto out;
1060 
1061 	if (dst->ops->family != AF_INET6) {
1062 		dst_release(dst);
1063 		return NULL;
1064 	}
1065 
1066 	rt = (struct rt6_info *)dst;
1067 	/* Yes, checking route validity in the not-connected
1068 	 * case is not very simple. Take into account
1069 	 * that we do not support routing by source, TOS,
1070 	 * and MSG_DONTROUTE		--ANK (980726)
1071 	 *
1072 	 * 1. ip6_rt_check(): If route was host route,
1073 	 *    check that cached destination is current.
1074 	 *    If it is network route, we still may
1075 	 *    check its validity using saved pointer
1076 	 *    to the last used address: daddr_cache.
1077 	 *    We do not want to save the whole address now
1078 	 *    (because the main consumer of this service
1079 	 *    is TCP, which does not have this problem),
1080 	 *    so that the last trick works only on connected
1081 	 *    sockets.
1082 	 * 2. oif also should be the same.
1083 	 */
1084 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1085 #ifdef CONFIG_IPV6_SUBTREES
1086 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1087 #endif
1088 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1089 		dst_release(dst);
1090 		dst = NULL;
1091 	}
1092 
1093 out:
1094 	return dst;
1095 }
1096 
1097 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1098 			       struct dst_entry **dst, struct flowi6 *fl6)
1099 {
1100 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1101 	struct neighbour *n;
1102 	struct rt6_info *rt;
1103 #endif
1104 	int err;
1105 	int flags = 0;
1106 
1107 	/* The correct way to handle this would be to do
1108 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1109 	 * the route-specific preferred source forces the
1110 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1111 	 *
1112 	 * In source specific routing (no src=any default route),
1113 	 * ip6_route_output will fail given src=any saddr, though, so
1114 	 * that's why we try it again later.
1115 	 */
1116 	if (ipv6_addr_any(&fl6->saddr)) {
1117 		struct fib6_info *from;
1118 		struct rt6_info *rt;
1119 
1120 		*dst = ip6_route_output(net, sk, fl6);
1121 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1122 
1123 		rcu_read_lock();
1124 		from = rt ? rcu_dereference(rt->from) : NULL;
1125 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1126 					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
1127 					  &fl6->saddr);
1128 		rcu_read_unlock();
1129 
1130 		if (err)
1131 			goto out_err_release;
1132 
1133 		/* If we had an erroneous initial result, pretend it
1134 		 * never existed and let the SA-enabled version take
1135 		 * over.
1136 		 */
1137 		if ((*dst)->error) {
1138 			dst_release(*dst);
1139 			*dst = NULL;
1140 		}
1141 
1142 		if (fl6->flowi6_oif)
1143 			flags |= RT6_LOOKUP_F_IFACE;
1144 	}
1145 
1146 	if (!*dst)
1147 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1148 
1149 	err = (*dst)->error;
1150 	if (err)
1151 		goto out_err_release;
1152 
1153 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1154 	/*
1155 	 * Here if the dst entry we've looked up
1156 	 * has a neighbour entry that is in the INCOMPLETE
1157 	 * state and the src address from the flow is
1158 	 * marked as OPTIMISTIC, we release the found
1159 	 * dst entry and replace it instead with the
1160 	 * dst entry of the nexthop router
1161 	 */
1162 	rt = (struct rt6_info *) *dst;
1163 	rcu_read_lock();
1164 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1165 				      rt6_nexthop(rt, &fl6->daddr));
1166 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1167 	rcu_read_unlock();
1168 
1169 	if (err) {
1170 		struct inet6_ifaddr *ifp;
1171 		struct flowi6 fl_gw6;
1172 		int redirect;
1173 
1174 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1175 				      (*dst)->dev, 1);
1176 
1177 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1178 		if (ifp)
1179 			in6_ifa_put(ifp);
1180 
1181 		if (redirect) {
1182 			/*
1183 			 * We need to get the dst entry for the
1184 			 * default router instead
1185 			 */
1186 			dst_release(*dst);
1187 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1188 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1189 			*dst = ip6_route_output(net, sk, &fl_gw6);
1190 			err = (*dst)->error;
1191 			if (err)
1192 				goto out_err_release;
1193 		}
1194 	}
1195 #endif
1196 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1197 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1198 		err = -EAFNOSUPPORT;
1199 		goto out_err_release;
1200 	}
1201 
1202 	return 0;
1203 
1204 out_err_release:
1205 	dst_release(*dst);
1206 	*dst = NULL;
1207 
1208 	if (err == -ENETUNREACH)
1209 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1210 	return err;
1211 }
1212 
1213 /**
1214  *	ip6_dst_lookup - perform route lookup on flow
1215  *	@net: Network namespace to perform lookup in
1216  *	@sk: socket which provides route info
1217  *	@dst: pointer to dst_entry * for result
1218  *	@fl6: flow to lookup
1219  *
1220  *	This function performs a route lookup on the given flow.
1221  *
1222  *	It returns zero on success, or a standard errno code on error.
1223  */
1224 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1225 		   struct flowi6 *fl6)
1226 {
1227 	*dst = NULL;
1228 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1229 }
1230 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1231 
1232 /**
1233  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1234  *	@net: Network namespace to perform lookup in
1235  *	@sk: socket which provides route info
1236  *	@fl6: flow to lookup
1237  *	@final_dst: final destination address for ipsec lookup
1238  *
1239  *	This function performs a route lookup on the given flow.
1240  *
1241  *	It returns a valid dst pointer on success, or a pointer encoded
1242  *	error code.
1243  */
1244 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1245 				      const struct in6_addr *final_dst)
1246 {
1247 	struct dst_entry *dst = NULL;
1248 	int err;
1249 
1250 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1251 	if (err)
1252 		return ERR_PTR(err);
1253 	if (final_dst)
1254 		fl6->daddr = *final_dst;
1255 
1256 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1257 }
1258 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1259 
1260 /**
1261  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1262  *	@sk: socket which provides the dst cache and route info
1263  *	@fl6: flow to lookup
1264  *	@final_dst: final destination address for ipsec lookup
1265  *	@connected: whether @sk is connected or not
1266  *
1267  *	This function performs a route lookup on the given flow with the
1268  *	possibility of using the cached route in the socket if it is valid.
1269  *	It will take the socket dst lock when operating on the dst cache.
1270  *	As a result, this function can only be used in process context.
1271  *
1272  *	In addition, for a connected socket, cache the dst in the socket
1273  *	if the current cache is not valid.
1274  *
1275  *	It returns a valid dst pointer on success, or a pointer encoded
1276  *	error code.
1277  */
1278 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1279 					 const struct in6_addr *final_dst,
1280 					 bool connected)
1281 {
1282 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1283 
1284 	dst = ip6_sk_dst_check(sk, dst, fl6);
1285 	if (dst)
1286 		return dst;
1287 
1288 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1289 	if (connected && !IS_ERR(dst))
1290 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1291 
1292 	return dst;
1293 }
1294 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1295 
1296 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1297 					       gfp_t gfp)
1298 {
1299 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1300 }
1301 
1302 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1303 						gfp_t gfp)
1304 {
1305 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1306 }
1307 
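/* Recompute mtu/maxfraglen while appending data: only the first
 * fragment has to leave room for rt->dst.header_len (e.g. an IPsec
 * header); later fragments may use the full corked MTU.  Skipped
 * entirely for DST_XFRM_TUNNEL routes.
 */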
1308 static void ip6_append_data_mtu(unsigned int *mtu,
1309 				int *maxfraglen,
1310 				unsigned int fragheaderlen,
1311 				struct sk_buff *skb,
1312 				struct rt6_info *rt,
1313 				unsigned int orig_mtu)
1314 {
1315 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1316 		if (!skb) {
1317 			/* first fragment, reserve header_len */
1318 			*mtu = orig_mtu - rt->dst.header_len;
1319 
1320 		} else {
1321 			/*
1322 			 * this fragment is not the first; the header
1323 			 * space is regarded as data space.
1324 			 */
1325 			*mtu = orig_mtu;
1326 		}
1327 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1328 			      + fragheaderlen - sizeof(struct frag_hdr);
1329 	}
1330 }
1331 
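/* Prepare the cork for ip6_append_data(): duplicate the tx options so
 * they survive beyond the caller, record hop limit and traffic class,
 * and derive the cork MTU from the route (device MTU when probing PMTU,
 * otherwise the MTU of the (xfrm-path) dst, further capped by the
 * socket's frag_size).
 */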
1332 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1333 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1334 			  struct rt6_info *rt)
1335 {
1336 	struct ipv6_pinfo *np = inet6_sk(sk);
1337 	unsigned int mtu, frag_size;
1338 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1339 
1340 	/* callers pass dst together with a reference, set it first so
1341 	 * ip6_cork_release() can put it down even in case of an error.
1342 	 */
1343 	cork->base.dst = &rt->dst;
1344 
1345 	/*
1346 	 * setup for corking
1347 	 */
1348 	if (opt) {
1349 		if (WARN_ON(v6_cork->opt))
1350 			return -EINVAL;
1351 
1352 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1353 		if (unlikely(!nopt))
1354 			return -ENOBUFS;
1355 
1356 		nopt->tot_len = sizeof(*opt);
1357 		nopt->opt_flen = opt->opt_flen;
1358 		nopt->opt_nflen = opt->opt_nflen;
1359 
1360 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1361 		if (opt->dst0opt && !nopt->dst0opt)
1362 			return -ENOBUFS;
1363 
1364 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1365 		if (opt->dst1opt && !nopt->dst1opt)
1366 			return -ENOBUFS;
1367 
1368 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1369 		if (opt->hopopt && !nopt->hopopt)
1370 			return -ENOBUFS;
1371 
1372 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1373 		if (opt->srcrt && !nopt->srcrt)
1374 			return -ENOBUFS;
1375 
1376 		/* need source address above --miyazawa */
1377 	}
1378 	v6_cork->hop_limit = ipc6->hlimit;
1379 	v6_cork->tclass = ipc6->tclass;
1380 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1381 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1382 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1383 	else
1384 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1385 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1386 
1387 	frag_size = READ_ONCE(np->frag_size);
1388 	if (frag_size && frag_size < mtu)
1389 		mtu = frag_size;
1390 
1391 	cork->base.fragsize = mtu;
1392 	cork->base.gso_size = ipc6->gso_size;
1393 	cork->base.tx_flags = 0;
1394 	cork->base.mark = ipc6->sockc.mark;
1395 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1396 
1397 	cork->base.length = 0;
1398 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1399 
1400 	return 0;
1401 }
1402 
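/* Core of ip6_append_data() and ip6_make_skb(): append @length bytes to
 * @queue, either growing the tail skb or allocating new ones sized to
 * the corked MTU, and handle zerocopy (MSG_ZEROCOPY / msg_ubuf),
 * MSG_SPLICE_PAGES and page-frag coalescing along the way.  Packets stay
 * on @queue until __ip6_make_skb() adds the IPv6 header and dst.
 */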
1403 static int __ip6_append_data(struct sock *sk,
1404 			     struct sk_buff_head *queue,
1405 			     struct inet_cork_full *cork_full,
1406 			     struct inet6_cork *v6_cork,
1407 			     struct page_frag *pfrag,
1408 			     int getfrag(void *from, char *to, int offset,
1409 					 int len, int odd, struct sk_buff *skb),
1410 			     void *from, size_t length, int transhdrlen,
1411 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1412 {
1413 	struct sk_buff *skb, *skb_prev = NULL;
1414 	struct inet_cork *cork = &cork_full->base;
1415 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1416 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1417 	struct ubuf_info *uarg = NULL;
1418 	int exthdrlen = 0;
1419 	int dst_exthdrlen = 0;
1420 	int hh_len;
1421 	int copy;
1422 	int err;
1423 	int offset = 0;
1424 	bool zc = false;
1425 	u32 tskey = 0;
1426 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1427 	struct ipv6_txoptions *opt = v6_cork->opt;
1428 	int csummode = CHECKSUM_NONE;
1429 	unsigned int maxnonfragsize, headersize;
1430 	unsigned int wmem_alloc_delta = 0;
1431 	bool paged, extra_uref = false;
1432 
1433 	skb = skb_peek_tail(queue);
1434 	if (!skb) {
1435 		exthdrlen = opt ? opt->opt_flen : 0;
1436 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1437 	}
1438 
1439 	paged = !!cork->gso_size;
1440 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1441 	orig_mtu = mtu;
1442 
1443 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1444 	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
1445 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1446 
1447 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1448 
1449 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1450 			(opt ? opt->opt_nflen : 0);
1451 
1452 	headersize = sizeof(struct ipv6hdr) +
1453 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1454 		     rt->rt6i_nfheader_len;
1455 
1456 	if (mtu <= fragheaderlen ||
1457 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1458 		goto emsgsize;
1459 
1460 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1461 		     sizeof(struct frag_hdr);
1462 
1463 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1464 	 * in the first fragment
1465 	 */
1466 	if (headersize + transhdrlen > mtu)
1467 		goto emsgsize;
1468 
1469 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1470 	    (sk->sk_protocol == IPPROTO_UDP ||
1471 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1472 	     sk->sk_protocol == IPPROTO_RAW)) {
1473 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1474 				sizeof(struct ipv6hdr));
1475 		goto emsgsize;
1476 	}
1477 
1478 	if (ip6_sk_ignore_df(sk))
1479 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1480 	else
1481 		maxnonfragsize = mtu;
1482 
1483 	if (cork->length + length > maxnonfragsize - headersize) {
1484 emsgsize:
1485 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1486 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1487 		return -EMSGSIZE;
1488 	}
1489 
1490 	/* CHECKSUM_PARTIAL only with no extension headers and when
1491 	 * we are not going to fragment
1492 	 */
1493 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1494 	    headersize == sizeof(struct ipv6hdr) &&
1495 	    length <= mtu - headersize &&
1496 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1497 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1498 		csummode = CHECKSUM_PARTIAL;
1499 
1500 	if ((flags & MSG_ZEROCOPY) && length) {
1501 		struct msghdr *msg = from;
1502 
1503 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1504 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1505 				return -EINVAL;
1506 
1507 			/* Leave uarg NULL if can't zerocopy, callers should
1508 			 * be able to handle it.
1509 			 */
1510 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1511 			    csummode == CHECKSUM_PARTIAL) {
1512 				paged = true;
1513 				zc = true;
1514 				uarg = msg->msg_ubuf;
1515 			}
1516 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1517 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1518 			if (!uarg)
1519 				return -ENOBUFS;
1520 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1521 			if (rt->dst.dev->features & NETIF_F_SG &&
1522 			    csummode == CHECKSUM_PARTIAL) {
1523 				paged = true;
1524 				zc = true;
1525 			} else {
1526 				uarg_to_msgzc(uarg)->zerocopy = 0;
1527 				skb_zcopy_set(skb, uarg, &extra_uref);
1528 			}
1529 		}
1530 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1531 		if (inet_test_bit(HDRINCL, sk))
1532 			return -EPERM;
1533 		if (rt->dst.dev->features & NETIF_F_SG &&
1534 		    getfrag == ip_generic_getfrag)
1535 			/* We need an empty buffer to attach stuff to */
1536 			paged = true;
1537 		else
1538 			flags &= ~MSG_SPLICE_PAGES;
1539 	}
1540 
1541 	/*
1542 	 * Let's try using as much space as possible.
1543 	 * Use MTU if total length of the message fits into the MTU.
1544 	 * Otherwise, we need to reserve fragment header and
1545 	 * fragment alignment (= 8-15 octets, in total).
1546 	 *
1547 	 * Note that we may need to "move" the data from the tail
1548 	 * of the buffer to the new fragment when we split
1549 	 * the message.
1550 	 *
1551 	 * FIXME: It may be fragmented into multiple chunks
1552 	 *        at once if non-fragmentable extension headers
1553 	 *        are too large.
1554 	 * --yoshfuji
1555 	 */
1556 
1557 	cork->length += length;
1558 	if (!skb)
1559 		goto alloc_new_skb;
1560 
1561 	while (length > 0) {
1562 		/* Check if the remaining data fits into current packet. */
1563 		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
1564 		if (copy < length)
1565 			copy = maxfraglen - skb->len;
1566 
1567 		if (copy <= 0) {
1568 			char *data;
1569 			unsigned int datalen;
1570 			unsigned int fraglen;
1571 			unsigned int fraggap;
1572 			unsigned int alloclen, alloc_extra;
1573 			unsigned int pagedlen;
1574 alloc_new_skb:
1575 			/* There's no room in the current skb */
1576 			if (skb)
1577 				fraggap = skb->len - maxfraglen;
1578 			else
1579 				fraggap = 0;
1580 			/* update mtu and maxfraglen if necessary */
1581 			if (!skb || !skb_prev)
1582 				ip6_append_data_mtu(&mtu, &maxfraglen,
1583 						    fragheaderlen, skb, rt,
1584 						    orig_mtu);
1585 
1586 			skb_prev = skb;
1587 
1588 			/*
1589 			 * If remaining data exceeds the mtu,
1590 			 * we know we need more fragment(s).
1591 			 */
1592 			datalen = length + fraggap;
1593 
1594 			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
1595 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1596 			fraglen = datalen + fragheaderlen;
1597 			pagedlen = 0;
1598 
1599 			alloc_extra = hh_len;
1600 			alloc_extra += dst_exthdrlen;
1601 			alloc_extra += rt->dst.trailer_len;
1602 
1603 			/* We just reserve space for fragment header.
1604 			 * Note: this may be an overallocation if the message
1605 			 * (without MSG_MORE) fits into the MTU.
1606 			 */
1607 			alloc_extra += sizeof(struct frag_hdr);
1608 
1609 			if ((flags & MSG_MORE) &&
1610 			    !(rt->dst.dev->features&NETIF_F_SG))
1611 				alloclen = mtu;
1612 			else if (!paged &&
1613 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1614 				  !(rt->dst.dev->features & NETIF_F_SG)))
1615 				alloclen = fraglen;
1616 			else {
1617 				alloclen = fragheaderlen + transhdrlen;
1618 				pagedlen = datalen - transhdrlen;
1619 			}
1620 			alloclen += alloc_extra;
1621 
1622 			if (datalen != length + fraggap) {
1623 				/*
1624 				 * this is not the last fragment; the trailer
1625 				 * space is regarded as data space.
1626 				 */
1627 				datalen += rt->dst.trailer_len;
1628 			}
1629 
1630 			fraglen = datalen + fragheaderlen;
1631 
1632 			copy = datalen - transhdrlen - fraggap - pagedlen;
1633 			/* [!] NOTE: copy may be negative if pagedlen>0
1634 			 * because then the equation reduces to -fraggap.
1635 			 */
1636 			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1637 				err = -EINVAL;
1638 				goto error;
1639 			}
1640 			if (transhdrlen) {
1641 				skb = sock_alloc_send_skb(sk, alloclen,
1642 						(flags & MSG_DONTWAIT), &err);
1643 			} else {
1644 				skb = NULL;
1645 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1646 				    2 * sk->sk_sndbuf)
1647 					skb = alloc_skb(alloclen,
1648 							sk->sk_allocation);
1649 				if (unlikely(!skb))
1650 					err = -ENOBUFS;
1651 			}
1652 			if (!skb)
1653 				goto error;
1654 			/*
1655 			 *	Fill in the control structures
1656 			 */
1657 			skb->protocol = htons(ETH_P_IPV6);
1658 			skb->ip_summed = csummode;
1659 			skb->csum = 0;
1660 			/* reserve for fragmentation and ipsec header */
1661 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1662 				    dst_exthdrlen);
1663 
1664 			/*
1665 			 *	Find where to start putting bytes
1666 			 */
1667 			data = skb_put(skb, fraglen - pagedlen);
1668 			skb_set_network_header(skb, exthdrlen);
1669 			data += fragheaderlen;
1670 			skb->transport_header = (skb->network_header +
1671 						 fragheaderlen);
1672 			if (fraggap) {
1673 				skb->csum = skb_copy_and_csum_bits(
1674 					skb_prev, maxfraglen,
1675 					data + transhdrlen, fraggap);
1676 				skb_prev->csum = csum_sub(skb_prev->csum,
1677 							  skb->csum);
1678 				data += fraggap;
1679 				pskb_trim_unique(skb_prev, maxfraglen);
1680 			}
1681 			if (copy > 0 &&
1682 			    getfrag(from, data + transhdrlen, offset,
1683 				    copy, fraggap, skb) < 0) {
1684 				err = -EFAULT;
1685 				kfree_skb(skb);
1686 				goto error;
1687 			} else if (flags & MSG_SPLICE_PAGES) {
1688 				copy = 0;
1689 			}
1690 
1691 			offset += copy;
1692 			length -= copy + transhdrlen;
1693 			transhdrlen = 0;
1694 			exthdrlen = 0;
1695 			dst_exthdrlen = 0;
1696 
1697 			/* Only the initial fragment is time stamped */
1698 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1699 			cork->tx_flags = 0;
1700 			skb_shinfo(skb)->tskey = tskey;
1701 			tskey = 0;
1702 			skb_zcopy_set(skb, uarg, &extra_uref);
1703 
1704 			if ((flags & MSG_CONFIRM) && !skb_prev)
1705 				skb_set_dst_pending_confirm(skb, 1);
1706 
1707 			/*
1708 			 * Put the packet on the pending queue
1709 			 */
1710 			if (!skb->destructor) {
1711 				skb->destructor = sock_wfree;
1712 				skb->sk = sk;
1713 				wmem_alloc_delta += skb->truesize;
1714 			}
1715 			__skb_queue_tail(queue, skb);
1716 			continue;
1717 		}
1718 
1719 		if (copy > length)
1720 			copy = length;
1721 
1722 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1723 		    skb_tailroom(skb) >= copy) {
1724 			unsigned int off;
1725 
1726 			off = skb->len;
1727 			if (getfrag(from, skb_put(skb, copy),
1728 						offset, copy, off, skb) < 0) {
1729 				__skb_trim(skb, off);
1730 				err = -EFAULT;
1731 				goto error;
1732 			}
1733 		} else if (flags & MSG_SPLICE_PAGES) {
1734 			struct msghdr *msg = from;
1735 
1736 			err = -EIO;
1737 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1738 				goto error;
1739 
1740 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1741 						   sk->sk_allocation);
1742 			if (err < 0)
1743 				goto error;
1744 			copy = err;
1745 			wmem_alloc_delta += copy;
1746 		} else if (!zc) {
1747 			int i = skb_shinfo(skb)->nr_frags;
1748 
1749 			err = -ENOMEM;
1750 			if (!sk_page_frag_refill(sk, pfrag))
1751 				goto error;
1752 
1753 			skb_zcopy_downgrade_managed(skb);
1754 			if (!skb_can_coalesce(skb, i, pfrag->page,
1755 					      pfrag->offset)) {
1756 				err = -EMSGSIZE;
1757 				if (i == MAX_SKB_FRAGS)
1758 					goto error;
1759 
1760 				__skb_fill_page_desc(skb, i, pfrag->page,
1761 						     pfrag->offset, 0);
1762 				skb_shinfo(skb)->nr_frags = ++i;
1763 				get_page(pfrag->page);
1764 			}
1765 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1766 			if (getfrag(from,
1767 				    page_address(pfrag->page) + pfrag->offset,
1768 				    offset, copy, skb->len, skb) < 0)
1769 				goto error_efault;
1770 
1771 			pfrag->offset += copy;
1772 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1773 			skb->len += copy;
1774 			skb->data_len += copy;
1775 			skb->truesize += copy;
1776 			wmem_alloc_delta += copy;
1777 		} else {
1778 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1779 			if (err < 0)
1780 				goto error;
1781 		}
1782 		offset += copy;
1783 		length -= copy;
1784 	}
1785 
1786 	if (wmem_alloc_delta)
1787 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1788 	return 0;
1789 
1790 error_efault:
1791 	err = -EFAULT;
1792 error:
1793 	net_zcopy_put_abort(uarg, extra_uref);
1794 	cork->length -= length;
1795 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1796 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1797 	return err;
1798 }
1799 
1800 int ip6_append_data(struct sock *sk,
1801 		    int getfrag(void *from, char *to, int offset, int len,
1802 				int odd, struct sk_buff *skb),
1803 		    void *from, size_t length, int transhdrlen,
1804 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1805 		    struct rt6_info *rt, unsigned int flags)
1806 {
1807 	struct inet_sock *inet = inet_sk(sk);
1808 	struct ipv6_pinfo *np = inet6_sk(sk);
1809 	int exthdrlen;
1810 	int err;
1811 
1812 	if (flags&MSG_PROBE)
1813 		return 0;
1814 	if (skb_queue_empty(&sk->sk_write_queue)) {
1815 		/*
1816 		 * setup for corking
1817 		 */
1818 		dst_hold(&rt->dst);
1819 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1820 				     ipc6, rt);
1821 		if (err)
1822 			return err;
1823 
1824 		inet->cork.fl.u.ip6 = *fl6;
1825 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1826 		length += exthdrlen;
1827 		transhdrlen += exthdrlen;
1828 	} else {
1829 		transhdrlen = 0;
1830 	}
1831 
1832 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1833 				 &np->cork, sk_page_frag(sk), getfrag,
1834 				 from, length, transhdrlen, flags, ipc6);
1835 }
1836 EXPORT_SYMBOL_GPL(ip6_append_data);
1837 
1838 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1839 {
1840 	struct dst_entry *dst = cork->base.dst;
1841 
1842 	cork->base.dst = NULL;
1843 	skb_dst_set(skb, dst);
1844 }
1845 
1846 static void ip6_cork_release(struct inet_cork_full *cork,
1847 			     struct inet6_cork *v6_cork)
1848 {
1849 	if (v6_cork->opt) {
1850 		struct ipv6_txoptions *opt = v6_cork->opt;
1851 
1852 		kfree(opt->dst0opt);
1853 		kfree(opt->dst1opt);
1854 		kfree(opt->hopopt);
1855 		kfree(opt->srcrt);
1856 		kfree(opt);
1857 		v6_cork->opt = NULL;
1858 	}
1859 
1860 	if (cork->base.dst) {
1861 		dst_release(cork->base.dst);
1862 		cork->base.dst = NULL;
1863 	}
1864 }
1865 
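/* Collapse the pending queue into a single skb (trailing skbs become
 * the frag_list), push any extension headers and the IPv6 header using
 * the corked flow and options, move the cork's dst onto the skb and
 * update the outgoing MIB counters.
 */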
1866 struct sk_buff *__ip6_make_skb(struct sock *sk,
1867 			       struct sk_buff_head *queue,
1868 			       struct inet_cork_full *cork,
1869 			       struct inet6_cork *v6_cork)
1870 {
1871 	struct sk_buff *skb, *tmp_skb;
1872 	struct sk_buff **tail_skb;
1873 	struct in6_addr *final_dst;
1874 	struct net *net = sock_net(sk);
1875 	struct ipv6hdr *hdr;
1876 	struct ipv6_txoptions *opt = v6_cork->opt;
1877 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1878 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1879 	unsigned char proto = fl6->flowi6_proto;
1880 
1881 	skb = __skb_dequeue(queue);
1882 	if (!skb)
1883 		goto out;
1884 	tail_skb = &(skb_shinfo(skb)->frag_list);
1885 
1886 	/* move skb->data to ip header from ext header */
1887 	if (skb->data < skb_network_header(skb))
1888 		__skb_pull(skb, skb_network_offset(skb));
1889 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1890 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1891 		*tail_skb = tmp_skb;
1892 		tail_skb = &(tmp_skb->next);
1893 		skb->len += tmp_skb->len;
1894 		skb->data_len += tmp_skb->len;
1895 		skb->truesize += tmp_skb->truesize;
1896 		tmp_skb->destructor = NULL;
1897 		tmp_skb->sk = NULL;
1898 	}
1899 
1900 	/* Allow local fragmentation. */
1901 	skb->ignore_df = ip6_sk_ignore_df(sk);
1902 	__skb_pull(skb, skb_network_header_len(skb));
1903 
1904 	final_dst = &fl6->daddr;
1905 	if (opt && opt->opt_flen)
1906 		ipv6_push_frag_opts(skb, opt, &proto);
1907 	if (opt && opt->opt_nflen)
1908 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1909 
1910 	skb_push(skb, sizeof(struct ipv6hdr));
1911 	skb_reset_network_header(skb);
1912 	hdr = ipv6_hdr(skb);
1913 
1914 	ip6_flow_hdr(hdr, v6_cork->tclass,
1915 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1916 					ip6_autoflowlabel(net, sk), fl6));
1917 	hdr->hop_limit = v6_cork->hop_limit;
1918 	hdr->nexthdr = proto;
1919 	hdr->saddr = fl6->saddr;
1920 	hdr->daddr = *final_dst;
1921 
1922 	skb->priority = READ_ONCE(sk->sk_priority);
1923 	skb->mark = cork->base.mark;
1924 	skb->tstamp = cork->base.transmit_time;
1925 
1926 	ip6_cork_steal_dst(skb, cork);
1927 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1928 	if (proto == IPPROTO_ICMPV6) {
1929 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1930 		u8 icmp6_type;
1931 
1932 		if (sk->sk_socket->type == SOCK_RAW &&
1933 		   !inet_test_bit(HDRINCL, sk))
1934 			icmp6_type = fl6->fl6_icmp_type;
1935 		else
1936 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
1937 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1938 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1939 	}
1940 
1941 	ip6_cork_release(cork, v6_cork);
1942 out:
1943 	return skb;
1944 }
1945 
1946 int ip6_send_skb(struct sk_buff *skb)
1947 {
1948 	struct net *net = sock_net(skb->sk);
1949 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1950 	int err;
1951 
1952 	err = ip6_local_out(net, skb->sk, skb);
1953 	if (err) {
1954 		if (err > 0)
1955 			err = net_xmit_errno(err);
1956 		if (err)
1957 			IP6_INC_STATS(net, rt->rt6i_idev,
1958 				      IPSTATS_MIB_OUTDISCARDS);
1959 	}
1960 
1961 	return err;
1962 }
1963 
1964 int ip6_push_pending_frames(struct sock *sk)
1965 {
1966 	struct sk_buff *skb;
1967 
1968 	skb = ip6_finish_skb(sk);
1969 	if (!skb)
1970 		return 0;
1971 
1972 	return ip6_send_skb(skb);
1973 }
1974 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1975 
1976 static void __ip6_flush_pending_frames(struct sock *sk,
1977 				       struct sk_buff_head *queue,
1978 				       struct inet_cork_full *cork,
1979 				       struct inet6_cork *v6_cork)
1980 {
1981 	struct sk_buff *skb;
1982 
1983 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1984 		if (skb_dst(skb))
1985 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1986 				      IPSTATS_MIB_OUTDISCARDS);
1987 		kfree_skb(skb);
1988 	}
1989 
1990 	ip6_cork_release(cork, v6_cork);
1991 }
1992 
1993 void ip6_flush_pending_frames(struct sock *sk)
1994 {
1995 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1996 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1997 }
1998 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1999 
2000 struct sk_buff *ip6_make_skb(struct sock *sk,
2001 			     int getfrag(void *from, char *to, int offset,
2002 					 int len, int odd, struct sk_buff *skb),
2003 			     void *from, size_t length, int transhdrlen,
2004 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2005 			     unsigned int flags, struct inet_cork_full *cork)
2006 {
2007 	struct inet6_cork v6_cork;
2008 	struct sk_buff_head queue;
2009 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2010 	int err;
2011 
2012 	if (flags & MSG_PROBE) {
2013 		dst_release(&rt->dst);
2014 		return NULL;
2015 	}
2016 
2017 	__skb_queue_head_init(&queue);
2018 
2019 	cork->base.flags = 0;
2020 	cork->base.addr = 0;
2021 	cork->base.opt = NULL;
2022 	v6_cork.opt = NULL;
2023 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2024 	if (err) {
2025 		ip6_cork_release(cork, &v6_cork);
2026 		return ERR_PTR(err);
2027 	}
2028 	if (ipc6->dontfrag < 0)
2029 		ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);
2030 
2031 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2032 				&current->task_frag, getfrag, from,
2033 				length + exthdrlen, transhdrlen + exthdrlen,
2034 				flags, ipc6);
2035 	if (err) {
2036 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2037 		return ERR_PTR(err);
2038 	}
2039 
2040 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2041 }
2042