xref: /linux/net/ipv6/ip6_output.c (revision 84de8154c516b821bd60493b90d4782c5a4905ab)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58 
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	const struct in6_addr *nexthop;
64 	struct neighbour *neigh;
65 	int ret;
66 
67 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
68 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
69 
70 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
71 		    ((mroute6_is_socket(net, skb) &&
72 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
73 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
74 					 &ipv6_hdr(skb)->saddr))) {
75 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
76 
77 			/* Do not check for IFF_ALLMULTI; multicast routing
78 			   is not supported in any case.
79 			 */
80 			if (newskb)
81 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
82 					net, sk, newskb, NULL, newskb->dev,
83 					dev_loopback_xmit);
84 
85 			if (ipv6_hdr(skb)->hop_limit == 0) {
86 				IP6_INC_STATS(net, idev,
87 					      IPSTATS_MIB_OUTDISCARDS);
88 				kfree_skb(skb);
89 				return 0;
90 			}
91 		}
92 
93 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
94 
95 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
96 		    IPV6_ADDR_SCOPE_NODELOCAL &&
97 		    !(dev->flags & IFF_LOOPBACK)) {
98 			kfree_skb(skb);
99 			return 0;
100 		}
101 	}
102 
103 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
104 		int res = lwtunnel_xmit(skb);
105 
106 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
107 			return res;
108 	}
109 
110 	rcu_read_lock_bh();
111 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
112 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
113 	if (unlikely(!neigh))
114 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
115 	if (!IS_ERR(neigh)) {
116 		sock_confirm_neigh(skb, neigh);
117 		ret = neigh_output(neigh, skb, false);
118 		rcu_read_unlock_bh();
119 		return ret;
120 	}
121 	rcu_read_unlock_bh();
122 
123 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
124 	kfree_skb(skb);
125 	return -EINVAL;
126 }
127 
128 static int
129 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
130 				    struct sk_buff *skb, unsigned int mtu)
131 {
132 	struct sk_buff *segs, *nskb;
133 	netdev_features_t features;
134 	int ret = 0;
135 
136 	/* Please see corresponding comment in ip_finish_output_gso
137 	 * describing the cases where GSO segment length exceeds the
138 	 * egress MTU.
139 	 */
140 	features = netif_skb_features(skb);
141 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
142 	if (IS_ERR_OR_NULL(segs)) {
143 		kfree_skb(skb);
144 		return -ENOMEM;
145 	}
146 
147 	consume_skb(skb);
148 
149 	skb_list_walk_safe(segs, segs, nskb) {
150 		int err;
151 
152 		skb_mark_not_on_list(segs);
153 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
154 		if (err && ret == 0)
155 			ret = err;
156 	}
157 
158 	return ret;
159 }
160 
161 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
162 {
163 	unsigned int mtu;
164 
165 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
166 	/* Policy lookup after SNAT yielded a new policy */
167 	if (skb_dst(skb)->xfrm) {
168 		IPCB(skb)->flags |= IPSKB_REROUTED;
169 		return dst_output(net, sk, skb);
170 	}
171 #endif
172 
173 	mtu = ip6_skb_dst_mtu(skb);
174 	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
175 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
176 
177 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
178 	    dst_allfrag(skb_dst(skb)) ||
179 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
180 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
181 	else
182 		return ip6_finish_output2(net, sk, skb);
183 }
184 
185 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
186 {
187 	int ret;
188 
189 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
190 	switch (ret) {
191 	case NET_XMIT_SUCCESS:
192 		return __ip6_finish_output(net, sk, skb);
193 	case NET_XMIT_CN:
194 		return __ip6_finish_output(net, sk, skb) ? : ret;
195 	default:
196 		kfree_skb(skb);
197 		return ret;
198 	}
199 }
200 
201 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
202 {
203 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
204 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
205 
206 	skb->protocol = htons(ETH_P_IPV6);
207 	skb->dev = dev;
208 
209 	if (unlikely(idev->cnf.disable_ipv6)) {
210 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
211 		kfree_skb(skb);
212 		return 0;
213 	}
214 
215 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
216 			    net, sk, skb, indev, dev,
217 			    ip6_finish_output,
218 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
219 }
220 EXPORT_SYMBOL(ip6_output);
221 
222 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
223 {
224 	if (!np->autoflowlabel_set)
225 		return ip6_default_np_autolabel(net);
226 	else
227 		return np->autoflowlabel;
228 }
229 
230 /*
231  * xmit an sk_buff (used by TCP, SCTP and DCCP)
232  * Note : socket lock is not held for SYNACK packets, but might be modified
233  * by calls to skb_set_owner_w() and ipv6_local_error(),
234  * which are using proper atomic operations or spinlocks.
235  */
236 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
237 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
238 {
239 	struct net *net = sock_net(sk);
240 	const struct ipv6_pinfo *np = inet6_sk(sk);
241 	struct in6_addr *first_hop = &fl6->daddr;
242 	struct dst_entry *dst = skb_dst(skb);
243 	unsigned int head_room;
244 	struct ipv6hdr *hdr;
245 	u8  proto = fl6->flowi6_proto;
246 	int seg_len = skb->len;
247 	int hlimit = -1;
248 	u32 mtu;
249 
250 	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
251 	if (opt)
252 		head_room += opt->opt_nflen + opt->opt_flen;
253 
254 	if (unlikely(skb_headroom(skb) < head_room)) {
255 		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
256 		if (!skb2) {
257 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
258 				      IPSTATS_MIB_OUTDISCARDS);
259 			kfree_skb(skb);
260 			return -ENOBUFS;
261 		}
262 		if (skb->sk)
263 			skb_set_owner_w(skb2, skb->sk);
264 		consume_skb(skb);
265 		skb = skb2;
266 	}
267 
268 	if (opt) {
269 		seg_len += opt->opt_nflen + opt->opt_flen;
270 
271 		if (opt->opt_flen)
272 			ipv6_push_frag_opts(skb, opt, &proto);
273 
274 		if (opt->opt_nflen)
275 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
276 					     &fl6->saddr);
277 	}
278 
279 	skb_push(skb, sizeof(struct ipv6hdr));
280 	skb_reset_network_header(skb);
281 	hdr = ipv6_hdr(skb);
282 
283 	/*
284 	 *	Fill in the IPv6 header
285 	 */
286 	if (np)
287 		hlimit = np->hop_limit;
288 	if (hlimit < 0)
289 		hlimit = ip6_dst_hoplimit(dst);
290 
291 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
292 				ip6_autoflowlabel(net, np), fl6));
293 
294 	hdr->payload_len = htons(seg_len);
295 	hdr->nexthdr = proto;
296 	hdr->hop_limit = hlimit;
297 
298 	hdr->saddr = fl6->saddr;
299 	hdr->daddr = *first_hop;
300 
301 	skb->protocol = htons(ETH_P_IPV6);
302 	skb->priority = priority;
303 	skb->mark = mark;
304 
305 	mtu = dst_mtu(dst);
306 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
307 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
308 			      IPSTATS_MIB_OUT, skb->len);
309 
310 		/* if egress device is enslaved to an L3 master device pass the
311 		 * skb to its handler for processing
312 		 */
313 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
314 		if (unlikely(!skb))
315 			return 0;
316 
317 		/* hooks should never assume socket lock is held.
318 		 * we promote our socket to non const
319 		 */
320 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
321 			       net, (struct sock *)sk, skb, NULL, dst->dev,
322 			       dst_output);
323 	}
324 
325 	skb->dev = dst->dev;
326 	/* ipv6_local_error() does not require socket lock,
327 	 * we promote our socket to non const
328 	 */
329 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
330 
331 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
332 	kfree_skb(skb);
333 	return -EMSGSIZE;
334 }
335 EXPORT_SYMBOL(ip6_xmit);
336 
337 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
338 {
339 	struct ip6_ra_chain *ra;
340 	struct sock *last = NULL;
341 
342 	read_lock(&ip6_ra_lock);
343 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
344 		struct sock *sk = ra->sk;
345 		if (sk && ra->sel == sel &&
346 		    (!sk->sk_bound_dev_if ||
347 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
348 			struct ipv6_pinfo *np = inet6_sk(sk);
349 
350 			if (np && np->rtalert_isolate &&
351 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
352 				continue;
353 			}
354 			if (last) {
355 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
356 				if (skb2)
357 					rawv6_rcv(last, skb2);
358 			}
359 			last = sk;
360 		}
361 	}
362 
363 	if (last) {
364 		rawv6_rcv(last, skb);
365 		read_unlock(&ip6_ra_lock);
366 		return 1;
367 	}
368 	read_unlock(&ip6_ra_lock);
369 	return 0;
370 }
371 
372 static int ip6_forward_proxy_check(struct sk_buff *skb)
373 {
374 	struct ipv6hdr *hdr = ipv6_hdr(skb);
375 	u8 nexthdr = hdr->nexthdr;
376 	__be16 frag_off;
377 	int offset;
378 
379 	if (ipv6_ext_hdr(nexthdr)) {
380 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
381 		if (offset < 0)
382 			return 0;
383 	} else
384 		offset = sizeof(struct ipv6hdr);
385 
386 	if (nexthdr == IPPROTO_ICMPV6) {
387 		struct icmp6hdr *icmp6;
388 
389 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
390 					 offset + 1 - skb->data)))
391 			return 0;
392 
393 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
394 
395 		switch (icmp6->icmp6_type) {
396 		case NDISC_ROUTER_SOLICITATION:
397 		case NDISC_ROUTER_ADVERTISEMENT:
398 		case NDISC_NEIGHBOUR_SOLICITATION:
399 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
400 		case NDISC_REDIRECT:
401 			/* For reaction involving unicast neighbor discovery
402 			 * message destined to the proxied address, pass it to
403 			 * input function.
404 			 */
405 			return 1;
406 		default:
407 			break;
408 		}
409 	}
410 
411 	/*
412 	 * The proxying router can't forward traffic sent to a link-local
413 	 * address, so signal the sender and discard the packet. This
414 	 * behavior is clarified by the MIPv6 specification.
415 	 */
416 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
417 		dst_link_failure(skb);
418 		return -1;
419 	}
420 
421 	return 0;
422 }
423 
424 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
425 				     struct sk_buff *skb)
426 {
427 	struct dst_entry *dst = skb_dst(skb);
428 
429 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
430 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
431 
432 #ifdef CONFIG_NET_SWITCHDEV
433 	if (skb->offload_l3_fwd_mark) {
434 		consume_skb(skb);
435 		return 0;
436 	}
437 #endif
438 
439 	skb->tstamp = 0;
440 	return dst_output(net, sk, skb);
441 }
442 
443 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
444 {
445 	if (skb->len <= mtu)
446 		return false;
447 
448 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
449 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
450 		return true;
451 
452 	if (skb->ignore_df)
453 		return false;
454 
455 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
456 		return false;
457 
458 	return true;
459 }
460 
461 int ip6_forward(struct sk_buff *skb)
462 {
463 	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
464 	struct dst_entry *dst = skb_dst(skb);
465 	struct ipv6hdr *hdr = ipv6_hdr(skb);
466 	struct inet6_skb_parm *opt = IP6CB(skb);
467 	struct net *net = dev_net(dst->dev);
468 	u32 mtu;
469 
470 	if (net->ipv6.devconf_all->forwarding == 0)
471 		goto error;
472 
473 	if (skb->pkt_type != PACKET_HOST)
474 		goto drop;
475 
476 	if (unlikely(skb->sk))
477 		goto drop;
478 
479 	if (skb_warn_if_lro(skb))
480 		goto drop;
481 
482 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
483 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
484 		goto drop;
485 	}
486 
487 	skb_forward_csum(skb);
488 
489 	/*
490 	 *	We DO NOT make any processing on
491 	 *	RA packets, pushing them to user level AS IS
492 	 *	without ane WARRANTY that application will be able
493 	 *	to interpret them. The reason is that we
494 	 *	cannot make anything clever here.
495 	 *
496 	 *	We are not end-node, so that if packet contains
497 	 *	AH/ESP, we cannot make anything.
498 	 *	Defragmentation also would be mistake, RA packets
499 	 *	cannot be fragmented, because there is no warranty
500 	 *	that different fragments will go along one path. --ANK
501 	 */
502 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
503 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
504 			return 0;
505 	}
506 
507 	/*
508 	 *	check and decrement ttl
509 	 */
510 	if (hdr->hop_limit <= 1) {
511 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
512 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
513 
514 		kfree_skb(skb);
515 		return -ETIMEDOUT;
516 	}
517 
518 	/* XXX: idev->cnf.proxy_ndp? */
519 	if (net->ipv6.devconf_all->proxy_ndp &&
520 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
521 		int proxied = ip6_forward_proxy_check(skb);
522 		if (proxied > 0)
523 			return ip6_input(skb);
524 		else if (proxied < 0) {
525 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
526 			goto drop;
527 		}
528 	}
529 
530 	if (!xfrm6_route_forward(skb)) {
531 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
532 		goto drop;
533 	}
534 	dst = skb_dst(skb);
535 
536 	/* IPv6 specs say nothing about it, but it is clear that we cannot
537 	   send redirects to source routed frames.
538 	   We don't send redirects to frames decapsulated from IPsec.
539 	 */
540 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
541 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
542 		struct in6_addr *target = NULL;
543 		struct inet_peer *peer;
544 		struct rt6_info *rt;
545 
546 		/*
547 		 *	incoming and outgoing devices are the same
548 		 *	send a redirect.
549 		 */
550 
551 		rt = (struct rt6_info *) dst;
552 		if (rt->rt6i_flags & RTF_GATEWAY)
553 			target = &rt->rt6i_gateway;
554 		else
555 			target = &hdr->daddr;
556 
557 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
558 
559 		/* Limit redirects both by destination (here)
560 		   and by source (inside ndisc_send_redirect)
561 		 */
562 		if (inet_peer_xrlim_allow(peer, 1*HZ))
563 			ndisc_send_redirect(skb, target);
564 		if (peer)
565 			inet_putpeer(peer);
566 	} else {
567 		int addrtype = ipv6_addr_type(&hdr->saddr);
568 
569 		/* This check is security critical. */
570 		if (addrtype == IPV6_ADDR_ANY ||
571 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
572 			goto error;
573 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
574 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
575 				    ICMPV6_NOT_NEIGHBOUR, 0);
576 			goto error;
577 		}
578 	}
579 
580 	mtu = ip6_dst_mtu_forward(dst);
581 	if (mtu < IPV6_MIN_MTU)
582 		mtu = IPV6_MIN_MTU;
583 
584 	if (ip6_pkt_too_big(skb, mtu)) {
585 		/* Again, force OUTPUT device used as source address */
586 		skb->dev = dst->dev;
587 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
588 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
589 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
590 				IPSTATS_MIB_FRAGFAILS);
591 		kfree_skb(skb);
592 		return -EMSGSIZE;
593 	}
594 
595 	if (skb_cow(skb, dst->dev->hard_header_len)) {
596 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
597 				IPSTATS_MIB_OUTDISCARDS);
598 		goto drop;
599 	}
600 
601 	hdr = ipv6_hdr(skb);
602 
603 	/* Mangling hops number delayed to point after skb COW */
604 
605 	hdr->hop_limit--;
606 
607 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
608 		       net, NULL, skb, skb->dev, dst->dev,
609 		       ip6_forward_finish);
610 
611 error:
612 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
613 drop:
614 	kfree_skb(skb);
615 	return -EINVAL;
616 }
617 
618 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
619 {
620 	to->pkt_type = from->pkt_type;
621 	to->priority = from->priority;
622 	to->protocol = from->protocol;
623 	skb_dst_drop(to);
624 	skb_dst_set(to, dst_clone(skb_dst(from)));
625 	to->dev = from->dev;
626 	to->mark = from->mark;
627 
628 	skb_copy_hash(to, from);
629 
630 #ifdef CONFIG_NET_SCHED
631 	to->tc_index = from->tc_index;
632 #endif
633 	nf_copy(to, from);
634 	skb_ext_copy(to, from);
635 	skb_copy_secmark(to, from);
636 }
637 
638 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
639 		      u8 nexthdr, __be32 frag_id,
640 		      struct ip6_fraglist_iter *iter)
641 {
642 	unsigned int first_len;
643 	struct frag_hdr *fh;
644 
645 	/* BUILD HEADER */
646 	*prevhdr = NEXTHDR_FRAGMENT;
647 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
648 	if (!iter->tmp_hdr)
649 		return -ENOMEM;
650 
651 	iter->frag = skb_shinfo(skb)->frag_list;
652 	skb_frag_list_init(skb);
653 
654 	iter->offset = 0;
655 	iter->hlen = hlen;
656 	iter->frag_id = frag_id;
657 	iter->nexthdr = nexthdr;
658 
659 	__skb_pull(skb, hlen);
660 	fh = __skb_push(skb, sizeof(struct frag_hdr));
661 	__skb_push(skb, hlen);
662 	skb_reset_network_header(skb);
663 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
664 
665 	fh->nexthdr = nexthdr;
666 	fh->reserved = 0;
667 	fh->frag_off = htons(IP6_MF);
668 	fh->identification = frag_id;
669 
670 	first_len = skb_pagelen(skb);
671 	skb->data_len = first_len - skb_headlen(skb);
672 	skb->len = first_len;
673 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
674 
675 	return 0;
676 }
677 EXPORT_SYMBOL(ip6_fraglist_init);
678 
679 void ip6_fraglist_prepare(struct sk_buff *skb,
680 			  struct ip6_fraglist_iter *iter)
681 {
682 	struct sk_buff *frag = iter->frag;
683 	unsigned int hlen = iter->hlen;
684 	struct frag_hdr *fh;
685 
686 	frag->ip_summed = CHECKSUM_NONE;
687 	skb_reset_transport_header(frag);
688 	fh = __skb_push(frag, sizeof(struct frag_hdr));
689 	__skb_push(frag, hlen);
690 	skb_reset_network_header(frag);
691 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
692 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
693 	fh->nexthdr = iter->nexthdr;
694 	fh->reserved = 0;
695 	fh->frag_off = htons(iter->offset);
696 	if (frag->next)
697 		fh->frag_off |= htons(IP6_MF);
698 	fh->identification = iter->frag_id;
699 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
700 	ip6_copy_metadata(frag, skb);
701 }
702 EXPORT_SYMBOL(ip6_fraglist_prepare);
703 
704 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
705 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
706 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
707 {
708 	state->prevhdr = prevhdr;
709 	state->nexthdr = nexthdr;
710 	state->frag_id = frag_id;
711 
712 	state->hlen = hlen;
713 	state->mtu = mtu;
714 
715 	state->left = skb->len - hlen;	/* Space per frame */
716 	state->ptr = hlen;		/* Where to start from */
717 
718 	state->hroom = hdr_room;
719 	state->troom = needed_tailroom;
720 
721 	state->offset = 0;
722 }
723 EXPORT_SYMBOL(ip6_frag_init);
724 
725 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
726 {
727 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
728 	struct sk_buff *frag;
729 	struct frag_hdr *fh;
730 	unsigned int len;
731 
732 	len = state->left;
733 	/* IF: it doesn't fit, use 'mtu' - the data space left */
734 	if (len > state->mtu)
735 		len = state->mtu;
736 	/* IF: we are not sending up to and including the packet end
737 	   then align the next start on an eight byte boundary */
738 	if (len < state->left)
739 		len &= ~7;
740 
741 	/* Allocate buffer */
742 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
743 			 state->hroom + state->troom, GFP_ATOMIC);
744 	if (!frag)
745 		return ERR_PTR(-ENOMEM);
746 
747 	/*
748 	 *	Set up data on packet
749 	 */
750 
751 	ip6_copy_metadata(frag, skb);
752 	skb_reserve(frag, state->hroom);
753 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
754 	skb_reset_network_header(frag);
755 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
756 	frag->transport_header = (frag->network_header + state->hlen +
757 				  sizeof(struct frag_hdr));
758 
759 	/*
760 	 *	Charge the memory for the fragment to any owner
761 	 *	it might possess
762 	 */
763 	if (skb->sk)
764 		skb_set_owner_w(frag, skb->sk);
765 
766 	/*
767 	 *	Copy the packet header into the new buffer.
768 	 */
769 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
770 
771 	fragnexthdr_offset = skb_network_header(frag);
772 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
773 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
774 
775 	/*
776 	 *	Build fragment header.
777 	 */
778 	fh->nexthdr = state->nexthdr;
779 	fh->reserved = 0;
780 	fh->identification = state->frag_id;
781 
782 	/*
783 	 *	Copy a block of the IP datagram.
784 	 */
785 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
786 			     len));
787 	state->left -= len;
788 
789 	fh->frag_off = htons(state->offset);
790 	if (state->left > 0)
791 		fh->frag_off |= htons(IP6_MF);
792 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
793 
794 	state->ptr += len;
795 	state->offset += len;
796 
797 	return frag;
798 }
799 EXPORT_SYMBOL(ip6_frag_next);
800 
801 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
802 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
803 {
804 	struct sk_buff *frag;
805 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
806 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
807 				inet6_sk(skb->sk) : NULL;
808 	struct ip6_frag_state state;
809 	unsigned int mtu, hlen, nexthdr_offset;
810 	ktime_t tstamp = skb->tstamp;
811 	int hroom, err = 0;
812 	__be32 frag_id;
813 	u8 *prevhdr, nexthdr = 0;
814 
815 	err = ip6_find_1stfragopt(skb, &prevhdr);
816 	if (err < 0)
817 		goto fail;
818 	hlen = err;
819 	nexthdr = *prevhdr;
820 	nexthdr_offset = prevhdr - skb_network_header(skb);
821 
822 	mtu = ip6_skb_dst_mtu(skb);
823 
824 	/* We must not fragment if the socket is set to force MTU discovery
825 	 * or if the skb it not generated by a local socket.
826 	 */
827 	if (unlikely(!skb->ignore_df && skb->len > mtu))
828 		goto fail_toobig;
829 
830 	if (IP6CB(skb)->frag_max_size) {
831 		if (IP6CB(skb)->frag_max_size > mtu)
832 			goto fail_toobig;
833 
834 		/* don't send fragments larger than what we received */
835 		mtu = IP6CB(skb)->frag_max_size;
836 		if (mtu < IPV6_MIN_MTU)
837 			mtu = IPV6_MIN_MTU;
838 	}
839 
840 	if (np && np->frag_size < mtu) {
841 		if (np->frag_size)
842 			mtu = np->frag_size;
843 	}
844 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
845 		goto fail_toobig;
846 	mtu -= hlen + sizeof(struct frag_hdr);
847 
848 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
849 				    &ipv6_hdr(skb)->saddr);
850 
851 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
852 	    (err = skb_checksum_help(skb)))
853 		goto fail;
854 
855 	prevhdr = skb_network_header(skb) + nexthdr_offset;
856 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
857 	if (skb_has_frag_list(skb)) {
858 		unsigned int first_len = skb_pagelen(skb);
859 		struct ip6_fraglist_iter iter;
860 		struct sk_buff *frag2;
861 
862 		if (first_len - hlen > mtu ||
863 		    ((first_len - hlen) & 7) ||
864 		    skb_cloned(skb) ||
865 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
866 			goto slow_path;
867 
868 		skb_walk_frags(skb, frag) {
869 			/* Correct geometry. */
870 			if (frag->len > mtu ||
871 			    ((frag->len & 7) && frag->next) ||
872 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
873 				goto slow_path_clean;
874 
875 			/* Partially cloned skb? */
876 			if (skb_shared(frag))
877 				goto slow_path_clean;
878 
879 			BUG_ON(frag->sk);
880 			if (skb->sk) {
881 				frag->sk = skb->sk;
882 				frag->destructor = sock_wfree;
883 			}
884 			skb->truesize -= frag->truesize;
885 		}
886 
887 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
888 					&iter);
889 		if (err < 0)
890 			goto fail;
891 
892 		for (;;) {
893 			/* Prepare header of the next frame,
894 			 * before previous one went down. */
895 			if (iter.frag)
896 				ip6_fraglist_prepare(skb, &iter);
897 
898 			skb->tstamp = tstamp;
899 			err = output(net, sk, skb);
900 			if (!err)
901 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
902 					      IPSTATS_MIB_FRAGCREATES);
903 
904 			if (err || !iter.frag)
905 				break;
906 
907 			skb = ip6_fraglist_next(&iter);
908 		}
909 
910 		kfree(iter.tmp_hdr);
911 
912 		if (err == 0) {
913 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
914 				      IPSTATS_MIB_FRAGOKS);
915 			return 0;
916 		}
917 
918 		kfree_skb_list(iter.frag);
919 
920 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
921 			      IPSTATS_MIB_FRAGFAILS);
922 		return err;
923 
924 slow_path_clean:
925 		skb_walk_frags(skb, frag2) {
926 			if (frag2 == frag)
927 				break;
928 			frag2->sk = NULL;
929 			frag2->destructor = NULL;
930 			skb->truesize += frag2->truesize;
931 		}
932 	}
933 
934 slow_path:
935 	/*
936 	 *	Fragment the datagram.
937 	 */
938 
939 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
940 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
941 		      &state);
942 
943 	/*
944 	 *	Keep copying data until we run out.
945 	 */
946 
947 	while (state.left > 0) {
948 		frag = ip6_frag_next(skb, &state);
949 		if (IS_ERR(frag)) {
950 			err = PTR_ERR(frag);
951 			goto fail;
952 		}
953 
954 		/*
955 		 *	Put this fragment into the sending queue.
956 		 */
957 		frag->tstamp = tstamp;
958 		err = output(net, sk, frag);
959 		if (err)
960 			goto fail;
961 
962 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
963 			      IPSTATS_MIB_FRAGCREATES);
964 	}
965 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
966 		      IPSTATS_MIB_FRAGOKS);
967 	consume_skb(skb);
968 	return err;
969 
970 fail_toobig:
971 	if (skb->sk && dst_allfrag(skb_dst(skb)))
972 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
973 
974 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
975 	err = -EMSGSIZE;
976 
977 fail:
978 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
979 		      IPSTATS_MIB_FRAGFAILS);
980 	kfree_skb(skb);
981 	return err;
982 }
983 
984 static inline int ip6_rt_check(const struct rt6key *rt_key,
985 			       const struct in6_addr *fl_addr,
986 			       const struct in6_addr *addr_cache)
987 {
988 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
989 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
990 }
991 
992 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
993 					  struct dst_entry *dst,
994 					  const struct flowi6 *fl6)
995 {
996 	struct ipv6_pinfo *np = inet6_sk(sk);
997 	struct rt6_info *rt;
998 
999 	if (!dst)
1000 		goto out;
1001 
1002 	if (dst->ops->family != AF_INET6) {
1003 		dst_release(dst);
1004 		return NULL;
1005 	}
1006 
1007 	rt = (struct rt6_info *)dst;
1008 	/* Yes, checking route validity in not connected
1009 	 * case is not very simple. Take into account,
1010 	 * that we do not support routing by source, TOS,
1011 	 * and MSG_DONTROUTE		--ANK (980726)
1012 	 *
1013 	 * 1. ip6_rt_check(): If route was host route,
1014 	 *    check that cached destination is current.
1015 	 *    If it is network route, we still may
1016 	 *    check its validity using saved pointer
1017 	 *    to the last used address: daddr_cache.
1018 	 *    We do not want to save whole address now,
1019 	 *    (because main consumer of this service
1020 	 *    is tcp, which has not this problem),
1021 	 *    so that the last trick works only on connected
1022 	 *    sockets.
1023 	 * 2. oif also should be the same.
1024 	 */
1025 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1026 #ifdef CONFIG_IPV6_SUBTREES
1027 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1028 #endif
1029 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1030 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1031 		dst_release(dst);
1032 		dst = NULL;
1033 	}
1034 
1035 out:
1036 	return dst;
1037 }
1038 
/*
 * ip6_dst_lookup_tail - shared back end of the ip6_dst_lookup*() helpers
 * @net: network namespace to perform the lookup in
 * @sk: socket supplying the source-address preferences; may be NULL
 * @dst: in/out route pointer; a NULL *dst on entry means "no candidate yet"
 * @fl6: flow to resolve; fl6->saddr is filled in when it is unspecified
 *
 * Returns 0 with a usable route in *dst.  On any failure the route is
 * released, *dst is reset to NULL and the error code is returned; an
 * -ENETUNREACH result is additionally counted as IPSTATS_MIB_OUTNOROUTES.
 *
 * NOTE(review): with CONFIG_IPV6_OPTIMISTIC_DAD the block-local "rt" in the
 * source-address section shadows the function-level "rt".
 */
1039 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1040 			       struct dst_entry **dst, struct flowi6 *fl6)
1041 {
1042 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1043 	struct neighbour *n;
1044 	struct rt6_info *rt;
1045 #endif
1046 	int err;
1047 	int flags = 0;
1048 
1049 	/* The correct way to handle this would be to do
1050 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1051 	 * the route-specific preferred source forces the
1052 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1053 	 *
1054 	 * In source specific routing (no src=any default route),
1055 	 * ip6_route_output will fail given src=any saddr, though, so
1056 	 * that's why we try it again later.
1057 	 */
1058 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1059 		struct fib6_info *from;
1060 		struct rt6_info *rt;
1061 		bool had_dst = *dst != NULL;
1062 
1063 		if (!had_dst)
1064 			*dst = ip6_route_output(net, sk, fl6);
		/* An erroneous route contributes no "from" entry below. */
1065 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1066 
1067 		rcu_read_lock();
1068 		from = rt ? rcu_dereference(rt->from) : NULL;
1069 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1070 					  sk ? inet6_sk(sk)->srcprefs : 0,
1071 					  &fl6->saddr);
1072 		rcu_read_unlock();
1073 
1074 		if (err)
1075 			goto out_err_release;
1076 
1077 		/* If we had an erroneous initial result, pretend it
1078 		 * never existed and let the SA-enabled version take
1079 		 * over.
1080 		 */
1081 		if (!had_dst && (*dst)->error) {
1082 			dst_release(*dst);
1083 			*dst = NULL;
1084 		}
1085 
1086 		if (fl6->flowi6_oif)
1087 			flags |= RT6_LOOKUP_F_IFACE;
1088 	}
1089 
	/* Look up (again) now that a source address is available. */
1090 	if (!*dst)
1091 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1092 
1093 	err = (*dst)->error;
1094 	if (err)
1095 		goto out_err_release;
1096 
1097 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1098 	/*
1099 	 * Here if the dst entry we've looked up
1100 	 * has a neighbour entry that is in the INCOMPLETE
1101 	 * state and the src address from the flow is
1102 	 * marked as OPTIMISTIC, we release the found
1103 	 * dst entry and replace it instead with the
1104 	 * dst entry of the nexthop router
1105 	 */
1106 	rt = (struct rt6_info *) *dst;
1107 	rcu_read_lock_bh();
1108 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1109 				      rt6_nexthop(rt, &fl6->daddr));
	/*
	 * err is only a local flag here: it is set when a neighbour entry
	 * exists but is not in a NUD_VALID state.  Unless the re-lookup
	 * below fails and overwrites it, this value is never returned.
	 */
1110 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1111 	rcu_read_unlock_bh();
1112 
1113 	if (err) {
1114 		struct inet6_ifaddr *ifp;
1115 		struct flowi6 fl_gw6;
1116 		int redirect;
1117 
1118 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1119 				      (*dst)->dev, 1);
1120 
1121 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1122 		if (ifp)
1123 			in6_ifa_put(ifp);
1124 
1125 		if (redirect) {
1126 			/*
1127 			 * We need to get the dst entry for the
1128 			 * default router instead
1129 			 */
1130 			dst_release(*dst);
			/* Same flow, but with the destination zeroed out. */
1131 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1132 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1133 			*dst = ip6_route_output(net, sk, &fl_gw6);
1134 			err = (*dst)->error;
1135 			if (err)
1136 				goto out_err_release;
1137 		}
1138 	}
1139 #endif
	/*
	 * A v4-mapped source address is only accepted together with a
	 * v4-mapped or unspecified destination address.
	 */
1140 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1141 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1142 		err = -EAFNOSUPPORT;
1143 		goto out_err_release;
1144 	}
1145 
1146 	return 0;
1147 
	/* Common failure exit: drop the route and hand back no result. */
1148 out_err_release:
1149 	dst_release(*dst);
1150 	*dst = NULL;
1151 
1152 	if (err == -ENETUNREACH)
1153 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1154 	return err;
1155 }
1156 
1157 /**
1158  *	ip6_dst_lookup - perform route lookup on flow
1159  *	@net: Network namespace to perform lookup in
1160  *	@sk: socket which provides route info
1161  *	@dst: pointer to dst_entry * for result
1162  *	@fl6: flow to lookup
1163  *
1164  *	This function performs a route lookup on the given flow.
1165  *
1166  *	It returns zero on success, or a standard errno code on error.
1167  */
1168 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1169 		   struct flowi6 *fl6)
1170 {
1171 	*dst = NULL;
1172 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1173 }
1174 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1175 
1176 /**
1177  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1178  *	@net: Network namespace to perform lookup in
1179  *	@sk: socket which provides route info
1180  *	@fl6: flow to lookup
1181  *	@final_dst: final destination address for ipsec lookup
1182  *
1183  *	This function performs a route lookup on the given flow.
1184  *
1185  *	It returns a valid dst pointer on success, or a pointer encoded
1186  *	error code.
1187  */
1188 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1189 				      const struct in6_addr *final_dst)
1190 {
1191 	struct dst_entry *dst = NULL;
1192 	int err;
1193 
1194 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1195 	if (err)
1196 		return ERR_PTR(err);
1197 	if (final_dst)
1198 		fl6->daddr = *final_dst;
1199 
1200 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1201 }
1202 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1203 
1204 /**
1205  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1206  *	@sk: socket which provides the dst cache and route info
1207  *	@fl6: flow to lookup
1208  *	@final_dst: final destination address for ipsec lookup
1209  *	@connected: whether @sk is connected or not
1210  *
1211  *	This function performs a route lookup on the given flow with the
1212  *	possibility of using the cached route in the socket if it is valid.
1213  *	It will take the socket dst lock when operating on the dst cache.
1214  *	As a result, this function can only be used in process context.
1215  *
1216  *	In addition, for a connected socket, cache the dst in the socket
1217  *	if the current cache is not valid.
1218  *
1219  *	It returns a valid dst pointer on success, or a pointer encoded
1220  *	error code.
1221  */
1222 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1223 					 const struct in6_addr *final_dst,
1224 					 bool connected)
1225 {
1226 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1227 
1228 	dst = ip6_sk_dst_check(sk, dst, fl6);
1229 	if (dst)
1230 		return dst;
1231 
1232 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1233 	if (connected && !IS_ERR(dst))
1234 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1235 
1236 	return dst;
1237 }
1238 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1239 
1240 /**
1241  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1242  *      @skb: Packet for which lookup is done
1243  *      @dev: Tunnel device
1244  *      @net: Network namespace of tunnel device
1245  *      @sock: Socket which provides route info
1246  *      @saddr: Memory to store the src ip address
1247  *      @info: Tunnel information
1248  *      @protocol: IP protocol
1249  *      @use_cache: Flag to enable cache usage
1250  *      This function performs a route lookup on a tunnel
1251  *
1252  *      It returns a valid dst pointer and stores src address to be used in
1253  *      tunnel in param saddr on success, else a pointer encoded error code.
1254  */
1255 
1256 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1257 					struct net_device *dev,
1258 					struct net *net,
1259 					struct socket *sock,
1260 					struct in6_addr *saddr,
1261 					const struct ip_tunnel_info *info,
1262 					u8 protocol,
1263 					bool use_cache)
1264 {
1265 	struct dst_entry *dst = NULL;
1266 #ifdef CONFIG_DST_CACHE
1267 	struct dst_cache *dst_cache;
1268 #endif
1269 	struct flowi6 fl6;
1270 	__u8 prio;
1271 
1272 #ifdef CONFIG_DST_CACHE
1273 	dst_cache = (struct dst_cache *)&info->dst_cache;
1274 	if (use_cache) {
1275 		dst = dst_cache_get_ip6(dst_cache, saddr);
1276 		if (dst)
1277 			return dst;
1278 	}
1279 #endif
1280 	memset(&fl6, 0, sizeof(fl6));
1281 	fl6.flowi6_mark = skb->mark;
1282 	fl6.flowi6_proto = protocol;
1283 	fl6.daddr = info->key.u.ipv6.dst;
1284 	fl6.saddr = info->key.u.ipv6.src;
1285 	prio = info->key.tos;
1286 	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1287 					  info->key.label);
1288 
1289 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1290 					      NULL);
1291 	if (IS_ERR(dst)) {
1292 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1293 		return ERR_PTR(-ENETUNREACH);
1294 	}
1295 	if (dst->dev == dev) { /* is this necessary? */
1296 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1297 		dst_release(dst);
1298 		return ERR_PTR(-ELOOP);
1299 	}
1300 #ifdef CONFIG_DST_CACHE
1301 	if (use_cache)
1302 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1303 #endif
1304 	*saddr = fl6.saddr;
1305 	return dst;
1306 }
1307 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1308 
1309 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1310 					       gfp_t gfp)
1311 {
1312 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1313 }
1314 
1315 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1316 						gfp_t gfp)
1317 {
1318 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1319 }
1320 
1321 static void ip6_append_data_mtu(unsigned int *mtu,
1322 				int *maxfraglen,
1323 				unsigned int fragheaderlen,
1324 				struct sk_buff *skb,
1325 				struct rt6_info *rt,
1326 				unsigned int orig_mtu)
1327 {
1328 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1329 		if (!skb) {
1330 			/* first fragment, reserve header_len */
1331 			*mtu = orig_mtu - rt->dst.header_len;
1332 
1333 		} else {
1334 			/*
1335 			 * this fragment is not first, the headers
1336 			 * space is regarded as data space.
1337 			 */
1338 			*mtu = orig_mtu;
1339 		}
1340 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1341 			      + fragheaderlen - sizeof(struct frag_hdr);
1342 	}
1343 }
1344 
1345 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1346 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1347 			  struct rt6_info *rt, struct flowi6 *fl6)
1348 {
1349 	struct ipv6_pinfo *np = inet6_sk(sk);
1350 	unsigned int mtu;
1351 	struct ipv6_txoptions *opt = ipc6->opt;
1352 
1353 	/*
1354 	 * setup for corking
1355 	 */
1356 	if (opt) {
1357 		if (WARN_ON(v6_cork->opt))
1358 			return -EINVAL;
1359 
1360 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1361 		if (unlikely(!v6_cork->opt))
1362 			return -ENOBUFS;
1363 
1364 		v6_cork->opt->tot_len = sizeof(*opt);
1365 		v6_cork->opt->opt_flen = opt->opt_flen;
1366 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1367 
1368 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1369 						    sk->sk_allocation);
1370 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1371 			return -ENOBUFS;
1372 
1373 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1374 						    sk->sk_allocation);
1375 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1376 			return -ENOBUFS;
1377 
1378 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1379 						   sk->sk_allocation);
1380 		if (opt->hopopt && !v6_cork->opt->hopopt)
1381 			return -ENOBUFS;
1382 
1383 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1384 						    sk->sk_allocation);
1385 		if (opt->srcrt && !v6_cork->opt->srcrt)
1386 			return -ENOBUFS;
1387 
1388 		/* need source address above miyazawa*/
1389 	}
1390 	dst_hold(&rt->dst);
1391 	cork->base.dst = &rt->dst;
1392 	cork->fl.u.ip6 = *fl6;
1393 	v6_cork->hop_limit = ipc6->hlimit;
1394 	v6_cork->tclass = ipc6->tclass;
1395 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1396 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1397 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1398 	else
1399 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1400 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1401 	if (np->frag_size < mtu) {
1402 		if (np->frag_size)
1403 			mtu = np->frag_size;
1404 	}
1405 	if (mtu < IPV6_MIN_MTU)
1406 		return -EINVAL;
1407 	cork->base.fragsize = mtu;
1408 	cork->base.gso_size = ipc6->gso_size;
1409 	cork->base.tx_flags = 0;
1410 	cork->base.mark = ipc6->sockc.mark;
1411 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1412 
1413 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1414 		cork->base.flags |= IPCORK_ALLFRAG;
1415 	cork->base.length = 0;
1416 
1417 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1418 
1419 	return 0;
1420 }
1421 
/*
 * __ip6_append_data - queue payload for a pending (corked) IPv6 datagram
 * @sk: sending socket
 * @fl6: flow of the datagram; passed to ipv6_local_rxpmtu() and
 *	ipv6_local_error() when the data does not fit
 * @queue: skbs built so far; data is appended to the tail skb and new
 *	skbs are added at the tail
 * @cork: generic cork state (route, fragsize, gso_size, length, flags, ...)
 * @v6_cork: IPv6 cork state; its opt member supplies the extension headers
 * @pfrag: page fragment used when data is appended as paged frags
 * @getfrag: callback used to fetch the payload; presumably copies @len
 *	bytes of @from, starting at @offset, to @to - TODO confirm
 * @from: opaque cookie handed to @getfrag
 * @length: number of bytes to append; for the first skb this includes
 *	@transhdrlen
 * @transhdrlen: transport header length; non-zero also selects
 *	sock_alloc_send_skb() for the allocation
 * @flags: MSG_* flags (MSG_MORE, MSG_DONTWAIT, MSG_ZEROCOPY, MSG_CONFIRM)
 * @ipc6: per-call control data; only dontfrag is read here
 *
 * Returns 0 on success.  Returns -EMSGSIZE, after reporting the usable
 * size to the socket, when the data cannot be sent under the current
 * limits.  Other failures visible here are -ENOBUFS, -EINVAL, -EFAULT,
 * -ENOMEM and whatever sock_alloc_send_skb() or skb_zerocopy_iter_dgram()
 * report; those go through the "error" label, which takes the unconsumed
 * bytes back out of cork->length and counts an IPSTATS_MIB_OUTDISCARDS.
 */
1422 static int __ip6_append_data(struct sock *sk,
1423 			     struct flowi6 *fl6,
1424 			     struct sk_buff_head *queue,
1425 			     struct inet_cork *cork,
1426 			     struct inet6_cork *v6_cork,
1427 			     struct page_frag *pfrag,
1428 			     int getfrag(void *from, char *to, int offset,
1429 					 int len, int odd, struct sk_buff *skb),
1430 			     void *from, int length, int transhdrlen,
1431 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1432 {
1433 	struct sk_buff *skb, *skb_prev = NULL;
1434 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1435 	struct ubuf_info *uarg = NULL;
1436 	int exthdrlen = 0;
1437 	int dst_exthdrlen = 0;
1438 	int hh_len;
1439 	int copy;
1440 	int err;
1441 	int offset = 0;
1442 	u32 tskey = 0;
1443 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1444 	struct ipv6_txoptions *opt = v6_cork->opt;
1445 	int csummode = CHECKSUM_NONE;
1446 	unsigned int maxnonfragsize, headersize;
1447 	unsigned int wmem_alloc_delta = 0;
1448 	bool paged, extra_uref = false;
1449 
	/*
	 * exthdrlen and dst_exthdrlen are only non-zero while the queue is
	 * still empty, and are cleared again once the first skb is built.
	 */
1450 	skb = skb_peek_tail(queue);
1451 	if (!skb) {
1452 		exthdrlen = opt ? opt->opt_flen : 0;
1453 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1454 	}
1455 
	/*
	 * With a GSO size set, payload goes into page frags and the size
	 * limit becomes IP6_MAX_MTU instead of the corked fragment size.
	 */
1456 	paged = !!cork->gso_size;
1457 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1458 	orig_mtu = mtu;
1459 
1460 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1461 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1462 		tskey = sk->sk_tskey++;
1463 
1464 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1465 
1466 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1467 			(opt ? opt->opt_nflen : 0);
1468 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1469 		     sizeof(struct frag_hdr);
1470 
1471 	headersize = sizeof(struct ipv6hdr) +
1472 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1473 		     (dst_allfrag(&rt->dst) ?
1474 		      sizeof(struct frag_hdr) : 0) +
1475 		     rt->rt6i_nfheader_len;
1476 
1477 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1478 	 * the first fragment
1479 	 */
1480 	if (headersize + transhdrlen > mtu)
1481 		goto emsgsize;
1482 
1483 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1484 	    (sk->sk_protocol == IPPROTO_UDP ||
1485 	     sk->sk_protocol == IPPROTO_RAW)) {
1486 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1487 				sizeof(struct ipv6hdr));
1488 		goto emsgsize;
1489 	}
1490 
1491 	if (ip6_sk_ignore_df(sk))
1492 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1493 	else
1494 		maxnonfragsize = mtu;
1495 
1496 	if (cork->length + length > maxnonfragsize - headersize) {
		/* Also the target of the two early "goto emsgsize" above. */
1497 emsgsize:
1498 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1499 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1500 		return -EMSGSIZE;
1501 	}
1502 
1503 	/* CHECKSUM_PARTIAL only with no extension headers and when
1504 	 * we are not going to fragment
1505 	 */
1506 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1507 	    headersize == sizeof(struct ipv6hdr) &&
1508 	    length <= mtu - headersize &&
1509 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1510 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1511 		csummode = CHECKSUM_PARTIAL;
1512 
1513 	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1514 		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1515 		if (!uarg)
1516 			return -ENOBUFS;
1517 		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
		/*
		 * Zerocopy is only kept when the device does scatter-gather
		 * and the checksum is offloaded; otherwise the uarg is
		 * marked as not zerocopy and data is copied as usual.
		 */
1518 		if (rt->dst.dev->features & NETIF_F_SG &&
1519 		    csummode == CHECKSUM_PARTIAL) {
1520 			paged = true;
1521 		} else {
1522 			uarg->zerocopy = 0;
1523 			skb_zcopy_set(skb, uarg, &extra_uref);
1524 		}
1525 	}
1526 
1527 	/*
1528 	 * Let's try using as much space as possible.
1529 	 * Use MTU if total length of the message fits into the MTU.
1530 	 * Otherwise, we need to reserve fragment header and
1531 	 * fragment alignment (= 8-15 octets, in total).
1532 	 *
1533 	 * Note that we may need to "move" the data from the tail
1534 	 * of the buffer to the new fragment when we split
1535 	 * the message.
1536 	 *
1537 	 * FIXME: It may be fragmented into multiple chunks
1538 	 *        at once if non-fragmentable extension headers
1539 	 *        are too large.
1540 	 * --yoshfuji
1541 	 */
1542 
1543 	cork->length += length;
	/* Empty queue: jump straight into the allocation path of the loop. */
1544 	if (!skb)
1545 		goto alloc_new_skb;
1546 
1547 	while (length > 0) {
1548 		/* Check if the remaining data fits into current packet. */
1549 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1550 		if (copy < length)
1551 			copy = maxfraglen - skb->len;
1552 
1553 		if (copy <= 0) {
1554 			char *data;
1555 			unsigned int datalen;
1556 			unsigned int fraglen;
1557 			unsigned int fraggap;
1558 			unsigned int alloclen;
1559 			unsigned int pagedlen;
1560 alloc_new_skb:
1561 			/* There's no room in the current skb */
			/*
			 * fraggap is the part of the current skb that lies
			 * beyond maxfraglen; it is moved into the new skb
			 * further down.
			 */
1562 			if (skb)
1563 				fraggap = skb->len - maxfraglen;
1564 			else
1565 				fraggap = 0;
1566 			/* update mtu and maxfraglen if necessary */
1567 			if (!skb || !skb_prev)
1568 				ip6_append_data_mtu(&mtu, &maxfraglen,
1569 						    fragheaderlen, skb, rt,
1570 						    orig_mtu);
1571 
1572 			skb_prev = skb;
1573 
1574 			/*
1575 			 * If remaining data exceeds the mtu,
1576 			 * we know we need more fragment(s).
1577 			 */
1578 			datalen = length + fraggap;
1579 
1580 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1581 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1582 			fraglen = datalen + fragheaderlen;
1583 			pagedlen = 0;
1584 
			/*
			 * Linear allocation size: a full MTU when more data
			 * is announced and the device cannot do SG, the exact
			 * fragment when not paged, else at most MAX_HEADER
			 * with the remainder accounted as paged data.
			 */
1585 			if ((flags & MSG_MORE) &&
1586 			    !(rt->dst.dev->features&NETIF_F_SG))
1587 				alloclen = mtu;
1588 			else if (!paged)
1589 				alloclen = fraglen;
1590 			else {
1591 				alloclen = min_t(int, fraglen, MAX_HEADER);
1592 				pagedlen = fraglen - alloclen;
1593 			}
1594 
1595 			alloclen += dst_exthdrlen;
1596 
1597 			if (datalen != length + fraggap) {
1598 				/*
1599 				 * this is not the last fragment, the trailer
1600 				 * space is regarded as data space.
1601 				 */
1602 				datalen += rt->dst.trailer_len;
1603 			}
1604 
1605 			alloclen += rt->dst.trailer_len;
1606 			fraglen = datalen + fragheaderlen;
1607 
1608 			/*
1609 			 * We just reserve space for fragment header.
1610 			 * Note: this may be overallocation if the message
1611 			 * (without MSG_MORE) fits into the MTU.
1612 			 */
1613 			alloclen += sizeof(struct frag_hdr);
1614 
			/* Bytes to fetch through getfrag() into the linear area. */
1615 			copy = datalen - transhdrlen - fraggap - pagedlen;
1616 			if (copy < 0) {
1617 				err = -EINVAL;
1618 				goto error;
1619 			}
1620 			if (transhdrlen) {
1621 				skb = sock_alloc_send_skb(sk,
1622 						alloclen + hh_len,
1623 						(flags & MSG_DONTWAIT), &err);
1624 			} else {
1625 				skb = NULL;
				/*
				 * Plain allocation, refused once the socket's
				 * write memory (including what this call has
				 * added so far) exceeds twice sk_sndbuf.
				 */
1626 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1627 				    2 * sk->sk_sndbuf)
1628 					skb = alloc_skb(alloclen + hh_len,
1629 							sk->sk_allocation);
1630 				if (unlikely(!skb))
1631 					err = -ENOBUFS;
1632 			}
1633 			if (!skb)
1634 				goto error;
1635 			/*
1636 			 *	Fill in the control structures
1637 			 */
1638 			skb->protocol = htons(ETH_P_IPV6);
1639 			skb->ip_summed = csummode;
1640 			skb->csum = 0;
1641 			/* reserve for fragmentation and ipsec header */
1642 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1643 				    dst_exthdrlen);
1644 
1645 			/*
1646 			 *	Find where to start putting bytes
1647 			 */
1648 			data = skb_put(skb, fraglen - pagedlen);
1649 			skb_set_network_header(skb, exthdrlen);
1650 			data += fragheaderlen;
1651 			skb->transport_header = (skb->network_header +
1652 						 fragheaderlen);
1653 			if (fraggap) {
				/*
				 * Move the overhanging bytes from the previous
				 * skb into this one, carry their checksum
				 * along, and trim the previous skb.
				 */
1654 				skb->csum = skb_copy_and_csum_bits(
1655 					skb_prev, maxfraglen,
1656 					data + transhdrlen, fraggap);
1657 				skb_prev->csum = csum_sub(skb_prev->csum,
1658 							  skb->csum);
1659 				data += fraggap;
1660 				pskb_trim_unique(skb_prev, maxfraglen);
1661 			}
1662 			if (copy > 0 &&
1663 			    getfrag(from, data + transhdrlen, offset,
1664 				    copy, fraggap, skb) < 0) {
1665 				err = -EFAULT;
1666 				kfree_skb(skb);
1667 				goto error;
1668 			}
1669 
1670 			offset += copy;
1671 			length -= copy + transhdrlen;
1672 			transhdrlen = 0;
1673 			exthdrlen = 0;
1674 			dst_exthdrlen = 0;
1675 
1676 			/* Only the initial fragment is time stamped */
1677 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1678 			cork->tx_flags = 0;
1679 			skb_shinfo(skb)->tskey = tskey;
1680 			tskey = 0;
1681 			skb_zcopy_set(skb, uarg, &extra_uref);
1682 
1683 			if ((flags & MSG_CONFIRM) && !skb_prev)
1684 				skb_set_dst_pending_confirm(skb, 1);
1685 
1686 			/*
1687 			 * Put the packet on the pending queue
1688 			 */
			/*
			 * An skb that has no destructor yet is tied to the
			 * socket here; its truesize is collected in
			 * wmem_alloc_delta and added to sk_wmem_alloc in one
			 * step when the function returns.
			 */
1689 			if (!skb->destructor) {
1690 				skb->destructor = sock_wfree;
1691 				skb->sk = sk;
1692 				wmem_alloc_delta += skb->truesize;
1693 			}
1694 			__skb_queue_tail(queue, skb);
1695 			continue;
1696 		}
1697 
1698 		if (copy > length)
1699 			copy = length;
1700 
		/*
		 * There is room in the current skb.  Three ways to add data:
		 * into the linear tailroom (no SG and enough room), into a
		 * page frag, or via the zerocopy iterator.
		 */
1701 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1702 		    skb_tailroom(skb) >= copy) {
1703 			unsigned int off;
1704 
1705 			off = skb->len;
1706 			if (getfrag(from, skb_put(skb, copy),
1707 						offset, copy, off, skb) < 0) {
1708 				__skb_trim(skb, off);
1709 				err = -EFAULT;
1710 				goto error;
1711 			}
1712 		} else if (!uarg || !uarg->zerocopy) {
1713 			int i = skb_shinfo(skb)->nr_frags;
1714 
1715 			err = -ENOMEM;
1716 			if (!sk_page_frag_refill(sk, pfrag))
1717 				goto error;
1718 
1719 			if (!skb_can_coalesce(skb, i, pfrag->page,
1720 					      pfrag->offset)) {
1721 				err = -EMSGSIZE;
1722 				if (i == MAX_SKB_FRAGS)
1723 					goto error;
1724 
				/* Start a new, still empty, frag slot. */
1725 				__skb_fill_page_desc(skb, i, pfrag->page,
1726 						     pfrag->offset, 0);
1727 				skb_shinfo(skb)->nr_frags = ++i;
1728 				get_page(pfrag->page);
1729 			}
1730 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1731 			if (getfrag(from,
1732 				    page_address(pfrag->page) + pfrag->offset,
1733 				    offset, copy, skb->len, skb) < 0)
1734 				goto error_efault;
1735 
1736 			pfrag->offset += copy;
1737 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1738 			skb->len += copy;
1739 			skb->data_len += copy;
1740 			skb->truesize += copy;
1741 			wmem_alloc_delta += copy;
1742 		} else {
1743 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1744 			if (err < 0)
1745 				goto error;
1746 		}
1747 		offset += copy;
1748 		length -= copy;
1749 	}
1750 
1751 	if (wmem_alloc_delta)
1752 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1753 	return 0;
1754 
1755 error_efault:
1756 	err = -EFAULT;
1757 error:
1758 	net_zcopy_put_abort(uarg, extra_uref);
	/*
	 * length now holds only the bytes that were not consumed, so this
	 * takes back just that part of the earlier cork->length += length.
	 */
1759 	cork->length -= length;
1760 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1761 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1762 	return err;
1763 }
1764 
1765 int ip6_append_data(struct sock *sk,
1766 		    int getfrag(void *from, char *to, int offset, int len,
1767 				int odd, struct sk_buff *skb),
1768 		    void *from, int length, int transhdrlen,
1769 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1770 		    struct rt6_info *rt, unsigned int flags)
1771 {
1772 	struct inet_sock *inet = inet_sk(sk);
1773 	struct ipv6_pinfo *np = inet6_sk(sk);
1774 	int exthdrlen;
1775 	int err;
1776 
1777 	if (flags&MSG_PROBE)
1778 		return 0;
1779 	if (skb_queue_empty(&sk->sk_write_queue)) {
1780 		/*
1781 		 * setup for corking
1782 		 */
1783 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1784 				     ipc6, rt, fl6);
1785 		if (err)
1786 			return err;
1787 
1788 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1789 		length += exthdrlen;
1790 		transhdrlen += exthdrlen;
1791 	} else {
1792 		fl6 = &inet->cork.fl.u.ip6;
1793 		transhdrlen = 0;
1794 	}
1795 
1796 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1797 				 &np->cork, sk_page_frag(sk), getfrag,
1798 				 from, length, transhdrlen, flags, ipc6);
1799 }
1800 EXPORT_SYMBOL_GPL(ip6_append_data);
1801 
1802 static void ip6_cork_release(struct inet_cork_full *cork,
1803 			     struct inet6_cork *v6_cork)
1804 {
1805 	if (v6_cork->opt) {
1806 		kfree(v6_cork->opt->dst0opt);
1807 		kfree(v6_cork->opt->dst1opt);
1808 		kfree(v6_cork->opt->hopopt);
1809 		kfree(v6_cork->opt->srcrt);
1810 		kfree(v6_cork->opt);
1811 		v6_cork->opt = NULL;
1812 	}
1813 
1814 	if (cork->base.dst) {
1815 		dst_release(cork->base.dst);
1816 		cork->base.dst = NULL;
1817 		cork->base.flags &= ~IPCORK_ALLFRAG;
1818 	}
1819 	memset(&cork->fl, 0, sizeof(cork->fl));
1820 }
1821 
/*
 * __ip6_make_skb - turn the queued skbs of a corked datagram into one packet
 * @sk: sending socket
 * @queue: skbs collected for the pending datagram; emptied by this call
 * @cork: generic cork state (route, flow, mark, transmit time)
 * @v6_cork: IPv6 cork state (tx options, hop limit, traffic class)
 *
 * The first queued skb becomes the head; all further skbs are chained onto
 * its frag_list.  Extension headers and the IPv6 header are then pushed in
 * front of the payload, the route is attached and output statistics are
 * updated.  On success the cork is released.
 *
 * Returns the assembled skb, or NULL when @queue is empty.  In the NULL
 * case the cork is left untouched.
 */
1822 struct sk_buff *__ip6_make_skb(struct sock *sk,
1823 			       struct sk_buff_head *queue,
1824 			       struct inet_cork_full *cork,
1825 			       struct inet6_cork *v6_cork)
1826 {
1827 	struct sk_buff *skb, *tmp_skb;
1828 	struct sk_buff **tail_skb;
1829 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1830 	struct ipv6_pinfo *np = inet6_sk(sk);
1831 	struct net *net = sock_net(sk);
1832 	struct ipv6hdr *hdr;
1833 	struct ipv6_txoptions *opt = v6_cork->opt;
1834 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1835 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1836 	unsigned char proto = fl6->flowi6_proto;
1837 
1838 	skb = __skb_dequeue(queue);
1839 	if (!skb)
1840 		goto out;
1841 	tail_skb = &(skb_shinfo(skb)->frag_list);
1842 
1843 	/* move skb->data to ip header from ext header */
1844 	if (skb->data < skb_network_header(skb))
1845 		__skb_pull(skb, skb_network_offset(skb));
	/*
	 * Chain the remaining skbs onto the head's frag_list.  Their length
	 * and truesize are accounted to the head, and their own socket
	 * ownership (destructor, sk) is dropped.
	 */
1846 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1847 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1848 		*tail_skb = tmp_skb;
1849 		tail_skb = &(tmp_skb->next);
1850 		skb->len += tmp_skb->len;
1851 		skb->data_len += tmp_skb->len;
1852 		skb->truesize += tmp_skb->truesize;
1853 		tmp_skb->destructor = NULL;
1854 		tmp_skb->sk = NULL;
1855 	}
1856 
1857 	/* Allow local fragmentation. */
1858 	skb->ignore_df = ip6_sk_ignore_df(sk);
1859 
	/*
	 * final_dst starts as a copy of the flow's destination; it is passed
	 * by address to ipv6_push_nfrag_opts(), which may therefore change
	 * what ends up in the header's daddr below.
	 */
1860 	*final_dst = fl6->daddr;
1861 	__skb_pull(skb, skb_network_header_len(skb));
1862 	if (opt && opt->opt_flen)
1863 		ipv6_push_frag_opts(skb, opt, &proto);
1864 	if (opt && opt->opt_nflen)
1865 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1866 
1867 	skb_push(skb, sizeof(struct ipv6hdr));
1868 	skb_reset_network_header(skb);
1869 	hdr = ipv6_hdr(skb);
1870 
1871 	ip6_flow_hdr(hdr, v6_cork->tclass,
1872 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1873 					ip6_autoflowlabel(net, np), fl6));
1874 	hdr->hop_limit = v6_cork->hop_limit;
	/* proto was updated by the option push helpers above, if any ran. */
1875 	hdr->nexthdr = proto;
1876 	hdr->saddr = fl6->saddr;
1877 	hdr->daddr = *final_dst;
1878 
1879 	skb->priority = sk->sk_priority;
1880 	skb->mark = cork->base.mark;
1881 
1882 	skb->tstamp = cork->base.transmit_time;
1883 
	/* The skb gets its own route reference; the cork's is dropped below. */
1884 	skb_dst_set(skb, dst_clone(&rt->dst));
1885 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1886 	if (proto == IPPROTO_ICMPV6) {
1887 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1888 
1889 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1890 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1891 	}
1892 
1893 	ip6_cork_release(cork, v6_cork);
1894 out:
1895 	return skb;
1896 }
1897 
1898 int ip6_send_skb(struct sk_buff *skb)
1899 {
1900 	struct net *net = sock_net(skb->sk);
1901 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1902 	int err;
1903 
1904 	err = ip6_local_out(net, skb->sk, skb);
1905 	if (err) {
1906 		if (err > 0)
1907 			err = net_xmit_errno(err);
1908 		if (err)
1909 			IP6_INC_STATS(net, rt->rt6i_idev,
1910 				      IPSTATS_MIB_OUTDISCARDS);
1911 	}
1912 
1913 	return err;
1914 }
1915 
/*
 * Finish the datagram pending on @sk and send it.  Returns 0 when
 * ip6_finish_skb() yields no skb, otherwise the result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1927 
1928 static void __ip6_flush_pending_frames(struct sock *sk,
1929 				       struct sk_buff_head *queue,
1930 				       struct inet_cork_full *cork,
1931 				       struct inet6_cork *v6_cork)
1932 {
1933 	struct sk_buff *skb;
1934 
1935 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1936 		if (skb_dst(skb))
1937 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1938 				      IPSTATS_MIB_OUTDISCARDS);
1939 		kfree_skb(skb);
1940 	}
1941 
1942 	ip6_cork_release(cork, v6_cork);
1943 }
1944 
1945 void ip6_flush_pending_frames(struct sock *sk)
1946 {
1947 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1948 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1949 }
1950 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1951 
1952 struct sk_buff *ip6_make_skb(struct sock *sk,
1953 			     int getfrag(void *from, char *to, int offset,
1954 					 int len, int odd, struct sk_buff *skb),
1955 			     void *from, int length, int transhdrlen,
1956 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1957 			     struct rt6_info *rt, unsigned int flags,
1958 			     struct inet_cork_full *cork)
1959 {
1960 	struct inet6_cork v6_cork;
1961 	struct sk_buff_head queue;
1962 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1963 	int err;
1964 
1965 	if (flags & MSG_PROBE)
1966 		return NULL;
1967 
1968 	__skb_queue_head_init(&queue);
1969 
1970 	cork->base.flags = 0;
1971 	cork->base.addr = 0;
1972 	cork->base.opt = NULL;
1973 	cork->base.dst = NULL;
1974 	v6_cork.opt = NULL;
1975 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1976 	if (err) {
1977 		ip6_cork_release(cork, &v6_cork);
1978 		return ERR_PTR(err);
1979 	}
1980 	if (ipc6->dontfrag < 0)
1981 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1982 
1983 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1984 				&current->task_frag, getfrag, from,
1985 				length + exthdrlen, transhdrlen + exthdrlen,
1986 				flags, ipc6);
1987 	if (err) {
1988 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1989 		return ERR_PTR(err);
1990 	}
1991 
1992 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1993 }
1994