xref: /linux/net/ipv6/ip6_output.c (revision d0f482bb06f9447d44d2cae0386a0bd768c3cc16)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	airthmetics in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58 
/* Final IPv6 transmit step: resolve the L2 neighbour for the route's
 * nexthop and hand the skb to neigh_output().  Multicast packets may first
 * be looped back to local listeners and are subject to scope checks.
 *
 * Consumes @skb in all cases.  Returns the neigh_output()/lwtunnel_xmit()
 * result, 0 when the packet is deliberately discarded, or -EINVAL when no
 * neighbour entry could be created.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	const struct in6_addr *nexthop;
	struct neighbour *neigh;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to local listeners when the socket asked
		 * for multicast loopback and either a multicast router socket
		 * is interested (and the skb was not already forwarded) or a
		 * local process has joined the destination group.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0 means "do not put this on the wire":
			 * the looped-back copy (if any) was enough.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		/* Node-local scoped multicast must never leave the host */
		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* Lightweight tunnel may take over transmission entirely */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
127 
/* GSO packet whose segments would not fit the egress path: segment it in
 * software and push each resulting skb through ip6_fragment().
 *
 * Consumes @skb.  Returns 0 on success or the first error reported by
 * ip6_fragment(); later segments are still attempted after a failure.
 * @mtu is not used in this body (ip6_fragment() re-derives it); kept for
 * symmetry with the caller's IPv4 counterpart.
 */
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	/* Mask out GSO features so segmentation happens here in software */
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
		/* remember the first error, keep sending the rest */
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}
160 
/* Post-POSTROUTING output: re-route skbs that picked up an xfrm policy
 * after SNAT, otherwise transmit directly or fragment depending on how the
 * packet length compares to the path MTU.
 */
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		/* NOTE(review): the IPv4 IPCB()/IPSKB_REROUTED pair is used
		 * here on an IPv6 skb while ip6_output() tests
		 * IP6CB()/IP6SKB_REROUTED.  This mirrors long-standing
		 * upstream behaviour — confirm intent before "fixing".
		 */
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	/* Oversized GSO packet: segment in software, then fragment */
	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	/* Fragment when the packet exceeds the MTU, the route demands
	 * fragmentation of everything (dst_allfrag), or conntrack defrag
	 * recorded a smaller maximum fragment size on input.
	 */
	if ((skb->len > mtu && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
184 
185 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
186 {
187 	int ret;
188 
189 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
190 	switch (ret) {
191 	case NET_XMIT_SUCCESS:
192 		return __ip6_finish_output(net, sk, skb);
193 	case NET_XMIT_CN:
194 		return __ip6_finish_output(net, sk, skb) ? : ret;
195 	default:
196 		kfree_skb(skb);
197 		return ret;
198 	}
199 }
200 
/* IPv6 output entry point (installed as dst_output for IPv6 routes).
 * Stamps protocol and egress device on the skb, discards if IPv6 is
 * administratively disabled on the egress device, then traverses the
 * NF_INET_POST_ROUTING hook into ip6_finish_output() — except for skbs
 * already marked IP6SKB_REROUTED, which skip the hook.
 */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);
221 
222 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
223 {
224 	if (!np->autoflowlabel_set)
225 		return ip6_default_np_autolabel(net);
226 	else
227 		return np->autoflowlabel;
228 }
229 
230 /*
231  * xmit an sk_buff (used by TCP, SCTP and DCCP)
232  * Note : socket lock is not held for SYNACK packets, but might be modified
233  * by calls to skb_set_owner_w() and ipv6_local_error(),
234  * which are using proper atomic operations or spinlocks.
235  */
/* Build the IPv6 header (plus any extension headers from @opt) in front of
 * the transport payload in @skb and send it through NF_INET_LOCAL_OUT to
 * dst_output().  Oversized non-GSO packets that may not ignore the MTU are
 * rejected with a local EMSGSIZE error instead of being fragmented.
 *
 * Consumes @skb.  Returns the netfilter/dst_output result, 0 if an l3mdev
 * handler took the skb, -ENOBUFS on headroom allocation failure or
 * -EMSGSIZE when the packet exceeds the path MTU.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	/* Room for the fixed header, link-layer header and any extension
	 * headers we are about to push.
	 */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		/* keep socket memory accounting attached to the new skb */
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		/* may rewrite first_hop when a routing header is present */
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	/* Packet exceeds the MTU and may not be sent: report EMSGSIZE
	 * to the local sender via the error queue.
	 */
	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
336 
/* Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain whose selector matches @sel (and whose binding/namespace
 * constraints allow it).  Every match but the last receives a clone; the
 * final match consumes the original skb.
 *
 * Returns 1 when the skb was handed to at least one socket (caller must
 * not touch it any more), 0 when no socket matched.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			/* socket opted out of cross-namespace RA delivery */
			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			/* clone for the previous match; the last match will
			 * get the original below
			 */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
371 
/* Decide how a packet destined to a proxied (NDP proxy) address must be
 * handled instead of being forwarded.
 *
 * Returns 1 when the packet is a unicast neighbour-discovery message that
 * should be delivered to local input, -1 when it targets a link-local
 * address and must be discarded (link failure signalled to the sender),
 * and 0 when normal forwarding may continue.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Find the upper-layer header, skipping any extension headers */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* need at least the icmp6_type byte in linear data */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
423 
/* NF_INET_FORWARD okfn: account the forwarded datagram, let hardware-
 * forwarded skbs go (switchdev already transmitted them), clear the
 * receive timestamp and hand the skb to dst_output().
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	/* packet was already forwarded in hardware; just drop our copy */
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	/* a stale RX timestamp must not leak into the TX path */
	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}
442 
443 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
444 {
445 	if (skb->len <= mtu)
446 		return false;
447 
448 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
449 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
450 		return true;
451 
452 	if (skb->ignore_df)
453 		return false;
454 
455 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
456 		return false;
457 
458 	return true;
459 }
460 
/* Forward an IPv6 packet received on another interface: validate that
 * forwarding is permitted, divert Router Alert and NDP-proxy traffic,
 * enforce hop limit and path MTU, optionally emit a redirect, then
 * decrement the hop limit and traverse NF_INET_FORWARD into
 * ip6_forward_finish().
 *
 * Consumes @skb on every path.  Returns the hook/output result, 0 when the
 * packet was diverted (RA chain / local input), or a negative errno when it
 * was dropped.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* only packets addressed to us at L2 are forwarded */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* an skb already owned by a local socket must not be forwarded */
	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without ane WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have replaced the route */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow() may have reallocated the header */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
617 
/* Copy per-packet metadata (routing, QoS, netfilter, security state) from
 * @from onto a freshly built fragment @to so it is handled exactly like
 * the original packet on the rest of the output path.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* replace any dst on @to with a reference to @from's route */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}
637 
/* Prepare fast-path fragmentation over an skb that already carries a
 * frag_list: detach the list into @iter, save a copy of the network
 * headers (@hlen bytes) in iter->tmp_hdr, and rewrite @skb in place as the
 * first fragment (fragment header inserted, IP6_MF set, lengths trimmed to
 * the head portion).
 *
 * Returns 0 on success or -ENOMEM if the header copy cannot be allocated.
 * Caller must free iter->tmp_hdr when the walk is done.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	/* take ownership of the frag list; skb becomes list-free */
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* open a gap after the network headers and insert the frag header */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	/* shrink skb to just its head (paged) data: the first fragment */
	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
678 
/* Turn the next entry of the frag list (iter->frag) into a standalone
 * fragment: prepend the saved network headers plus a fragment header,
 * advance iter->offset by the payload just sent in @skb, and fill in
 * offset/MF/identification.  Metadata is copied from @skb.
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* offset of this fragment = previous offset + payload of @skb */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	/* more fragments follow unless this is the last list entry */
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
703 
704 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
705 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
706 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
707 {
708 	state->prevhdr = prevhdr;
709 	state->nexthdr = nexthdr;
710 	state->frag_id = frag_id;
711 
712 	state->hlen = hlen;
713 	state->mtu = mtu;
714 
715 	state->left = skb->len - hlen;	/* Space per frame */
716 	state->ptr = hlen;		/* Where to start from */
717 
718 	state->hroom = hdr_room;
719 	state->troom = needed_tailroom;
720 
721 	state->offset = 0;
722 }
723 EXPORT_SYMBOL(ip6_frag_init);
724 
/* Build and return the next fragment of @skb according to @state: allocate
 * a fresh skb, copy the replicated headers plus a fragment header plus up
 * to state->mtu bytes of payload (8-byte aligned except for the final
 * fragment), and advance the walk cursors.
 *
 * Returns the new fragment or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* patch the copied header chain to point at the fragment header */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	/* MF is set on every fragment except the last */
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
800 
/* Fragment @skb to fit the path MTU and emit each fragment via @output.
 * Uses the zero-copy frag-list fast path when the skb's frag list already
 * has suitable geometry, otherwise falls back to the copying slow path
 * (ip6_frag_init()/ip6_frag_next()).
 *
 * Consumes @skb.  Returns 0 on success, -EMSGSIZE when fragmentation is
 * not permitted (ICMPV6_PKT_TOOBIG is sent), or another negative errno.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	/* only trust inet6_sk() for locally generated, non-nested xmits */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	/* keep the offset, not the pointer: the head may be reallocated */
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	/* honour a smaller socket-configured fragment size */
	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	/* each fragment needs headers, a frag header and >= 8 payload bytes */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	/* partial checksums cannot survive fragmentation: finalize now
	 * (may reallocate the head — hence nexthdr_offset above)
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		/* fast path only when the existing list geometry already
		 * matches fragment-sized, 8-byte aligned pieces
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			/* move wmem accounting onto each list member */
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb->tstamp = tstamp;
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		/* output failed mid-list: discard the unsent remainder */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		/* undo the ownership transfer done before bailing out */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		frag->tstamp = tstamp;
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
983 
984 static inline int ip6_rt_check(const struct rt6key *rt_key,
985 			       const struct in6_addr *fl_addr,
986 			       const struct in6_addr *addr_cache)
987 {
988 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
989 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
990 }
991 
/* Validate a socket's cached dst against the flow @fl6.  Returns @dst when
 * it is still usable, or NULL after releasing it when it is stale (wrong
 * family, address mismatch per ip6_rt_check(), or oif mismatch).
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* cached entry is not even IPv6 (e.g. v4-mapped usage): drop it */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
1038 
/* Core route lookup for a flow: select a source address when the flow has
 * none, perform the routing lookup, and (with optimistic DAD) retry via
 * the default router when the chosen nexthop neighbour is not yet valid
 * for an optimistic source address.
 *
 * On success *@dst holds a referenced route and 0 is returned; on failure
 * *@dst is released and NULLed and a negative errno is returned.  May
 * write fl6->saddr.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/* a neighbour that exists but is not yet VALID blocks this path */
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* a v4-mapped source is only valid with a v4-mapped/any destination */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1156 
1157 /**
1158  *	ip6_dst_lookup - perform route lookup on flow
1159  *	@net: Network namespace to perform lookup in
1160  *	@sk: socket which provides route info
1161  *	@dst: pointer to dst_entry * for result
1162  *	@fl6: flow to lookup
1163  *
1164  *	This function performs a route lookup on the given flow.
1165  *
1166  *	It returns zero on success, or a standard errno code on error.
1167  */
1168 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1169 		   struct flowi6 *fl6)
1170 {
1171 	*dst = NULL;
1172 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1173 }
1174 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1175 
1176 /**
1177  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1178  *	@net: Network namespace to perform lookup in
1179  *	@sk: socket which provides route info
1180  *	@fl6: flow to lookup
1181  *	@final_dst: final destination address for ipsec lookup
1182  *
1183  *	This function performs a route lookup on the given flow.
1184  *
1185  *	It returns a valid dst pointer on success, or a pointer encoded
1186  *	error code.
1187  */
1188 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1189 				      const struct in6_addr *final_dst)
1190 {
1191 	struct dst_entry *dst = NULL;
1192 	int err;
1193 
1194 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1195 	if (err)
1196 		return ERR_PTR(err);
1197 	if (final_dst)
1198 		fl6->daddr = *final_dst;
1199 
1200 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1201 }
1202 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1203 
1204 /**
1205  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1206  *	@sk: socket which provides the dst cache and route info
1207  *	@fl6: flow to lookup
1208  *	@final_dst: final destination address for ipsec lookup
1209  *	@connected: whether @sk is connected or not
1210  *
1211  *	This function performs a route lookup on the given flow with the
1212  *	possibility of using the cached route in the socket if it is valid.
1213  *	It will take the socket dst lock when operating on the dst cache.
1214  *	As a result, this function can only be used in process context.
1215  *
1216  *	In addition, for a connected socket, cache the dst in the socket
1217  *	if the current cache is not valid.
1218  *
1219  *	It returns a valid dst pointer on success, or a pointer encoded
1220  *	error code.
1221  */
1222 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1223 					 const struct in6_addr *final_dst,
1224 					 bool connected)
1225 {
1226 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1227 
1228 	dst = ip6_sk_dst_check(sk, dst, fl6);
1229 	if (dst)
1230 		return dst;
1231 
1232 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1233 	if (connected && !IS_ERR(dst))
1234 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1235 
1236 	return dst;
1237 }
1238 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1239 
1240 /**
1241  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1242  *      @skb: Packet for which lookup is done
1243  *      @dev: Tunnel device
1244  *      @net: Network namespace of tunnel device
1245  *      @sock: Socket which provides route info
1246  *      @saddr: Memory to store the src ip address
1247  *      @info: Tunnel information
1248  *      @protocol: IP protocol
1249  *      @use_cache: Flag to enable cache usage
1250  *      This function performs a route lookup on a tunnel
1251  *
1252  *      It returns a valid dst pointer and stores src address to be used in
1253  *      tunnel in param saddr on success, else a pointer encoded error code.
1254  */
1255 
1256 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1257 					struct net_device *dev,
1258 					struct net *net,
1259 					struct socket *sock,
1260 					struct in6_addr *saddr,
1261 					const struct ip_tunnel_info *info,
1262 					u8 protocol,
1263 					bool use_cache)
1264 {
1265 	struct dst_entry *dst = NULL;
1266 #ifdef CONFIG_DST_CACHE
1267 	struct dst_cache *dst_cache;
1268 #endif
1269 	struct flowi6 fl6;
1270 	__u8 prio;
1271 
1272 #ifdef CONFIG_DST_CACHE
1273 	dst_cache = (struct dst_cache *)&info->dst_cache;
1274 	if (use_cache) {
1275 		dst = dst_cache_get_ip6(dst_cache, saddr);
1276 		if (dst)
1277 			return dst;
1278 	}
1279 #endif
1280 	memset(&fl6, 0, sizeof(fl6));
1281 	fl6.flowi6_mark = skb->mark;
1282 	fl6.flowi6_proto = protocol;
1283 	fl6.daddr = info->key.u.ipv6.dst;
1284 	fl6.saddr = info->key.u.ipv6.src;
1285 	prio = info->key.tos;
1286 	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1287 					  info->key.label);
1288 
1289 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1290 					      NULL);
1291 	if (IS_ERR(dst)) {
1292 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1293 		return ERR_PTR(-ENETUNREACH);
1294 	}
1295 	if (dst->dev == dev) { /* is this necessary? */
1296 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1297 		dst_release(dst);
1298 		return ERR_PTR(-ELOOP);
1299 	}
1300 #ifdef CONFIG_DST_CACHE
1301 	if (use_cache)
1302 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1303 #endif
1304 	*saddr = fl6.saddr;
1305 	return dst;
1306 }
1307 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1308 
1309 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1310 					       gfp_t gfp)
1311 {
1312 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1313 }
1314 
1315 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1316 						gfp_t gfp)
1317 {
1318 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1319 }
1320 
1321 static void ip6_append_data_mtu(unsigned int *mtu,
1322 				int *maxfraglen,
1323 				unsigned int fragheaderlen,
1324 				struct sk_buff *skb,
1325 				struct rt6_info *rt,
1326 				unsigned int orig_mtu)
1327 {
1328 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1329 		if (!skb) {
1330 			/* first fragment, reserve header_len */
1331 			*mtu = orig_mtu - rt->dst.header_len;
1332 
1333 		} else {
1334 			/*
1335 			 * this fragment is not first, the headers
1336 			 * space is regarded as data space.
1337 			 */
1338 			*mtu = orig_mtu;
1339 		}
1340 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1341 			      + fragheaderlen - sizeof(struct frag_hdr);
1342 	}
1343 }
1344 
/* Initialize cork state for a new corked transmission: take a private copy
 * of the tx options, grab a reference on the route, and derive the fragment
 * size from the path MTU / socket settings.  Returns 0 on success or a
 * negative errno.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		/* NOTE(review): on a failed dup below, options already copied
		 * into v6_cork->opt are not freed here — presumably the
		 * caller's error path releases them via ip6_cork_release();
		 * verify per caller.
		 */
		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	/* A smaller user-requested fragment size overrides the path MTU */
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1421 
/* Core append engine shared by ip6_append_data() and ip6_make_skb():
 * copies @length bytes obtained through @getfrag onto @queue, splitting
 * them into MTU-sized skbs (reserving room for the IPv6 header, extension
 * headers and a fragment header) and charging the socket's write memory.
 * Returns 0 on success or a negative errno; on error the queued skbs are
 * left for the caller to flush.
 */
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	/* Extension-header space is only accounted on the first skb */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		/* IPV6_DONTFRAG: report the path MTU instead of fragmenting */
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			/* device can't zerocopy here; fall back to copying */
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				/* keep only the headers linear; rest in pages */
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				/* first skb: may block waiting for send space */
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* move the overhang from the previous skb into
				 * this one, keeping its checksum consistent
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			/* device can't do SG: append into the linear area */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			/* copy into the socket's page frag and attach it */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			/* true zerocopy: link the user pages directly */
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	/* Charge all newly queued truesize to the socket in one step */
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
1766 
/**
 *	ip6_append_data - append data to the socket's corked write queue
 *	@sk: socket to send on
 *	@getfrag: callback that copies user data into the packet
 *	@from: opaque cookie passed to @getfrag
 *	@length: number of bytes to append
 *	@transhdrlen: transport header length (non-zero only for the first
 *		call of a corked sequence)
 *	@ipc6: ancillary data (hop limit, traffic class, options, ...)
 *	@fl6: flow describing the destination
 *	@rt: route to use
 *	@flags: MSG_* flags (MSG_MORE, MSG_PROBE, ...)
 *
 *	On the first call (empty write queue) the cork state is set up from
 *	@ipc6/@rt/@fl6; subsequent calls reuse the flow stored in the cork.
 *	MSG_PROBE performs no work.  Returns 0 or a negative errno.
 */
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		/* the destination-options length is carried as extra data
		 * on the first chunk only
		 */
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
1803 
1804 static void ip6_cork_release(struct inet_cork_full *cork,
1805 			     struct inet6_cork *v6_cork)
1806 {
1807 	if (v6_cork->opt) {
1808 		kfree(v6_cork->opt->dst0opt);
1809 		kfree(v6_cork->opt->dst1opt);
1810 		kfree(v6_cork->opt->hopopt);
1811 		kfree(v6_cork->opt->srcrt);
1812 		kfree(v6_cork->opt);
1813 		v6_cork->opt = NULL;
1814 	}
1815 
1816 	if (cork->base.dst) {
1817 		dst_release(cork->base.dst);
1818 		cork->base.dst = NULL;
1819 		cork->base.flags &= ~IPCORK_ALLFRAG;
1820 	}
1821 	memset(&cork->fl, 0, sizeof(cork->fl));
1822 }
1823 
/* Collapse the skbs queued by __ip6_append_data() into one packet (later
 * skbs chained on the first one's frag_list), push the extension headers
 * and the IPv6 header, update stats, and release the cork.  Returns the
 * finished skb, or NULL if the queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		/* absorb the follow-up skb's length/truesize into the head
		 * skb and detach it from the socket's memory accounting
		 */
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/* opt_nflen headers may rewrite the destination (e.g. routing
	 * header), so keep the original daddr for the IPv6 header
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1899 
1900 int ip6_send_skb(struct sk_buff *skb)
1901 {
1902 	struct net *net = sock_net(skb->sk);
1903 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1904 	int err;
1905 
1906 	err = ip6_local_out(net, skb->sk, skb);
1907 	if (err) {
1908 		if (err > 0)
1909 			err = net_xmit_errno(err);
1910 		if (err)
1911 			IP6_INC_STATS(net, rt->rt6i_idev,
1912 				      IPSTATS_MIB_OUTDISCARDS);
1913 	}
1914 
1915 	return err;
1916 }
1917 
/* Finish the corked data on @sk's write queue and transmit it.
 * Nothing pending means nothing to send, which is success.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1929 
1930 static void __ip6_flush_pending_frames(struct sock *sk,
1931 				       struct sk_buff_head *queue,
1932 				       struct inet_cork_full *cork,
1933 				       struct inet6_cork *v6_cork)
1934 {
1935 	struct sk_buff *skb;
1936 
1937 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1938 		if (skb_dst(skb))
1939 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1940 				      IPSTATS_MIB_OUTDISCARDS);
1941 		kfree_skb(skb);
1942 	}
1943 
1944 	ip6_cork_release(cork, v6_cork);
1945 }
1946 
/* Discard any corked data on @sk's write queue and release its cork state. */
void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1953 
/* One-shot (uncorked) transmit path: build a complete packet on a private
 * queue using caller-supplied cork storage, without touching the socket's
 * write queue.  Returns the finished skb, NULL for MSG_PROBE, or an
 * ERR_PTR on failure (cork state and any queued data are released).
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	/* start from pristine cork state; ip6_setup_cork() fills it in */
	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
1996