xref: /linux/net/ipv6/ip6_output.c (revision 49a695ba723224875df50e327bd7b0b65dd9a56b)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61 
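/* Final transmit step: resolve the IPv6 next hop to a neighbour entry and
 * hand the skb to the neighbour layer. Multicast packets may additionally
 * be looped back to local listeners, and are dropped when their scope is
 * too narrow to leave the box.
 */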
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64 	struct dst_entry *dst = skb_dst(skb);
65 	struct net_device *dev = dst->dev;
66 	struct neighbour *neigh;
67 	struct in6_addr *nexthop;
68 	int ret;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_is_socket(net, skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					net, sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(net, idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97 
98 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99 		    IPV6_ADDR_SCOPE_NODELOCAL &&
100 		    !(dev->flags & IFF_LOOPBACK)) {
101 			kfree_skb(skb);
102 			return 0;
103 		}
104 	}
105 
106 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107 		int res = lwtunnel_xmit(skb);
108 
109 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110 			return res;
111 	}
112 
113 	rcu_read_lock_bh();
114 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116 	if (unlikely(!neigh))
117 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118 	if (!IS_ERR(neigh)) {
119 		sock_confirm_neigh(skb, neigh);
120 		ret = neigh_output(neigh, skb);
121 		rcu_read_unlock_bh();
122 		return ret;
123 	}
124 	rcu_read_unlock_bh();
125 
126 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127 	kfree_skb(skb);
128 	return -EINVAL;
129 }
130 
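/* Runs after the POST_ROUTING hook: give the egress cgroup BPF program a
 * chance to drop the packet, re-invoke dst_output() if an xfrm policy
 * rerouted us after SNAT, and fragment when required (packet larger than
 * the dst MTU and not GSO, an allfrag route, or a conntrack-defrag
 * frag_max_size limit).
 */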
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133 	int ret;
134 
135 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136 	if (ret) {
137 		kfree_skb(skb);
138 		return ret;
139 	}
140 
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142 	/* Policy lookup after SNAT yielded a new policy */
143 	if (skb_dst(skb)->xfrm) {
144 		IPCB(skb)->flags |= IPSKB_REROUTED;
145 		return dst_output(net, sk, skb);
146 	}
147 #endif
148 
149 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150 	    dst_allfrag(skb_dst(skb)) ||
151 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
153 	else
154 		return ip6_finish_output2(net, sk, skb);
155 }
156 
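/* dst->output() entry point for IPv6: tag the skb as IPv6, honour the
 * per-device disable_ipv6 setting, and run the netfilter POST_ROUTING
 * hook unless the packet was already rerouted through it.
 */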
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159 	struct net_device *dev = skb_dst(skb)->dev;
160 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 
162 	skb->protocol = htons(ETH_P_IPV6);
163 	skb->dev = dev;
164 
165 	if (unlikely(idev->cnf.disable_ipv6)) {
166 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167 		kfree_skb(skb);
168 		return 0;
169 	}
170 
171 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172 			    net, sk, skb, NULL, dev,
173 			    ip6_finish_output,
174 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176 
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179 	if (!np->autoflowlabel_set)
180 		return ip6_default_np_autolabel(net);
181 	else
182 		return np->autoflowlabel;
183 }
184 
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note: the socket lock is not held for SYNACK packets, but the skb might
188  * be modified by calls to skb_set_owner_w() and ipv6_local_error(),
189  * which use proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192 	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194 	struct net *net = sock_net(sk);
195 	const struct ipv6_pinfo *np = inet6_sk(sk);
196 	struct in6_addr *first_hop = &fl6->daddr;
197 	struct dst_entry *dst = skb_dst(skb);
198 	struct ipv6hdr *hdr;
199 	u8  proto = fl6->flowi6_proto;
200 	int seg_len = skb->len;
201 	int hlimit = -1;
202 	u32 mtu;
203 
204 	if (opt) {
205 		unsigned int head_room;
206 
207 		/* First: exthdrs may take lots of space (~8K for now);
208 		   MAX_HEADER is not enough.
209 		 */
210 		head_room = opt->opt_nflen + opt->opt_flen;
211 		seg_len += head_room;
212 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
213 
214 		if (skb_headroom(skb) < head_room) {
215 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
216 			if (!skb2) {
217 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
218 					      IPSTATS_MIB_OUTDISCARDS);
219 				kfree_skb(skb);
220 				return -ENOBUFS;
221 			}
222 			consume_skb(skb);
223 			skb = skb2;
224 			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
225 			 * so it is safe to call in our context (socket lock not held)
226 			 */
227 			skb_set_owner_w(skb, (struct sock *)sk);
228 		}
229 		if (opt->opt_flen)
230 			ipv6_push_frag_opts(skb, opt, &proto);
231 		if (opt->opt_nflen)
232 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
233 					     &fl6->saddr);
234 	}
235 
236 	skb_push(skb, sizeof(struct ipv6hdr));
237 	skb_reset_network_header(skb);
238 	hdr = ipv6_hdr(skb);
239 
240 	/*
241 	 *	Fill in the IPv6 header
242 	 */
243 	if (np)
244 		hlimit = np->hop_limit;
245 	if (hlimit < 0)
246 		hlimit = ip6_dst_hoplimit(dst);
247 
248 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
249 				ip6_autoflowlabel(net, np), fl6));
250 
251 	hdr->payload_len = htons(seg_len);
252 	hdr->nexthdr = proto;
253 	hdr->hop_limit = hlimit;
254 
255 	hdr->saddr = fl6->saddr;
256 	hdr->daddr = *first_hop;
257 
258 	skb->protocol = htons(ETH_P_IPV6);
259 	skb->priority = sk->sk_priority;
260 	skb->mark = mark;
261 
262 	mtu = dst_mtu(dst);
263 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
264 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
265 			      IPSTATS_MIB_OUT, skb->len);
266 
267 		/* if egress device is enslaved to an L3 master device pass the
268 		 * skb to its handler for processing
269 		 */
270 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
271 		if (unlikely(!skb))
272 			return 0;
273 
274 		/* Hooks should never assume the socket lock is held,
275 		 * so we promote our socket to non-const.
276 		 */
277 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
278 			       net, (struct sock *)sk, skb, NULL, dst->dev,
279 			       dst_output);
280 	}
281 
282 	skb->dev = dst->dev;
283 	/* ipv6_local_error() does not require socket lock,
284 	 * so we promote our socket to non-const
285 	 */
286 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
287 
288 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
289 	kfree_skb(skb);
290 	return -EMSGSIZE;
291 }
292 EXPORT_SYMBOL(ip6_xmit);
293 
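/* Deliver clones of a Router Alert packet to every raw socket that
 * registered interest in this alert value via the IPV6_ROUTER_ALERT
 * socket option. Returns 1 if the original skb was consumed.
 */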
294 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
295 {
296 	struct ip6_ra_chain *ra;
297 	struct sock *last = NULL;
298 
299 	read_lock(&ip6_ra_lock);
300 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
301 		struct sock *sk = ra->sk;
302 		if (sk && ra->sel == sel &&
303 		    (!sk->sk_bound_dev_if ||
304 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
305 			if (last) {
306 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307 				if (skb2)
308 					rawv6_rcv(last, skb2);
309 			}
310 			last = sk;
311 		}
312 	}
313 
314 	if (last) {
315 		rawv6_rcv(last, skb);
316 		read_unlock(&ip6_ra_lock);
317 		return 1;
318 	}
319 	read_unlock(&ip6_ra_lock);
320 	return 0;
321 }
322 
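/* Decide what to do with a packet destined to an address we proxy:
 * 1 - hand NDISC messages to the local input path, -1 - refuse to
 * forward to a link-local destination, 0 - forward normally.
 */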
323 static int ip6_forward_proxy_check(struct sk_buff *skb)
324 {
325 	struct ipv6hdr *hdr = ipv6_hdr(skb);
326 	u8 nexthdr = hdr->nexthdr;
327 	__be16 frag_off;
328 	int offset;
329 
330 	if (ipv6_ext_hdr(nexthdr)) {
331 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
332 		if (offset < 0)
333 			return 0;
334 	} else
335 		offset = sizeof(struct ipv6hdr);
336 
337 	if (nexthdr == IPPROTO_ICMPV6) {
338 		struct icmp6hdr *icmp6;
339 
340 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
341 					 offset + 1 - skb->data)))
342 			return 0;
343 
344 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
345 
346 		switch (icmp6->icmp6_type) {
347 		case NDISC_ROUTER_SOLICITATION:
348 		case NDISC_ROUTER_ADVERTISEMENT:
349 		case NDISC_NEIGHBOUR_SOLICITATION:
350 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
351 		case NDISC_REDIRECT:
352 			/* For reactions involving a unicast neighbor discovery
353 			 * message destined to the proxied address, pass it to
354 			 * the input function.
355 			 */
356 			return 1;
357 		default:
358 			break;
359 		}
360 	}
361 
362 	/*
363 	 * The proxying router can't forward traffic sent to a link-local
364 	 * address, so signal the sender and discard the packet. This
365 	 * behavior is clarified by the MIPv6 specification.
366 	 */
367 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
368 		dst_link_failure(skb);
369 		return -1;
370 	}
371 
372 	return 0;
373 }
374 
375 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
376 				     struct sk_buff *skb)
377 {
378 	return dst_output(net, sk, skb);
379 }
380 
381 unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
382 {
383 	unsigned int mtu;
384 	struct inet6_dev *idev;
385 
386 	if (dst_metric_locked(dst, RTAX_MTU)) {
387 		mtu = dst_metric_raw(dst, RTAX_MTU);
388 		if (mtu)
389 			return mtu;
390 	}
391 
392 	mtu = IPV6_MIN_MTU;
393 	rcu_read_lock();
394 	idev = __in6_dev_get(dst->dev);
395 	if (idev)
396 		mtu = idev->cnf.mtu6;
397 	rcu_read_unlock();
398 
399 	return mtu;
400 }
401 EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
402 
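/* True when the skb exceeds @mtu and may not be forwarded as-is;
 * conntrack defrag limits, ignore_df and GSO are taken into account.
 */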
403 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
404 {
405 	if (skb->len <= mtu)
406 		return false;
407 
408 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
409 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
410 		return true;
411 
412 	if (skb->ignore_df)
413 		return false;
414 
415 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
416 		return false;
417 
418 	return true;
419 }
420 
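/* Forwarding path for packets not addressed to this host: validate the hop
 * limit, source address and XFRM policy, emit redirects or Packet Too Big
 * errors where appropriate, then decrement hop_limit and hand the packet
 * to the netfilter FORWARD hook.
 */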
421 int ip6_forward(struct sk_buff *skb)
422 {
423 	struct dst_entry *dst = skb_dst(skb);
424 	struct ipv6hdr *hdr = ipv6_hdr(skb);
425 	struct inet6_skb_parm *opt = IP6CB(skb);
426 	struct net *net = dev_net(dst->dev);
427 	u32 mtu;
428 
429 	if (net->ipv6.devconf_all->forwarding == 0)
430 		goto error;
431 
432 	if (skb->pkt_type != PACKET_HOST)
433 		goto drop;
434 
435 	if (unlikely(skb->sk))
436 		goto drop;
437 
438 	if (skb_warn_if_lro(skb))
439 		goto drop;
440 
441 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
442 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
443 				IPSTATS_MIB_INDISCARDS);
444 		goto drop;
445 	}
446 
447 	skb_forward_csum(skb);
448 
449 	/*
450 	 *	We do NOT do any processing on
451 	 *	RA packets; we push them to user level AS IS,
452 	 *	without any warranty that the application will
453 	 *	be able to interpret them. The reason is that
454 	 *	we cannot make anything clever here.
455 	 *
456 	 *	We are not an end node, so if the packet
457 	 *	contains AH/ESP we cannot do anything with it.
458 	 *	Defragmentation would also be a mistake; RA packets
459 	 *	cannot be fragmented, because there is no warranty
460 	 *	that different fragments will travel along one path. --ANK
461 	 */
462 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
463 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
464 			return 0;
465 	}
466 
467 	/*
468 	 *	check and decrement ttl
469 	 */
470 	if (hdr->hop_limit <= 1) {
471 		/* Force OUTPUT device used as source address */
472 		skb->dev = dst->dev;
473 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
474 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
475 				IPSTATS_MIB_INHDRERRORS);
476 
477 		kfree_skb(skb);
478 		return -ETIMEDOUT;
479 	}
480 
481 	/* XXX: idev->cnf.proxy_ndp? */
482 	if (net->ipv6.devconf_all->proxy_ndp &&
483 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
484 		int proxied = ip6_forward_proxy_check(skb);
485 		if (proxied > 0)
486 			return ip6_input(skb);
487 		else if (proxied < 0) {
488 			__IP6_INC_STATS(net, ip6_dst_idev(dst),
489 					IPSTATS_MIB_INDISCARDS);
490 			goto drop;
491 		}
492 	}
493 
494 	if (!xfrm6_route_forward(skb)) {
495 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
496 				IPSTATS_MIB_INDISCARDS);
497 		goto drop;
498 	}
499 	dst = skb_dst(skb);
500 
501 	/* IPv6 specs say nothing about it, but it is clear that we cannot
502 	   send redirects to source routed frames.
503 	   We don't send redirects to frames decapsulated from IPsec.
504 	 */
505 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
506 		struct in6_addr *target = NULL;
507 		struct inet_peer *peer;
508 		struct rt6_info *rt;
509 
510 		/*
511 		 *	incoming and outgoing devices are the same;
512 		 *	send a redirect.
513 		 */
514 
515 		rt = (struct rt6_info *) dst;
516 		if (rt->rt6i_flags & RTF_GATEWAY)
517 			target = &rt->rt6i_gateway;
518 		else
519 			target = &hdr->daddr;
520 
521 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
522 
523 		/* Limit redirects both by destination (here)
524 		   and by source (inside ndisc_send_redirect)
525 		 */
526 		if (inet_peer_xrlim_allow(peer, 1*HZ))
527 			ndisc_send_redirect(skb, target);
528 		if (peer)
529 			inet_putpeer(peer);
530 	} else {
531 		int addrtype = ipv6_addr_type(&hdr->saddr);
532 
533 		/* This check is security critical. */
534 		if (addrtype == IPV6_ADDR_ANY ||
535 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
536 			goto error;
537 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
538 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
539 				    ICMPV6_NOT_NEIGHBOUR, 0);
540 			goto error;
541 		}
542 	}
543 
544 	mtu = ip6_dst_mtu_forward(dst);
545 	if (mtu < IPV6_MIN_MTU)
546 		mtu = IPV6_MIN_MTU;
547 
548 	if (ip6_pkt_too_big(skb, mtu)) {
549 		/* Again, force OUTPUT device used as source address */
550 		skb->dev = dst->dev;
551 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
552 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
553 				IPSTATS_MIB_INTOOBIGERRORS);
554 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
555 				IPSTATS_MIB_FRAGFAILS);
556 		kfree_skb(skb);
557 		return -EMSGSIZE;
558 	}
559 
560 	if (skb_cow(skb, dst->dev->hard_header_len)) {
561 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
562 				IPSTATS_MIB_OUTDISCARDS);
563 		goto drop;
564 	}
565 
566 	hdr = ipv6_hdr(skb);
567 
568 	/* Mangling hops number delayed to point after skb COW */
569 
570 	hdr->hop_limit--;
571 
572 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
573 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
574 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
575 		       net, NULL, skb, skb->dev, dst->dev,
576 		       ip6_forward_finish);
577 
578 error:
579 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
580 drop:
581 	kfree_skb(skb);
582 	return -EINVAL;
583 }
584 
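/* Propagate per-packet state (dst, device, mark, conntrack state, etc.)
 * from the original skb to a freshly allocated fragment.
 */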
585 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
586 {
587 	to->pkt_type = from->pkt_type;
588 	to->priority = from->priority;
589 	to->protocol = from->protocol;
590 	skb_dst_drop(to);
591 	skb_dst_set(to, dst_clone(skb_dst(from)));
592 	to->dev = from->dev;
593 	to->mark = from->mark;
594 
595 #ifdef CONFIG_NET_SCHED
596 	to->tc_index = from->tc_index;
597 #endif
598 	nf_copy(to, from);
599 	skb_copy_secmark(to, from);
600 }
601 
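/* Split an oversized skb into fragments and feed each one to @output.
 * The fast path reuses an existing frag_list when its geometry already
 * matches; otherwise the slow path copies the payload into newly
 * allocated skbs.
 */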
602 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
603 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
604 {
605 	struct sk_buff *frag;
606 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
607 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
608 				inet6_sk(skb->sk) : NULL;
609 	struct ipv6hdr *tmp_hdr;
610 	struct frag_hdr *fh;
611 	unsigned int mtu, hlen, left, len;
612 	int hroom, troom;
613 	__be32 frag_id;
614 	int ptr, offset = 0, err = 0;
615 	u8 *prevhdr, nexthdr = 0;
616 
617 	err = ip6_find_1stfragopt(skb, &prevhdr);
618 	if (err < 0)
619 		goto fail;
620 	hlen = err;
621 	nexthdr = *prevhdr;
622 
623 	mtu = ip6_skb_dst_mtu(skb);
624 
625 	/* We must not fragment if the socket is set to force MTU discovery
626 	 * or if the skb is not generated by a local socket.
627 	 */
628 	if (unlikely(!skb->ignore_df && skb->len > mtu))
629 		goto fail_toobig;
630 
631 	if (IP6CB(skb)->frag_max_size) {
632 		if (IP6CB(skb)->frag_max_size > mtu)
633 			goto fail_toobig;
634 
635 		/* don't send fragments larger than what we received */
636 		mtu = IP6CB(skb)->frag_max_size;
637 		if (mtu < IPV6_MIN_MTU)
638 			mtu = IPV6_MIN_MTU;
639 	}
640 
641 	if (np && np->frag_size < mtu) {
642 		if (np->frag_size)
643 			mtu = np->frag_size;
644 	}
645 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
646 		goto fail_toobig;
647 	mtu -= hlen + sizeof(struct frag_hdr);
648 
649 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
650 				    &ipv6_hdr(skb)->saddr);
651 
652 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
653 	    (err = skb_checksum_help(skb)))
654 		goto fail;
655 
656 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
657 	if (skb_has_frag_list(skb)) {
658 		unsigned int first_len = skb_pagelen(skb);
659 		struct sk_buff *frag2;
660 
661 		if (first_len - hlen > mtu ||
662 		    ((first_len - hlen) & 7) ||
663 		    skb_cloned(skb) ||
664 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
665 			goto slow_path;
666 
667 		skb_walk_frags(skb, frag) {
668 			/* Correct geometry. */
669 			if (frag->len > mtu ||
670 			    ((frag->len & 7) && frag->next) ||
671 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
672 				goto slow_path_clean;
673 
674 			/* Partially cloned skb? */
675 			if (skb_shared(frag))
676 				goto slow_path_clean;
677 
678 			BUG_ON(frag->sk);
679 			if (skb->sk) {
680 				frag->sk = skb->sk;
681 				frag->destructor = sock_wfree;
682 			}
683 			skb->truesize -= frag->truesize;
684 		}
685 
686 		err = 0;
687 		offset = 0;
688 		/* BUILD HEADER */
689 
690 		*prevhdr = NEXTHDR_FRAGMENT;
691 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
692 		if (!tmp_hdr) {
693 			err = -ENOMEM;
694 			goto fail;
695 		}
696 		frag = skb_shinfo(skb)->frag_list;
697 		skb_frag_list_init(skb);
698 
699 		__skb_pull(skb, hlen);
700 		fh = __skb_push(skb, sizeof(struct frag_hdr));
701 		__skb_push(skb, hlen);
702 		skb_reset_network_header(skb);
703 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
704 
705 		fh->nexthdr = nexthdr;
706 		fh->reserved = 0;
707 		fh->frag_off = htons(IP6_MF);
708 		fh->identification = frag_id;
709 
710 		first_len = skb_pagelen(skb);
711 		skb->data_len = first_len - skb_headlen(skb);
712 		skb->len = first_len;
713 		ipv6_hdr(skb)->payload_len = htons(first_len -
714 						   sizeof(struct ipv6hdr));
715 
716 		for (;;) {
717 			/* Prepare header of the next frame,
718 			 * before the previous one goes down. */
719 			if (frag) {
720 				frag->ip_summed = CHECKSUM_NONE;
721 				skb_reset_transport_header(frag);
722 				fh = __skb_push(frag, sizeof(struct frag_hdr));
723 				__skb_push(frag, hlen);
724 				skb_reset_network_header(frag);
725 				memcpy(skb_network_header(frag), tmp_hdr,
726 				       hlen);
727 				offset += skb->len - hlen - sizeof(struct frag_hdr);
728 				fh->nexthdr = nexthdr;
729 				fh->reserved = 0;
730 				fh->frag_off = htons(offset);
731 				if (frag->next)
732 					fh->frag_off |= htons(IP6_MF);
733 				fh->identification = frag_id;
734 				ipv6_hdr(frag)->payload_len =
735 						htons(frag->len -
736 						      sizeof(struct ipv6hdr));
737 				ip6_copy_metadata(frag, skb);
738 			}
739 
740 			err = output(net, sk, skb);
741 			if (!err)
742 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
743 					      IPSTATS_MIB_FRAGCREATES);
744 
745 			if (err || !frag)
746 				break;
747 
748 			skb = frag;
749 			frag = skb->next;
750 			skb->next = NULL;
751 		}
752 
753 		kfree(tmp_hdr);
754 
755 		if (err == 0) {
756 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
757 				      IPSTATS_MIB_FRAGOKS);
758 			return 0;
759 		}
760 
761 		kfree_skb_list(frag);
762 
763 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
764 			      IPSTATS_MIB_FRAGFAILS);
765 		return err;
766 
767 slow_path_clean:
768 		skb_walk_frags(skb, frag2) {
769 			if (frag2 == frag)
770 				break;
771 			frag2->sk = NULL;
772 			frag2->destructor = NULL;
773 			skb->truesize += frag2->truesize;
774 		}
775 	}
776 
777 slow_path:
778 	left = skb->len - hlen;		/* Space per frame */
779 	ptr = hlen;			/* Where to start from */
780 
781 	/*
782 	 *	Fragment the datagram.
783 	 */
784 
785 	troom = rt->dst.dev->needed_tailroom;
786 
787 	/*
788 	 *	Keep copying data until we run out.
789 	 */
790 	while (left > 0)	{
791 		u8 *fragnexthdr_offset;
792 
793 		len = left;
794 		/* IF: it doesn't fit, use 'mtu' - the data space left */
795 		if (len > mtu)
796 			len = mtu;
797 		/* IF: we are not sending up to and including the packet end,
798 		   then align the next start on an eight byte boundary */
799 		if (len < left)	{
800 			len &= ~7;
801 		}
802 
803 		/* Allocate buffer */
804 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
805 				 hroom + troom, GFP_ATOMIC);
806 		if (!frag) {
807 			err = -ENOMEM;
808 			goto fail;
809 		}
810 
811 		/*
812 		 *	Set up data on packet
813 		 */
814 
815 		ip6_copy_metadata(frag, skb);
816 		skb_reserve(frag, hroom);
817 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
818 		skb_reset_network_header(frag);
819 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
820 		frag->transport_header = (frag->network_header + hlen +
821 					  sizeof(struct frag_hdr));
822 
823 		/*
824 		 *	Charge the memory for the fragment to any owner
825 		 *	it might possess
826 		 */
827 		if (skb->sk)
828 			skb_set_owner_w(frag, skb->sk);
829 
830 		/*
831 		 *	Copy the packet header into the new buffer.
832 		 */
833 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
834 
835 		fragnexthdr_offset = skb_network_header(frag);
836 		fragnexthdr_offset += prevhdr - skb_network_header(skb);
837 		*fragnexthdr_offset = NEXTHDR_FRAGMENT;
838 
839 		/*
840 		 *	Build fragment header.
841 		 */
842 		fh->nexthdr = nexthdr;
843 		fh->reserved = 0;
844 		fh->identification = frag_id;
845 
846 		/*
847 		 *	Copy a block of the IP datagram.
848 		 */
849 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
850 				     len));
851 		left -= len;
852 
853 		fh->frag_off = htons(offset);
854 		if (left > 0)
855 			fh->frag_off |= htons(IP6_MF);
856 		ipv6_hdr(frag)->payload_len = htons(frag->len -
857 						    sizeof(struct ipv6hdr));
858 
859 		ptr += len;
860 		offset += len;
861 
862 		/*
863 		 *	Put this fragment into the sending queue.
864 		 */
865 		err = output(net, sk, frag);
866 		if (err)
867 			goto fail;
868 
869 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
870 			      IPSTATS_MIB_FRAGCREATES);
871 	}
872 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
873 		      IPSTATS_MIB_FRAGOKS);
874 	consume_skb(skb);
875 	return err;
876 
877 fail_toobig:
878 	if (skb->sk && dst_allfrag(skb_dst(skb)))
879 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
880 
881 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
882 	err = -EMSGSIZE;
883 
884 fail:
885 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
886 		      IPSTATS_MIB_FRAGFAILS);
887 	kfree_skb(skb);
888 	return err;
889 }
890 
891 static inline int ip6_rt_check(const struct rt6key *rt_key,
892 			       const struct in6_addr *fl_addr,
893 			       const struct in6_addr *addr_cache)
894 {
895 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
896 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
897 }
898 
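/* Validate a dst cached on the socket against the current flow; return
 * the dst if it is still usable, otherwise release it and return NULL.
 */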
899 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
900 					  struct dst_entry *dst,
901 					  const struct flowi6 *fl6)
902 {
903 	struct ipv6_pinfo *np = inet6_sk(sk);
904 	struct rt6_info *rt;
905 
906 	if (!dst)
907 		goto out;
908 
909 	if (dst->ops->family != AF_INET6) {
910 		dst_release(dst);
911 		return NULL;
912 	}
913 
914 	rt = (struct rt6_info *)dst;
915 	/* Yes, checking route validity in the not-connected
916 	 * case is not very simple. Take into account
917 	 * that we do not support routing by source, TOS,
918 	 * or MSG_DONTROUTE		--ANK (980726)
919 	 *
920 	 * 1. ip6_rt_check(): If the route was a host route,
921 	 *    check that the cached destination is current.
922 	 *    If it is a network route, we may still
923 	 *    check its validity using a saved pointer
924 	 *    to the last used address: daddr_cache.
925 	 *    We do not want to save the whole address now
926 	 *    (because the main consumer of this service
927 	 *    is TCP, which does not have this problem),
928 	 *    so this last trick works only on connected
929 	 *    sockets.
930 	 * 2. oif should also be the same.
931 	 */
932 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
933 #ifdef CONFIG_IPV6_SUBTREES
934 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
935 #endif
936 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
937 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
938 		dst_release(dst);
939 		dst = NULL;
940 	}
941 
942 out:
943 	return dst;
944 }
945 
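/* Core of the dst lookup helpers below: resolve a route for @fl6, pick a
 * source address when none was given and, with optimistic DAD, fall back
 * to the default router's dst entry when the next hop is not yet known to
 * be reachable from an optimistic address.
 */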
946 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
947 			       struct dst_entry **dst, struct flowi6 *fl6)
948 {
949 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
950 	struct neighbour *n;
951 	struct rt6_info *rt;
952 #endif
953 	int err;
954 	int flags = 0;
955 
956 	/* The correct way to handle this would be to do
957 	 * ip6_route_get_saddr, and then ip6_route_output; however,
958 	 * the route-specific preferred source forces the
959 	 * ip6_route_output call _before_ ip6_route_get_saddr.
960 	 *
961 	 * In source specific routing (no src=any default route),
962 	 * ip6_route_output will fail given src=any saddr, though, so
963 	 * that's why we try it again later.
964 	 */
965 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
966 		struct rt6_info *rt;
967 		bool had_dst = *dst != NULL;
968 
969 		if (!had_dst)
970 			*dst = ip6_route_output(net, sk, fl6);
971 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
972 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
973 					  sk ? inet6_sk(sk)->srcprefs : 0,
974 					  &fl6->saddr);
975 		if (err)
976 			goto out_err_release;
977 
978 		/* If we had an erroneous initial result, pretend it
979 		 * never existed and let the SA-enabled version take
980 		 * over.
981 		 */
982 		if (!had_dst && (*dst)->error) {
983 			dst_release(*dst);
984 			*dst = NULL;
985 		}
986 
987 		if (fl6->flowi6_oif)
988 			flags |= RT6_LOOKUP_F_IFACE;
989 	}
990 
991 	if (!*dst)
992 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
993 
994 	err = (*dst)->error;
995 	if (err)
996 		goto out_err_release;
997 
998 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
999 	/*
1000 	 * Here, if the dst entry we've looked up
1001 	 * has a neighbour entry that is in the INCOMPLETE
1002 	 * state and the src address from the flow is
1003 	 * marked as OPTIMISTIC, we release the found
1004 	 * dst entry and replace it with the dst entry
1005 	 * of the nexthop router.
1006 	 */
1007 	rt = (struct rt6_info *) *dst;
1008 	rcu_read_lock_bh();
1009 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1010 				      rt6_nexthop(rt, &fl6->daddr));
1011 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1012 	rcu_read_unlock_bh();
1013 
1014 	if (err) {
1015 		struct inet6_ifaddr *ifp;
1016 		struct flowi6 fl_gw6;
1017 		int redirect;
1018 
1019 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1020 				      (*dst)->dev, 1);
1021 
1022 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1023 		if (ifp)
1024 			in6_ifa_put(ifp);
1025 
1026 		if (redirect) {
1027 			/*
1028 			 * We need to get the dst entry for the
1029 			 * default router instead
1030 			 */
1031 			dst_release(*dst);
1032 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1033 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1034 			*dst = ip6_route_output(net, sk, &fl_gw6);
1035 			err = (*dst)->error;
1036 			if (err)
1037 				goto out_err_release;
1038 		}
1039 	}
1040 #endif
1041 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1042 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1043 		err = -EAFNOSUPPORT;
1044 		goto out_err_release;
1045 	}
1046 
1047 	return 0;
1048 
1049 out_err_release:
1050 	dst_release(*dst);
1051 	*dst = NULL;
1052 
1053 	if (err == -ENETUNREACH)
1054 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1055 	return err;
1056 }
1057 
1058 /**
1059  *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace in which to perform the lookup
1060  *	@sk: socket which provides route info
1061  *	@dst: pointer to dst_entry * for result
1062  *	@fl6: flow to lookup
1063  *
1064  *	This function performs a route lookup on the given flow.
1065  *
1066  *	It returns zero on success, or a standard errno code on error.
1067  */
1068 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1069 		   struct flowi6 *fl6)
1070 {
1071 	*dst = NULL;
1072 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1073 }
1074 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1075 
1076 /**
1077  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1078  *	@sk: socket which provides route info
1079  *	@fl6: flow to lookup
1080  *	@final_dst: final destination address for ipsec lookup
1081  *
1082  *	This function performs a route lookup on the given flow.
1083  *
1084  *	It returns a valid dst pointer on success, or a pointer encoded
1085  *	error code.
1086  */
1087 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1088 				      const struct in6_addr *final_dst)
1089 {
1090 	struct dst_entry *dst = NULL;
1091 	int err;
1092 
1093 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1094 	if (err)
1095 		return ERR_PTR(err);
1096 	if (final_dst)
1097 		fl6->daddr = *final_dst;
1098 
1099 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1100 }
1101 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1102 
1103 /**
1104  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1105  *	@sk: socket which provides the dst cache and route info
1106  *	@fl6: flow to lookup
1107  *	@final_dst: final destination address for ipsec lookup
1108  *	@connected: whether @sk is connected or not
1109  *
1110  *	This function performs a route lookup on the given flow with the
1111  *	possibility of using the cached route in the socket if it is valid.
1112  *	It will take the socket dst lock when operating on the dst cache.
1113  *	As a result, this function can only be used in process context.
1114  *
1115  *	In addition, for a connected socket, cache the dst in the socket
1116  *	if the current cache is not valid.
1117  *
1118  *	It returns a valid dst pointer on success, or a pointer encoded
1119  *	error code.
1120  */
1121 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1122 					 const struct in6_addr *final_dst,
1123 					 bool connected)
1124 {
1125 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1126 
1127 	dst = ip6_sk_dst_check(sk, dst, fl6);
1128 	if (dst)
1129 		return dst;
1130 
1131 	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1132 	if (connected && !IS_ERR(dst))
1133 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1134 
1135 	return dst;
1136 }
1137 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1138 
1139 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1140 					       gfp_t gfp)
1141 {
1142 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1143 }
1144 
1145 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1146 						gfp_t gfp)
1147 {
1148 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1149 }
1150 
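/* Recompute the effective mtu/maxfraglen for the next fragment: only the
 * first fragment has to reserve rt->dst.header_len. This is a no-op for
 * DST_XFRM_TUNNEL routes, whose MTU was fixed at cork setup time.
 */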
1151 static void ip6_append_data_mtu(unsigned int *mtu,
1152 				int *maxfraglen,
1153 				unsigned int fragheaderlen,
1154 				struct sk_buff *skb,
1155 				struct rt6_info *rt,
1156 				unsigned int orig_mtu)
1157 {
1158 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1159 		if (!skb) {
1160 			/* first fragment, reserve header_len */
1161 			*mtu = orig_mtu - rt->dst.header_len;
1162 
1163 		} else {
1164 			/*
1165 			 * this fragment is not the first; the header
1166 			 * space is regarded as data space.
1167 			 */
1168 			*mtu = orig_mtu;
1169 		}
1170 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1171 			      + fragheaderlen - sizeof(struct frag_hdr);
1172 	}
1173 }
1174 
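/* Initialise the cork state for a sequence of ip6_append_data() calls:
 * duplicate the tx options (the caller's copy may be freed while the cork
 * is held), pin the route, and record the hop limit, traffic class and
 * fragment size.
 */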
1175 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1176 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1177 			  struct rt6_info *rt, struct flowi6 *fl6)
1178 {
1179 	struct ipv6_pinfo *np = inet6_sk(sk);
1180 	unsigned int mtu;
1181 	struct ipv6_txoptions *opt = ipc6->opt;
1182 
1183 	/*
1184 	 * setup for corking
1185 	 */
1186 	if (opt) {
1187 		if (WARN_ON(v6_cork->opt))
1188 			return -EINVAL;
1189 
1190 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1191 		if (unlikely(!v6_cork->opt))
1192 			return -ENOBUFS;
1193 
1194 		v6_cork->opt->tot_len = sizeof(*opt);
1195 		v6_cork->opt->opt_flen = opt->opt_flen;
1196 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1197 
1198 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1199 						    sk->sk_allocation);
1200 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1201 			return -ENOBUFS;
1202 
1203 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1204 						    sk->sk_allocation);
1205 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1206 			return -ENOBUFS;
1207 
1208 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1209 						   sk->sk_allocation);
1210 		if (opt->hopopt && !v6_cork->opt->hopopt)
1211 			return -ENOBUFS;
1212 
1213 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1214 						    sk->sk_allocation);
1215 		if (opt->srcrt && !v6_cork->opt->srcrt)
1216 			return -ENOBUFS;
1217 
1218 		/* need source address above --miyazawa */
1219 	}
1220 	dst_hold(&rt->dst);
1221 	cork->base.dst = &rt->dst;
1222 	cork->fl.u.ip6 = *fl6;
1223 	v6_cork->hop_limit = ipc6->hlimit;
1224 	v6_cork->tclass = ipc6->tclass;
1225 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1226 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1227 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1228 	else
1229 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1230 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1231 	if (np->frag_size < mtu) {
1232 		if (np->frag_size)
1233 			mtu = np->frag_size;
1234 	}
1235 	if (mtu < IPV6_MIN_MTU)
1236 		return -EINVAL;
1237 	cork->base.fragsize = mtu;
1238 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1239 		cork->base.flags |= IPCORK_ALLFRAG;
1240 	cork->base.length = 0;
1241 
1242 	return 0;
1243 }
1244 
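/* Workhorse shared by ip6_append_data() and ip6_make_skb(): append @length
 * bytes pulled in via @getfrag to @queue, growing the tail skb or
 * allocating new ones sized so that later fragmentation falls on 8-byte
 * boundaries.
 */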
1245 static int __ip6_append_data(struct sock *sk,
1246 			     struct flowi6 *fl6,
1247 			     struct sk_buff_head *queue,
1248 			     struct inet_cork *cork,
1249 			     struct inet6_cork *v6_cork,
1250 			     struct page_frag *pfrag,
1251 			     int getfrag(void *from, char *to, int offset,
1252 					 int len, int odd, struct sk_buff *skb),
1253 			     void *from, int length, int transhdrlen,
1254 			     unsigned int flags, struct ipcm6_cookie *ipc6,
1255 			     const struct sockcm_cookie *sockc)
1256 {
1257 	struct sk_buff *skb, *skb_prev = NULL;
1258 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1259 	int exthdrlen = 0;
1260 	int dst_exthdrlen = 0;
1261 	int hh_len;
1262 	int copy;
1263 	int err;
1264 	int offset = 0;
1265 	__u8 tx_flags = 0;
1266 	u32 tskey = 0;
1267 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1268 	struct ipv6_txoptions *opt = v6_cork->opt;
1269 	int csummode = CHECKSUM_NONE;
1270 	unsigned int maxnonfragsize, headersize;
1271 	unsigned int wmem_alloc_delta = 0;
1272 
1273 	skb = skb_peek_tail(queue);
1274 	if (!skb) {
1275 		exthdrlen = opt ? opt->opt_flen : 0;
1276 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1277 	}
1278 
1279 	mtu = cork->fragsize;
1280 	orig_mtu = mtu;
1281 
1282 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1283 
1284 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1285 			(opt ? opt->opt_nflen : 0);
1286 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1287 		     sizeof(struct frag_hdr);
1288 
1289 	headersize = sizeof(struct ipv6hdr) +
1290 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1291 		     (dst_allfrag(&rt->dst) ?
1292 		      sizeof(struct frag_hdr) : 0) +
1293 		     rt->rt6i_nfheader_len;
1294 
1295 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1296 	 * in the first fragment
1297 	 */
1298 	if (headersize + transhdrlen > mtu)
1299 		goto emsgsize;
1300 
1301 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1302 	    (sk->sk_protocol == IPPROTO_UDP ||
1303 	     sk->sk_protocol == IPPROTO_RAW)) {
1304 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1305 				sizeof(struct ipv6hdr));
1306 		goto emsgsize;
1307 	}
1308 
1309 	if (ip6_sk_ignore_df(sk))
1310 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1311 	else
1312 		maxnonfragsize = mtu;
1313 
1314 	if (cork->length + length > maxnonfragsize - headersize) {
1315 emsgsize:
1316 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1317 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1318 		return -EMSGSIZE;
1319 	}
1320 
1321 	/* CHECKSUM_PARTIAL only with no extension headers and when
1322 	 * we are not going to fragment
1323 	 */
1324 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1325 	    headersize == sizeof(struct ipv6hdr) &&
1326 	    length <= mtu - headersize &&
1327 	    !(flags & MSG_MORE) &&
1328 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1329 		csummode = CHECKSUM_PARTIAL;
1330 
1331 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1332 		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1333 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1334 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1335 			tskey = sk->sk_tskey++;
1336 	}
1337 
1338 	/*
1339 	 * Let's try using as much space as possible.
1340 	 * Use MTU if total length of the message fits into the MTU.
1341 	 * Otherwise, we need to reserve fragment header and
1342 	 * fragment alignment (= 8-15 octets, in total).
1343 	 *
1344 	 * Note that we may need to "move" the data from the tail
1345 	 * of the buffer to the new fragment when we split
1346 	 * the message.
1347 	 *
1348 	 * FIXME: It may be fragmented into multiple chunks
1349 	 *        at once if non-fragmentable extension headers
1350 	 *        are too large.
1351 	 * --yoshfuji
1352 	 */
1353 
1354 	cork->length += length;
1355 	if (!skb)
1356 		goto alloc_new_skb;
1357 
1358 	while (length > 0) {
1359 		/* Check if the remaining data fits into current packet. */
1360 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1361 		if (copy < length)
1362 			copy = maxfraglen - skb->len;
1363 
1364 		if (copy <= 0) {
1365 			char *data;
1366 			unsigned int datalen;
1367 			unsigned int fraglen;
1368 			unsigned int fraggap;
1369 			unsigned int alloclen;
1370 alloc_new_skb:
1371 			/* There's no room in the current skb */
1372 			if (skb)
1373 				fraggap = skb->len - maxfraglen;
1374 			else
1375 				fraggap = 0;
1376 			/* update mtu and maxfraglen if necessary */
1377 			if (!skb || !skb_prev)
1378 				ip6_append_data_mtu(&mtu, &maxfraglen,
1379 						    fragheaderlen, skb, rt,
1380 						    orig_mtu);
1381 
1382 			skb_prev = skb;
1383 
1384 			/*
1385 			 * If remaining data exceeds the mtu,
1386 			 * we know we need more fragment(s).
1387 			 */
1388 			datalen = length + fraggap;
1389 
1390 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1391 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1392 			if ((flags & MSG_MORE) &&
1393 			    !(rt->dst.dev->features&NETIF_F_SG))
1394 				alloclen = mtu;
1395 			else
1396 				alloclen = datalen + fragheaderlen;
1397 
1398 			alloclen += dst_exthdrlen;
1399 
1400 			if (datalen != length + fraggap) {
1401 				/*
1402 				 * this is not the last fragment; the trailer
1403 				 * space is regarded as data space.
1404 				 */
1405 				datalen += rt->dst.trailer_len;
1406 			}
1407 
1408 			alloclen += rt->dst.trailer_len;
1409 			fraglen = datalen + fragheaderlen;
1410 
1411 			/*
1412 			 * We just reserve space for the fragment header.
1413 			 * Note: this may be an overallocation if the message
1414 			 * (without MSG_MORE) fits into the MTU.
1415 			 */
1416 			alloclen += sizeof(struct frag_hdr);
1417 
1418 			copy = datalen - transhdrlen - fraggap;
1419 			if (copy < 0) {
1420 				err = -EINVAL;
1421 				goto error;
1422 			}
1423 			if (transhdrlen) {
1424 				skb = sock_alloc_send_skb(sk,
1425 						alloclen + hh_len,
1426 						(flags & MSG_DONTWAIT), &err);
1427 			} else {
1428 				skb = NULL;
1429 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1430 				    2 * sk->sk_sndbuf)
1431 					skb = alloc_skb(alloclen + hh_len,
1432 							sk->sk_allocation);
1433 				if (unlikely(!skb))
1434 					err = -ENOBUFS;
1435 			}
1436 			if (!skb)
1437 				goto error;
1438 			/*
1439 			 *	Fill in the control structures
1440 			 */
1441 			skb->protocol = htons(ETH_P_IPV6);
1442 			skb->ip_summed = csummode;
1443 			skb->csum = 0;
1444 			/* reserve for fragmentation and ipsec header */
1445 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1446 				    dst_exthdrlen);
1447 
1448 			/* Only the initial fragment is time stamped */
1449 			skb_shinfo(skb)->tx_flags = tx_flags;
1450 			tx_flags = 0;
1451 			skb_shinfo(skb)->tskey = tskey;
1452 			tskey = 0;
1453 
1454 			/*
1455 			 *	Find where to start putting bytes
1456 			 */
1457 			data = skb_put(skb, fraglen);
1458 			skb_set_network_header(skb, exthdrlen);
1459 			data += fragheaderlen;
1460 			skb->transport_header = (skb->network_header +
1461 						 fragheaderlen);
1462 			if (fraggap) {
1463 				skb->csum = skb_copy_and_csum_bits(
1464 					skb_prev, maxfraglen,
1465 					data + transhdrlen, fraggap, 0);
1466 				skb_prev->csum = csum_sub(skb_prev->csum,
1467 							  skb->csum);
1468 				data += fraggap;
1469 				pskb_trim_unique(skb_prev, maxfraglen);
1470 			}
1471 			if (copy > 0 &&
1472 			    getfrag(from, data + transhdrlen, offset,
1473 				    copy, fraggap, skb) < 0) {
1474 				err = -EFAULT;
1475 				kfree_skb(skb);
1476 				goto error;
1477 			}
1478 
1479 			offset += copy;
1480 			length -= datalen - fraggap;
1481 			transhdrlen = 0;
1482 			exthdrlen = 0;
1483 			dst_exthdrlen = 0;
1484 
1485 			if ((flags & MSG_CONFIRM) && !skb_prev)
1486 				skb_set_dst_pending_confirm(skb, 1);
1487 
1488 			/*
1489 			 * Put the packet on the pending queue
1490 			 */
1491 			if (!skb->destructor) {
1492 				skb->destructor = sock_wfree;
1493 				skb->sk = sk;
1494 				wmem_alloc_delta += skb->truesize;
1495 			}
1496 			__skb_queue_tail(queue, skb);
1497 			continue;
1498 		}
1499 
1500 		if (copy > length)
1501 			copy = length;
1502 
1503 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1504 			unsigned int off;
1505 
1506 			off = skb->len;
1507 			if (getfrag(from, skb_put(skb, copy),
1508 						offset, copy, off, skb) < 0) {
1509 				__skb_trim(skb, off);
1510 				err = -EFAULT;
1511 				goto error;
1512 			}
1513 		} else {
1514 			int i = skb_shinfo(skb)->nr_frags;
1515 
1516 			err = -ENOMEM;
1517 			if (!sk_page_frag_refill(sk, pfrag))
1518 				goto error;
1519 
1520 			if (!skb_can_coalesce(skb, i, pfrag->page,
1521 					      pfrag->offset)) {
1522 				err = -EMSGSIZE;
1523 				if (i == MAX_SKB_FRAGS)
1524 					goto error;
1525 
1526 				__skb_fill_page_desc(skb, i, pfrag->page,
1527 						     pfrag->offset, 0);
1528 				skb_shinfo(skb)->nr_frags = ++i;
1529 				get_page(pfrag->page);
1530 			}
1531 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1532 			if (getfrag(from,
1533 				    page_address(pfrag->page) + pfrag->offset,
1534 				    offset, copy, skb->len, skb) < 0)
1535 				goto error_efault;
1536 
1537 			pfrag->offset += copy;
1538 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1539 			skb->len += copy;
1540 			skb->data_len += copy;
1541 			skb->truesize += copy;
1542 			wmem_alloc_delta += copy;
1543 		}
1544 		offset += copy;
1545 		length -= copy;
1546 	}
1547 
1548 	if (wmem_alloc_delta)
1549 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1550 	return 0;
1551 
1552 error_efault:
1553 	err = -EFAULT;
1554 error:
1555 	cork->length -= length;
1556 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1557 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1558 	return err;
1559 }
1560 
1561 int ip6_append_data(struct sock *sk,
1562 		    int getfrag(void *from, char *to, int offset, int len,
1563 				int odd, struct sk_buff *skb),
1564 		    void *from, int length, int transhdrlen,
1565 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1566 		    struct rt6_info *rt, unsigned int flags,
1567 		    const struct sockcm_cookie *sockc)
1568 {
1569 	struct inet_sock *inet = inet_sk(sk);
1570 	struct ipv6_pinfo *np = inet6_sk(sk);
1571 	int exthdrlen;
1572 	int err;
1573 
1574 	if (flags&MSG_PROBE)
1575 		return 0;
1576 	if (skb_queue_empty(&sk->sk_write_queue)) {
1577 		/*
1578 		 * setup for corking
1579 		 */
1580 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1581 				     ipc6, rt, fl6);
1582 		if (err)
1583 			return err;
1584 
1585 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1586 		length += exthdrlen;
1587 		transhdrlen += exthdrlen;
1588 	} else {
1589 		fl6 = &inet->cork.fl.u.ip6;
1590 		transhdrlen = 0;
1591 	}
1592 
1593 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1594 				 &np->cork, sk_page_frag(sk), getfrag,
1595 				 from, length, transhdrlen, flags, ipc6, sockc);
1596 }
1597 EXPORT_SYMBOL_GPL(ip6_append_data);
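
/* A typical ip6_append_data() caller pattern, as a rough sketch only (it
 * assumes the caller - e.g. udpv6_sendmsg() - already set up fl6, rt, ipc6
 * and sockc, and holds the socket lock; error handling is trimmed):
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, sizeof(struct udphdr),
 *			      &ipc6, &fl6, rt, msg->msg_flags, &sockc);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 */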
1598 
1599 static void ip6_cork_release(struct inet_cork_full *cork,
1600 			     struct inet6_cork *v6_cork)
1601 {
1602 	if (v6_cork->opt) {
1603 		kfree(v6_cork->opt->dst0opt);
1604 		kfree(v6_cork->opt->dst1opt);
1605 		kfree(v6_cork->opt->hopopt);
1606 		kfree(v6_cork->opt->srcrt);
1607 		kfree(v6_cork->opt);
1608 		v6_cork->opt = NULL;
1609 	}
1610 
1611 	if (cork->base.dst) {
1612 		dst_release(cork->base.dst);
1613 		cork->base.dst = NULL;
1614 		cork->base.flags &= ~IPCORK_ALLFRAG;
1615 	}
1616 	memset(&cork->fl, 0, sizeof(cork->fl));
1617 }
1618 
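/* Collapse the queued skbs built by __ip6_append_data() into one packet:
 * chain them on the head skb's frag_list, push the extension headers and
 * the IPv6 header, and attach the cached route before releasing the cork.
 */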
1619 struct sk_buff *__ip6_make_skb(struct sock *sk,
1620 			       struct sk_buff_head *queue,
1621 			       struct inet_cork_full *cork,
1622 			       struct inet6_cork *v6_cork)
1623 {
1624 	struct sk_buff *skb, *tmp_skb;
1625 	struct sk_buff **tail_skb;
1626 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1627 	struct ipv6_pinfo *np = inet6_sk(sk);
1628 	struct net *net = sock_net(sk);
1629 	struct ipv6hdr *hdr;
1630 	struct ipv6_txoptions *opt = v6_cork->opt;
1631 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1632 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1633 	unsigned char proto = fl6->flowi6_proto;
1634 
1635 	skb = __skb_dequeue(queue);
1636 	if (!skb)
1637 		goto out;
1638 	tail_skb = &(skb_shinfo(skb)->frag_list);
1639 
1640 	/* move skb->data to ip header from ext header */
1641 	if (skb->data < skb_network_header(skb))
1642 		__skb_pull(skb, skb_network_offset(skb));
1643 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1644 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1645 		*tail_skb = tmp_skb;
1646 		tail_skb = &(tmp_skb->next);
1647 		skb->len += tmp_skb->len;
1648 		skb->data_len += tmp_skb->len;
1649 		skb->truesize += tmp_skb->truesize;
1650 		tmp_skb->destructor = NULL;
1651 		tmp_skb->sk = NULL;
1652 	}
1653 
1654 	/* Allow local fragmentation. */
1655 	skb->ignore_df = ip6_sk_ignore_df(sk);
1656 
1657 	*final_dst = fl6->daddr;
1658 	__skb_pull(skb, skb_network_header_len(skb));
1659 	if (opt && opt->opt_flen)
1660 		ipv6_push_frag_opts(skb, opt, &proto);
1661 	if (opt && opt->opt_nflen)
1662 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1663 
1664 	skb_push(skb, sizeof(struct ipv6hdr));
1665 	skb_reset_network_header(skb);
1666 	hdr = ipv6_hdr(skb);
1667 
1668 	ip6_flow_hdr(hdr, v6_cork->tclass,
1669 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1670 					ip6_autoflowlabel(net, np), fl6));
1671 	hdr->hop_limit = v6_cork->hop_limit;
1672 	hdr->nexthdr = proto;
1673 	hdr->saddr = fl6->saddr;
1674 	hdr->daddr = *final_dst;
1675 
1676 	skb->priority = sk->sk_priority;
1677 	skb->mark = sk->sk_mark;
1678 
1679 	skb_dst_set(skb, dst_clone(&rt->dst));
1680 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1681 	if (proto == IPPROTO_ICMPV6) {
1682 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1683 
1684 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1685 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1686 	}
1687 
1688 	ip6_cork_release(cork, v6_cork);
1689 out:
1690 	return skb;
1691 }
1692 
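/* Transmit a packet produced by __ip6_make_skb() via ip6_local_out(),
 * converting congestion notification into a real error for accounting.
 */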
1693 int ip6_send_skb(struct sk_buff *skb)
1694 {
1695 	struct net *net = sock_net(skb->sk);
1696 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1697 	int err;
1698 
1699 	err = ip6_local_out(net, skb->sk, skb);
1700 	if (err) {
1701 		if (err > 0)
1702 			err = net_xmit_errno(err);
1703 		if (err)
1704 			IP6_INC_STATS(net, rt->rt6i_idev,
1705 				      IPSTATS_MIB_OUTDISCARDS);
1706 	}
1707 
1708 	return err;
1709 }
1710 
1711 int ip6_push_pending_frames(struct sock *sk)
1712 {
1713 	struct sk_buff *skb;
1714 
1715 	skb = ip6_finish_skb(sk);
1716 	if (!skb)
1717 		return 0;
1718 
1719 	return ip6_send_skb(skb);
1720 }
1721 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1722 
1723 static void __ip6_flush_pending_frames(struct sock *sk,
1724 				       struct sk_buff_head *queue,
1725 				       struct inet_cork_full *cork,
1726 				       struct inet6_cork *v6_cork)
1727 {
1728 	struct sk_buff *skb;
1729 
1730 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1731 		if (skb_dst(skb))
1732 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1733 				      IPSTATS_MIB_OUTDISCARDS);
1734 		kfree_skb(skb);
1735 	}
1736 
1737 	ip6_cork_release(cork, v6_cork);
1738 }
1739 
1740 void ip6_flush_pending_frames(struct sock *sk)
1741 {
1742 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1743 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1744 }
1745 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1746 
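/* Uncorked counterpart of ip6_append_data() + ip6_push_pending_frames():
 * build the complete packet on a private queue and return it to the
 * caller instead of sending it.
 */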
1747 struct sk_buff *ip6_make_skb(struct sock *sk,
1748 			     int getfrag(void *from, char *to, int offset,
1749 					 int len, int odd, struct sk_buff *skb),
1750 			     void *from, int length, int transhdrlen,
1751 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1752 			     struct rt6_info *rt, unsigned int flags,
1753 			     const struct sockcm_cookie *sockc)
1754 {
1755 	struct inet_cork_full cork;
1756 	struct inet6_cork v6_cork;
1757 	struct sk_buff_head queue;
1758 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1759 	int err;
1760 
1761 	if (flags & MSG_PROBE)
1762 		return NULL;
1763 
1764 	__skb_queue_head_init(&queue);
1765 
1766 	cork.base.flags = 0;
1767 	cork.base.addr = 0;
1768 	cork.base.opt = NULL;
1769 	cork.base.dst = NULL;
1770 	v6_cork.opt = NULL;
1771 	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1772 	if (err) {
1773 		ip6_cork_release(&cork, &v6_cork);
1774 		return ERR_PTR(err);
1775 	}
1776 	if (ipc6->dontfrag < 0)
1777 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1778 
1779 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1780 				&current->task_frag, getfrag, from,
1781 				length + exthdrlen, transhdrlen + exthdrlen,
1782 				flags, ipc6, sockc);
1783 	if (err) {
1784 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1785 		return ERR_PTR(err);
1786 	}
1787 
1788 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1789 }
1790