/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
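
/* Explanatory note (added commentary, not from the original source): the
 * block above follows the common "lookup, else create" neighbour pattern
 * under rcu_read_lock_bh(). A minimal sketch of the same idiom, with all
 * error handling elided, is:
 *
 *	nexthop = rt6_nexthop(rt, &ipv6_hdr(skb)->daddr); // gw or daddr
 *	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);  // fast path
 *	if (!neigh)
 *		neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
 *	neigh_output(neigh, skb);  // xmits now, or queues while ND resolves
 */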

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : the socket lock is not held for SYNACK packets, but the skb might
 * be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
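
/* Usage sketch (hypothetical caller, modelled on how stream protocols such
 * as TCP use this export; the helper name example_queue_xmit() is not part
 * of the kernel): ip6_xmit() expects a fully built transport packet with a
 * routed dst already attached to the skb.
 *
 *	static int example_queue_xmit(struct sock *sk, struct sk_buff *skb,
 *				      struct flowi6 *fl6,
 *				      struct ipv6_txoptions *opt)
 *	{
 *		return ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
 *				inet6_sk(sk)->tclass);
 *	}
 *
 * A negative return is a local error and the skb has been freed.
 */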

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* A reaction to a unicast neighbour discovery
			 * message destined to the proxied address is
			 * required, so pass it to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}
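
/* Worked example (added for illustration): with mtu == 1280, a 1500 byte
 * packet is reported "too big" unless (a) it was reassembled by conntrack
 * from fragments no larger than the mtu (frag_max_size <= 1280 together
 * with ignore_df), in which case it may be forwarded and re-fragmented, or
 * (b) it is GSO and every resulting segment fits within 1280 bytes at the
 * network layer. Otherwise ip6_forward() sends ICMPV6_PKT_TOOBIG.
 */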

int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do not process RA packets; we push them to user level
	 *	AS IS, without any guarantee that the application will be
	 *	able to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP, we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb_mark_not_on_list(skb);
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0)	{
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
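
/* Note on the frag_off arithmetic above (added commentary): the fragment
 * header carries the offset in 8-octet units in the high 13 bits of
 * frag_off, with the low 3 bits reserved for flags such as IP6_MF.
 * Since every non-final fragment payload is trimmed to a multiple of 8
 * bytes, the byte offset accumulated in 'offset' always equals
 * offset_units << 3, so it can be stored with a plain htons(offset) and
 * OR'ed with htons(IP6_MF). For example, a fragment starting 1232 bytes
 * into the fragmentable part encodes 1232 = 154 << 3, i.e. an offset
 * field of 154 units.
 */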

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the unconnected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now,
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour entry
	 * that is in the INCOMPLETE state and the src address from
	 * the flow is marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the dst entry of
	 * the nexthop router.
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
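
/* Usage sketch (hypothetical caller, not part of this file):
 *
 *	struct flowi6 fl6 = {
 *		.flowi6_proto = IPPROTO_TCP,
 *		.daddr = *daddr,
 *		.saddr = *saddr,	// may be in6addr_any
 *	};
 *	struct dst_entry *dst;
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);	// no route, or xfrm failure
 *
 * On success the caller owns a reference on the returned dst and typically
 * attaches it to an skb or caches it on the socket.
 */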

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
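
/* Design note (inferred from the code above, not original commentary):
 * connected datagram sockets benefit most from this helper, because
 * ip6_sk_dst_check() can validate and reuse the socket's cached dst
 * instead of doing a full route lookup per packet, and a freshly
 * looked-up dst is written back to the socket only when @connected is
 * true.
 */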

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first one; the
			 * header space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above --miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment; the
				 * trailer space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
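
/* Usage sketch (hypothetical, modelled on the datagram protocols that use
 * this API): ip6_append_data() is called under lock_sock(), possibly many
 * times with MSG_MORE, and the queued data is then pushed or flushed:
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
 *			      transhdrlen, &ipc6, &fl6, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 */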

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
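
/* Usage sketch (hypothetical): ip6_make_skb() is the lockless counterpart
 * of the append/push pair above; it builds the whole datagram on a private
 * queue and returns a single skb that the caller then transmits:
 *
 *	skb = ip6_make_skb(sk, getfrag, msg, len, transhdrlen,
 *			   &ipc6, &fl6, rt, msg->msg_flags, &cork);
 *	if (IS_ERR_OR_NULL(skb))
 *		return PTR_ERR_OR_ZERO(skb);	// NULL means MSG_PROBE
 *	err = ip6_send_skb(skb);
 */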