xref: /linux/net/ipv6/ip6_output.c (revision c6ed444fd6fffaaf2e3857d926ed18bf3df81e8e)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61 
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64 	struct dst_entry *dst = skb_dst(skb);
65 	struct net_device *dev = dst->dev;
66 	struct neighbour *neigh;
67 	struct in6_addr *nexthop;
68 	int ret;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_is_socket(net, skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					net, sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(net, idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97 
98 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99 		    IPV6_ADDR_SCOPE_NODELOCAL &&
100 		    !(dev->flags & IFF_LOOPBACK)) {
101 			kfree_skb(skb);
102 			return 0;
103 		}
104 	}
105 
106 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107 		int res = lwtunnel_xmit(skb);
108 
109 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110 			return res;
111 	}
112 
113 	rcu_read_lock_bh();
114 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116 	if (unlikely(!neigh))
117 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118 	if (!IS_ERR(neigh)) {
119 		sock_confirm_neigh(skb, neigh);
120 		ret = neigh_output(neigh, skb);
121 		rcu_read_unlock_bh();
122 		return ret;
123 	}
124 	rcu_read_unlock_bh();
125 
126 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127 	kfree_skb(skb);
128 	return -EINVAL;
129 }
130 
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133 	int ret;
134 
135 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136 	if (ret) {
137 		kfree_skb(skb);
138 		return ret;
139 	}
140 
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142 	/* Policy lookup after SNAT yielded a new policy */
143 	if (skb_dst(skb)->xfrm) {
144 		IPCB(skb)->flags |= IPSKB_REROUTED;
145 		return dst_output(net, sk, skb);
146 	}
147 #endif
148 
149 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150 	    dst_allfrag(skb_dst(skb)) ||
151 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
153 	else
154 		return ip6_finish_output2(net, sk, skb);
155 }
156 
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159 	struct net_device *dev = skb_dst(skb)->dev;
160 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 
162 	skb->protocol = htons(ETH_P_IPV6);
163 	skb->dev = dev;
164 
165 	if (unlikely(idev->cnf.disable_ipv6)) {
166 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167 		kfree_skb(skb);
168 		return 0;
169 	}
170 
171 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172 			    net, sk, skb, NULL, dev,
173 			    ip6_finish_output,
174 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176 
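/*
 * Editorial sketch (not upstream code): for locally generated unicast
 * traffic the functions above are reached through the protocol-
 * independent dst hooks, roughly:
 *
 *	ip6_xmit() / ip6_send_skb()
 *	  -> NF_INET_LOCAL_OUT -> dst_output()
 *	       -> ip6_output()
 *	            -> NF_INET_POST_ROUTING -> ip6_finish_output()
 *	                 -> ip6_fragment() if needed
 *	                      -> ip6_finish_output2() -> neigh_output()
 */
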
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179 	if (!np->autoflowlabel_set)
180 		return ip6_default_np_autolabel(net);
181 	else
182 		return np->autoflowlabel;
183 }
184 
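/*
 * Illustrative usage (editorial): ip6_autoflowlabel() above lets a
 * per-socket IPV6_AUTOFLOWLABEL setting, when present, override the
 * net.ipv6.auto_flowlabels sysctl default.  A hypothetical userspace
 * caller opts in with:
 *
 *	int on = 1;
 *	setsockopt(fd, IPPROTO_IPV6, IPV6_AUTOFLOWLABEL, &on, sizeof(on));
 */
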
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note: the socket lock is not held for SYNACK packets, but the skb
188  * might be modified by calls to skb_set_owner_w() and ipv6_local_error(),
189  * which use proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192 	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194 	struct net *net = sock_net(sk);
195 	const struct ipv6_pinfo *np = inet6_sk(sk);
196 	struct in6_addr *first_hop = &fl6->daddr;
197 	struct dst_entry *dst = skb_dst(skb);
198 	struct ipv6hdr *hdr;
199 	u8  proto = fl6->flowi6_proto;
200 	int seg_len = skb->len;
201 	int hlimit = -1;
202 	u32 mtu;
203 
204 	if (opt) {
205 		unsigned int head_room;
206 
207 		/* First: extension headers may take lots of space (~8K for
208 		   now); MAX_HEADER is not enough.
209 		 */
210 		head_room = opt->opt_nflen + opt->opt_flen;
211 		seg_len += head_room;
212 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
213 
214 		if (skb_headroom(skb) < head_room) {
215 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
216 			if (!skb2) {
217 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
218 					      IPSTATS_MIB_OUTDISCARDS);
219 				kfree_skb(skb);
220 				return -ENOBUFS;
221 			}
222 			consume_skb(skb);
223 			skb = skb2;
224 			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
225 			 * it is safe to call in our context (socket lock not held)
226 			 */
227 			skb_set_owner_w(skb, (struct sock *)sk);
228 		}
229 		if (opt->opt_flen)
230 			ipv6_push_frag_opts(skb, opt, &proto);
231 		if (opt->opt_nflen)
232 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
233 					     &fl6->saddr);
234 	}
235 
236 	skb_push(skb, sizeof(struct ipv6hdr));
237 	skb_reset_network_header(skb);
238 	hdr = ipv6_hdr(skb);
239 
240 	/*
241 	 *	Fill in the IPv6 header
242 	 */
243 	if (np)
244 		hlimit = np->hop_limit;
245 	if (hlimit < 0)
246 		hlimit = ip6_dst_hoplimit(dst);
247 
248 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
249 				ip6_autoflowlabel(net, np), fl6));
250 
251 	hdr->payload_len = htons(seg_len);
252 	hdr->nexthdr = proto;
253 	hdr->hop_limit = hlimit;
254 
255 	hdr->saddr = fl6->saddr;
256 	hdr->daddr = *first_hop;
257 
258 	skb->protocol = htons(ETH_P_IPV6);
259 	skb->priority = sk->sk_priority;
260 	skb->mark = mark;
261 
262 	mtu = dst_mtu(dst);
263 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
264 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
265 			      IPSTATS_MIB_OUT, skb->len);
266 
267 		/* if egress device is enslaved to an L3 master device pass the
268 		 * skb to its handler for processing
269 		 */
270 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
271 		if (unlikely(!skb))
272 			return 0;
273 
274 		/* hooks should never assume socket lock is held.
275 		 * we promote our socket to non const
276 		 */
277 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
278 			       net, (struct sock *)sk, skb, NULL, dst->dev,
279 			       dst_output);
280 	}
281 
282 	skb->dev = dst->dev;
283 	/* ipv6_local_error() does not require socket lock,
284 	 * we promote our socket to non const
285 	 */
286 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
287 
288 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
289 	kfree_skb(skb);
290 	return -EMSGSIZE;
291 }
292 EXPORT_SYMBOL(ip6_xmit);
293 
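/*
 * Illustrative caller sketch (editorial, not upstream code; option and
 * refcount handling elided): a connection-oriented protocol typically
 * resolves the route once, then transmits each skb with its cached flow:
 *
 *	struct flowi6 fl6 = { .flowi6_proto = IPPROTO_TCP,
 *			      .daddr = sk->sk_v6_daddr,
 *			      .saddr = np->saddr };
 *	skb_dst_set(skb, dst_clone(dst));
 *	err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
 */
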
294 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
295 {
296 	struct ip6_ra_chain *ra;
297 	struct sock *last = NULL;
298 
299 	read_lock(&ip6_ra_lock);
300 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
301 		struct sock *sk = ra->sk;
302 		if (sk && ra->sel == sel &&
303 		    (!sk->sk_bound_dev_if ||
304 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
305 			if (last) {
306 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307 				if (skb2)
308 					rawv6_rcv(last, skb2);
309 			}
310 			last = sk;
311 		}
312 	}
313 
314 	if (last) {
315 		rawv6_rcv(last, skb);
316 		read_unlock(&ip6_ra_lock);
317 		return 1;
318 	}
319 	read_unlock(&ip6_ra_lock);
320 	return 0;
321 }
322 
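/*
 * Editorial note: ip6_call_ra_chain() above uses the classic "deliver to
 * the previous match, keep the original for the last" idiom, so that N
 * matching sockets cost only N - 1 clones.  In sketch form:
 *
 *	for each matching sk:
 *		if (last)
 *			deliver(last, skb_clone(skb, GFP_ATOMIC));
 *		last = sk;
 *	if (last)
 *		deliver(last, skb);	// original skb, no clone needed
 */
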
323 static int ip6_forward_proxy_check(struct sk_buff *skb)
324 {
325 	struct ipv6hdr *hdr = ipv6_hdr(skb);
326 	u8 nexthdr = hdr->nexthdr;
327 	__be16 frag_off;
328 	int offset;
329 
330 	if (ipv6_ext_hdr(nexthdr)) {
331 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
332 		if (offset < 0)
333 			return 0;
334 	} else
335 		offset = sizeof(struct ipv6hdr);
336 
337 	if (nexthdr == IPPROTO_ICMPV6) {
338 		struct icmp6hdr *icmp6;
339 
340 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
341 					 offset + 1 - skb->data)))
342 			return 0;
343 
344 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
345 
346 		switch (icmp6->icmp6_type) {
347 		case NDISC_ROUTER_SOLICITATION:
348 		case NDISC_ROUTER_ADVERTISEMENT:
349 		case NDISC_NEIGHBOUR_SOLICITATION:
350 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
351 		case NDISC_REDIRECT:
352 			/* Pass unicast neighbour discovery messages destined
353 			 * to the proxied address to the input function so it
354 			 * can react to them.
355 			 */
356 			return 1;
357 		default:
358 			break;
359 		}
360 	}
361 
362 	/*
363 	 * The proxying router can't forward traffic sent to a link-local
364 	 * address, so signal the sender and discard the packet. This
365 	 * behavior is clarified by the MIPv6 specification.
366 	 */
367 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
368 		dst_link_failure(skb);
369 		return -1;
370 	}
371 
372 	return 0;
373 }
374 
375 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
376 				     struct sk_buff *skb)
377 {
378 	struct dst_entry *dst = skb_dst(skb);
379 
380 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
381 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
382 
383 	return dst_output(net, sk, skb);
384 }
385 
386 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
387 {
388 	if (skb->len <= mtu)
389 		return false;
390 
391 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
392 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
393 		return true;
394 
395 	if (skb->ignore_df)
396 		return false;
397 
398 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
399 		return false;
400 
401 	return true;
402 }
403 
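/*
 * Worked example (editorial): with mtu == 1280 and skb->len == 1500,
 * ip6_pkt_too_big() returns true for an ordinary forwarded packet.  It
 * still returns false when conntrack defrag saw only fragments that fit
 * (frag_max_size set but not above mtu, ignore_df set), or for a GSO skb
 * whose resegmented packets each fit the network-layer MTU.
 */
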
404 int ip6_forward(struct sk_buff *skb)
405 {
406 	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
407 	struct dst_entry *dst = skb_dst(skb);
408 	struct ipv6hdr *hdr = ipv6_hdr(skb);
409 	struct inet6_skb_parm *opt = IP6CB(skb);
410 	struct net *net = dev_net(dst->dev);
411 	u32 mtu;
412 
413 	if (net->ipv6.devconf_all->forwarding == 0)
414 		goto error;
415 
416 	if (skb->pkt_type != PACKET_HOST)
417 		goto drop;
418 
419 	if (unlikely(skb->sk))
420 		goto drop;
421 
422 	if (skb_warn_if_lro(skb))
423 		goto drop;
424 
425 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
426 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
427 		goto drop;
428 	}
429 
430 	skb_forward_csum(skb);
431 
432 	/*
433 	 *	We do NOT do any processing on
434 	 *	RA packets; we push them to user level AS IS
435 	 *	without any guarantee that the application will be able
436 	 *	to interpret them. The reason is that we
437 	 *	cannot do anything clever here.
438 	 *
439 	 *	We are not the end node, so if the packet contains
440 	 *	AH/ESP, we cannot do anything with it.
441 	 *	Defragmentation would also be a mistake; RA packets
442 	 *	cannot be fragmented, because there is no guarantee
443 	 *	that different fragments will travel along one path. --ANK
444 	 */
445 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
446 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
447 			return 0;
448 	}
449 
450 	/*
451 	 *	check and decrement the hop limit
452 	 */
453 	if (hdr->hop_limit <= 1) {
454 		/* Force the OUTPUT device to be used for the source address */
455 		skb->dev = dst->dev;
456 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
457 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
458 
459 		kfree_skb(skb);
460 		return -ETIMEDOUT;
461 	}
462 
463 	/* XXX: idev->cnf.proxy_ndp? */
464 	if (net->ipv6.devconf_all->proxy_ndp &&
465 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
466 		int proxied = ip6_forward_proxy_check(skb);
467 		if (proxied > 0)
468 			return ip6_input(skb);
469 		else if (proxied < 0) {
470 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
471 			goto drop;
472 		}
473 	}
474 
475 	if (!xfrm6_route_forward(skb)) {
476 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
477 		goto drop;
478 	}
479 	dst = skb_dst(skb);
480 
481 	/* IPv6 specs say nothing about it, but it is clear that we cannot
482 	   send redirects to source-routed frames.
483 	   We don't send redirects to frames decapsulated from IPsec.
484 	 */
485 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
486 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
487 		struct in6_addr *target = NULL;
488 		struct inet_peer *peer;
489 		struct rt6_info *rt;
490 
491 		/*
492 		 *	incoming and outgoing devices are the same;
493 		 *	send a redirect.
494 		 */
495 
496 		rt = (struct rt6_info *) dst;
497 		if (rt->rt6i_flags & RTF_GATEWAY)
498 			target = &rt->rt6i_gateway;
499 		else
500 			target = &hdr->daddr;
501 
502 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
503 
504 		/* Limit redirects both by destination (here)
505 		   and by source (inside ndisc_send_redirect)
506 		 */
507 		if (inet_peer_xrlim_allow(peer, 1*HZ))
508 			ndisc_send_redirect(skb, target);
509 		if (peer)
510 			inet_putpeer(peer);
511 	} else {
512 		int addrtype = ipv6_addr_type(&hdr->saddr);
513 
514 		/* This check is security critical. */
515 		if (addrtype == IPV6_ADDR_ANY ||
516 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
517 			goto error;
518 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
519 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
520 				    ICMPV6_NOT_NEIGHBOUR, 0);
521 			goto error;
522 		}
523 	}
524 
525 	mtu = ip6_dst_mtu_forward(dst);
526 	if (mtu < IPV6_MIN_MTU)
527 		mtu = IPV6_MIN_MTU;
528 
529 	if (ip6_pkt_too_big(skb, mtu)) {
530 		/* Again, force the OUTPUT device to be used for the source address */
531 		skb->dev = dst->dev;
532 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
533 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
534 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
535 				IPSTATS_MIB_FRAGFAILS);
536 		kfree_skb(skb);
537 		return -EMSGSIZE;
538 	}
539 
540 	if (skb_cow(skb, dst->dev->hard_header_len)) {
541 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
542 				IPSTATS_MIB_OUTDISCARDS);
543 		goto drop;
544 	}
545 
546 	hdr = ipv6_hdr(skb);
547 
548 	/* Decrementing the hop limit is delayed until after the skb COW */
549 
550 	hdr->hop_limit--;
551 
552 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
553 		       net, NULL, skb, skb->dev, dst->dev,
554 		       ip6_forward_finish);
555 
556 error:
557 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
558 drop:
559 	kfree_skb(skb);
560 	return -EINVAL;
561 }
562 
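/*
 * Editorial note: ip6_forward() above is gated by the
 * net.ipv6.conf.all.forwarding sysctl (devconf_all->forwarding) and
 * consults net.ipv6.conf.all.proxy_ndp for proxied neighbours, e.g.:
 *
 *	# sysctl -w net.ipv6.conf.all.forwarding=1
 *	# sysctl -w net.ipv6.conf.all.proxy_ndp=1
 */
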
563 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
564 {
565 	to->pkt_type = from->pkt_type;
566 	to->priority = from->priority;
567 	to->protocol = from->protocol;
568 	skb_dst_drop(to);
569 	skb_dst_set(to, dst_clone(skb_dst(from)));
570 	to->dev = from->dev;
571 	to->mark = from->mark;
572 
573 	skb_copy_hash(to, from);
574 
575 #ifdef CONFIG_NET_SCHED
576 	to->tc_index = from->tc_index;
577 #endif
578 	nf_copy(to, from);
579 	skb_copy_secmark(to, from);
580 }
581 
582 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
583 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
584 {
585 	struct sk_buff *frag;
586 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
587 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
588 				inet6_sk(skb->sk) : NULL;
589 	struct ipv6hdr *tmp_hdr;
590 	struct frag_hdr *fh;
591 	unsigned int mtu, hlen, left, len;
592 	int hroom, troom;
593 	__be32 frag_id;
594 	int ptr, offset = 0, err = 0;
595 	u8 *prevhdr, nexthdr = 0;
596 
597 	err = ip6_find_1stfragopt(skb, &prevhdr);
598 	if (err < 0)
599 		goto fail;
600 	hlen = err;
601 	nexthdr = *prevhdr;
602 
603 	mtu = ip6_skb_dst_mtu(skb);
604 
605 	/* We must not fragment if the socket is set to force MTU discovery
606 	 * or if the skb is not generated by a local socket.
607 	 */
608 	if (unlikely(!skb->ignore_df && skb->len > mtu))
609 		goto fail_toobig;
610 
611 	if (IP6CB(skb)->frag_max_size) {
612 		if (IP6CB(skb)->frag_max_size > mtu)
613 			goto fail_toobig;
614 
615 		/* don't send fragments larger than what we received */
616 		mtu = IP6CB(skb)->frag_max_size;
617 		if (mtu < IPV6_MIN_MTU)
618 			mtu = IPV6_MIN_MTU;
619 	}
620 
621 	if (np && np->frag_size < mtu) {
622 		if (np->frag_size)
623 			mtu = np->frag_size;
624 	}
625 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
626 		goto fail_toobig;
627 	mtu -= hlen + sizeof(struct frag_hdr);
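	/*
	 * Worked example (editorial): with mtu == 1500 and a bare IPv6
	 * header (hlen == 40), the check above requires at least
	 * 40 + 8 + 8 octets, and the per-fragment payload budget becomes
	 * 1500 - 40 - 8 = 1452 octets; the slow path below additionally
	 * rounds non-final fragments down to a multiple of 8 (1448).
	 */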
628 
629 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
630 				    &ipv6_hdr(skb)->saddr);
631 
632 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
633 	    (err = skb_checksum_help(skb)))
634 		goto fail;
635 
636 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
637 	if (skb_has_frag_list(skb)) {
638 		unsigned int first_len = skb_pagelen(skb);
639 		struct sk_buff *frag2;
640 
641 		if (first_len - hlen > mtu ||
642 		    ((first_len - hlen) & 7) ||
643 		    skb_cloned(skb) ||
644 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
645 			goto slow_path;
646 
647 		skb_walk_frags(skb, frag) {
648 			/* Correct geometry. */
649 			if (frag->len > mtu ||
650 			    ((frag->len & 7) && frag->next) ||
651 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
652 				goto slow_path_clean;
653 
654 			/* Partially cloned skb? */
655 			if (skb_shared(frag))
656 				goto slow_path_clean;
657 
658 			BUG_ON(frag->sk);
659 			if (skb->sk) {
660 				frag->sk = skb->sk;
661 				frag->destructor = sock_wfree;
662 			}
663 			skb->truesize -= frag->truesize;
664 		}
665 
666 		err = 0;
667 		offset = 0;
668 		/* BUILD HEADER */
669 
670 		*prevhdr = NEXTHDR_FRAGMENT;
671 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
672 		if (!tmp_hdr) {
673 			err = -ENOMEM;
674 			goto fail;
675 		}
676 		frag = skb_shinfo(skb)->frag_list;
677 		skb_frag_list_init(skb);
678 
679 		__skb_pull(skb, hlen);
680 		fh = __skb_push(skb, sizeof(struct frag_hdr));
681 		__skb_push(skb, hlen);
682 		skb_reset_network_header(skb);
683 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
684 
685 		fh->nexthdr = nexthdr;
686 		fh->reserved = 0;
687 		fh->frag_off = htons(IP6_MF);
688 		fh->identification = frag_id;
689 
690 		first_len = skb_pagelen(skb);
691 		skb->data_len = first_len - skb_headlen(skb);
692 		skb->len = first_len;
693 		ipv6_hdr(skb)->payload_len = htons(first_len -
694 						   sizeof(struct ipv6hdr));
695 
696 		for (;;) {
697 			/* Prepare the header of the next frame
698 			 * before the previous one goes down. */
699 			if (frag) {
700 				frag->ip_summed = CHECKSUM_NONE;
701 				skb_reset_transport_header(frag);
702 				fh = __skb_push(frag, sizeof(struct frag_hdr));
703 				__skb_push(frag, hlen);
704 				skb_reset_network_header(frag);
705 				memcpy(skb_network_header(frag), tmp_hdr,
706 				       hlen);
707 				offset += skb->len - hlen - sizeof(struct frag_hdr);
708 				fh->nexthdr = nexthdr;
709 				fh->reserved = 0;
710 				fh->frag_off = htons(offset);
711 				if (frag->next)
712 					fh->frag_off |= htons(IP6_MF);
713 				fh->identification = frag_id;
714 				ipv6_hdr(frag)->payload_len =
715 						htons(frag->len -
716 						      sizeof(struct ipv6hdr));
717 				ip6_copy_metadata(frag, skb);
718 			}
719 
720 			err = output(net, sk, skb);
721 			if (!err)
722 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
723 					      IPSTATS_MIB_FRAGCREATES);
724 
725 			if (err || !frag)
726 				break;
727 
728 			skb = frag;
729 			frag = skb->next;
730 			skb->next = NULL;
731 		}
732 
733 		kfree(tmp_hdr);
734 
735 		if (err == 0) {
736 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
737 				      IPSTATS_MIB_FRAGOKS);
738 			return 0;
739 		}
740 
741 		kfree_skb_list(frag);
742 
743 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
744 			      IPSTATS_MIB_FRAGFAILS);
745 		return err;
746 
747 slow_path_clean:
748 		skb_walk_frags(skb, frag2) {
749 			if (frag2 == frag)
750 				break;
751 			frag2->sk = NULL;
752 			frag2->destructor = NULL;
753 			skb->truesize += frag2->truesize;
754 		}
755 	}
756 
757 slow_path:
758 	left = skb->len - hlen;		/* Space per frame */
759 	ptr = hlen;			/* Where to start from */
760 
761 	/*
762 	 *	Fragment the datagram.
763 	 */
764 
765 	troom = rt->dst.dev->needed_tailroom;
766 
767 	/*
768 	 *	Keep copying data until we run out.
769 	 */
770 	while (left > 0)	{
771 		u8 *fragnexthdr_offset;
772 
773 		len = left;
774 		/* If it doesn't fit, use 'mtu' - the data space left */
775 		if (len > mtu)
776 			len = mtu;
777 		/* If we are not sending up to and including the end of the
778 		   packet, align the next start on an eight-byte boundary */
779 		if (len < left)	{
780 			len &= ~7;
781 		}
782 
783 		/* Allocate buffer */
784 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
785 				 hroom + troom, GFP_ATOMIC);
786 		if (!frag) {
787 			err = -ENOMEM;
788 			goto fail;
789 		}
790 
791 		/*
792 		 *	Set up data on packet
793 		 */
794 
795 		ip6_copy_metadata(frag, skb);
796 		skb_reserve(frag, hroom);
797 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
798 		skb_reset_network_header(frag);
799 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
800 		frag->transport_header = (frag->network_header + hlen +
801 					  sizeof(struct frag_hdr));
802 
803 		/*
804 		 *	Charge the memory for the fragment to any owner
805 		 *	it might possess
806 		 */
807 		if (skb->sk)
808 			skb_set_owner_w(frag, skb->sk);
809 
810 		/*
811 		 *	Copy the packet header into the new buffer.
812 		 */
813 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
814 
815 		fragnexthdr_offset = skb_network_header(frag);
816 		fragnexthdr_offset += prevhdr - skb_network_header(skb);
817 		*fragnexthdr_offset = NEXTHDR_FRAGMENT;
818 
819 		/*
820 		 *	Build fragment header.
821 		 */
822 		fh->nexthdr = nexthdr;
823 		fh->reserved = 0;
824 		fh->identification = frag_id;
825 
826 		/*
827 		 *	Copy a block of the IP datagram.
828 		 */
829 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
830 				     len));
831 		left -= len;
832 
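		/*
		 * Editorial note: the wire format keeps the fragment offset
		 * in 8-octet units in the upper 13 bits of frag_off.  Since
		 * 'offset' is a byte count that is always a multiple of 8,
		 * its low three bits are zero and the byte count already
		 * equals (offset / 8) << 3, so it can be stored as-is;
		 * IP6_MF is then OR-ed into the flag bits.
		 */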
833 		fh->frag_off = htons(offset);
834 		if (left > 0)
835 			fh->frag_off |= htons(IP6_MF);
836 		ipv6_hdr(frag)->payload_len = htons(frag->len -
837 						    sizeof(struct ipv6hdr));
838 
839 		ptr += len;
840 		offset += len;
841 
842 		/*
843 		 *	Put this fragment into the sending queue.
844 		 */
845 		err = output(net, sk, frag);
846 		if (err)
847 			goto fail;
848 
849 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
850 			      IPSTATS_MIB_FRAGCREATES);
851 	}
852 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
853 		      IPSTATS_MIB_FRAGOKS);
854 	consume_skb(skb);
855 	return err;
856 
857 fail_toobig:
858 	if (skb->sk && dst_allfrag(skb_dst(skb)))
859 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
860 
861 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
862 	err = -EMSGSIZE;
863 
864 fail:
865 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
866 		      IPSTATS_MIB_FRAGFAILS);
867 	kfree_skb(skb);
868 	return err;
869 }
870 
871 static inline int ip6_rt_check(const struct rt6key *rt_key,
872 			       const struct in6_addr *fl_addr,
873 			       const struct in6_addr *addr_cache)
874 {
875 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
876 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
877 }
878 
879 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
880 					  struct dst_entry *dst,
881 					  const struct flowi6 *fl6)
882 {
883 	struct ipv6_pinfo *np = inet6_sk(sk);
884 	struct rt6_info *rt;
885 
886 	if (!dst)
887 		goto out;
888 
889 	if (dst->ops->family != AF_INET6) {
890 		dst_release(dst);
891 		return NULL;
892 	}
893 
894 	rt = (struct rt6_info *)dst;
895 	/* Yes, checking route validity in the non-connected
896 	 * case is not very simple. Take into account
897 	 * that we do not support routing by source, TOS,
898 	 * or MSG_DONTROUTE		--ANK (980726)
899 	 *
900 	 * 1. ip6_rt_check(): If the route was a host route,
901 	 *    check that the cached destination is current.
902 	 *    If it is a network route, we may still
903 	 *    check its validity using a saved pointer
904 	 *    to the last used address: daddr_cache.
905 	 *    We do not want to save the whole address now,
906 	 *    (because the main consumer of this service
907 	 *    is tcp, which does not have this problem),
908 	 *    so the last trick works only on connected
909 	 *    sockets.
910 	 * 2. oif should also be the same.
911 	 */
912 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
913 #ifdef CONFIG_IPV6_SUBTREES
914 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
915 #endif
916 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
917 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
918 		dst_release(dst);
919 		dst = NULL;
920 	}
921 
922 out:
923 	return dst;
924 }
925 
926 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
927 			       struct dst_entry **dst, struct flowi6 *fl6)
928 {
929 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
930 	struct neighbour *n;
931 	struct rt6_info *rt;
932 #endif
933 	int err;
934 	int flags = 0;
935 
936 	/* The correct way to handle this would be to do
937 	 * ip6_route_get_saddr, and then ip6_route_output; however,
938 	 * the route-specific preferred source forces the
939 	 * ip6_route_output call _before_ ip6_route_get_saddr.
940 	 *
941 	 * In source-specific routing (no src=any default route),
942 	 * ip6_route_output will fail given a src=any saddr, which is
943 	 * why we try it again later.
944 	 */
945 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
946 		struct fib6_info *from;
947 		struct rt6_info *rt;
948 		bool had_dst = *dst != NULL;
949 
950 		if (!had_dst)
951 			*dst = ip6_route_output(net, sk, fl6);
952 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
953 
954 		rcu_read_lock();
955 		from = rt ? rcu_dereference(rt->from) : NULL;
956 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
957 					  sk ? inet6_sk(sk)->srcprefs : 0,
958 					  &fl6->saddr);
959 		rcu_read_unlock();
960 
961 		if (err)
962 			goto out_err_release;
963 
964 		/* If we had an erroneous initial result, pretend it
965 		 * never existed and let the SA-enabled version take
966 		 * over.
967 		 */
968 		if (!had_dst && (*dst)->error) {
969 			dst_release(*dst);
970 			*dst = NULL;
971 		}
972 
973 		if (fl6->flowi6_oif)
974 			flags |= RT6_LOOKUP_F_IFACE;
975 	}
976 
977 	if (!*dst)
978 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
979 
980 	err = (*dst)->error;
981 	if (err)
982 		goto out_err_release;
983 
984 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
985 	/*
986 	 * Here, if the dst entry we've looked up
987 	 * has a neighbour entry that is in the INCOMPLETE
988 	 * state and the src address from the flow is
989 	 * marked as OPTIMISTIC, we release the found
990 	 * dst entry and replace it with the
991 	 * dst entry of the nexthop router.
992 	 */
993 	rt = (struct rt6_info *) *dst;
994 	rcu_read_lock_bh();
995 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
996 				      rt6_nexthop(rt, &fl6->daddr));
997 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
998 	rcu_read_unlock_bh();
999 
1000 	if (err) {
1001 		struct inet6_ifaddr *ifp;
1002 		struct flowi6 fl_gw6;
1003 		int redirect;
1004 
1005 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1006 				      (*dst)->dev, 1);
1007 
1008 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1009 		if (ifp)
1010 			in6_ifa_put(ifp);
1011 
1012 		if (redirect) {
1013 			/*
1014 			 * We need to get the dst entry for the
1015 			 * default router instead
1016 			 */
1017 			dst_release(*dst);
1018 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1019 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1020 			*dst = ip6_route_output(net, sk, &fl_gw6);
1021 			err = (*dst)->error;
1022 			if (err)
1023 				goto out_err_release;
1024 		}
1025 	}
1026 #endif
1027 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1028 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1029 		err = -EAFNOSUPPORT;
1030 		goto out_err_release;
1031 	}
1032 
1033 	return 0;
1034 
1035 out_err_release:
1036 	dst_release(*dst);
1037 	*dst = NULL;
1038 
1039 	if (err == -ENETUNREACH)
1040 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1041 	return err;
1042 }
1043 
1044 /**
1045  *	ip6_dst_lookup - perform route lookup on flow
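 *	@net: network namespace the lookup is performed in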
1046  *	@sk: socket which provides route info
1047  *	@dst: pointer to dst_entry * for result
1048  *	@fl6: flow to lookup
1049  *
1050  *	This function performs a route lookup on the given flow.
1051  *
1052  *	It returns zero on success, or a standard errno code on error.
1053  */
1054 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1055 		   struct flowi6 *fl6)
1056 {
1057 	*dst = NULL;
1058 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1059 }
1060 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1061 
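/*
 * Illustrative usage sketch (hypothetical caller, editorial):
 *
 *	struct flowi6 fl6 = {
 *		.flowi6_proto	= IPPROTO_UDP,
 *		.flowi6_oif	= oif,
 *		.daddr		= *daddr,
 *		.saddr		= *saddr,	// may be in6addr_any
 *	};
 *	struct dst_entry *dst;
 *	int err = ip6_dst_lookup(net, sk, &dst, &fl6);
 *
 *	if (err)
 *		return err;		// *dst is NULL on error
 *	skb_dst_set(skb, dst);		// consumes the reference
 */
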
1062 /**
1063  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1064  *	@sk: socket which provides route info
1065  *	@fl6: flow to lookup
1066  *	@final_dst: final destination address for ipsec lookup
1067  *
1068  *	This function performs a route lookup on the given flow.
1069  *
1070  *	It returns a valid dst pointer on success, or a pointer encoded
1071  *	error code.
1072  */
1073 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1074 				      const struct in6_addr *final_dst)
1075 {
1076 	struct dst_entry *dst = NULL;
1077 	int err;
1078 
1079 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1080 	if (err)
1081 		return ERR_PTR(err);
1082 	if (final_dst)
1083 		fl6->daddr = *final_dst;
1084 
1085 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1086 }
1087 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1088 
1089 /**
1090  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1091  *	@sk: socket which provides the dst cache and route info
1092  *	@fl6: flow to lookup
1093  *	@final_dst: final destination address for ipsec lookup
1094  *	@connected: whether @sk is connected or not
1095  *
1096  *	This function performs a route lookup on the given flow with the
1097  *	possibility of using the cached route in the socket if it is valid.
1098  *	It will take the socket dst lock when operating on the dst cache.
1099  *	As a result, this function can only be used in process context.
1100  *
1101  *	In addition, for a connected socket, cache the dst in the socket
1102  *	if the current cache is not valid.
1103  *
1104  *	It returns a valid dst pointer on success, or a pointer encoded
1105  *	error code.
1106  */
1107 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1108 					 const struct in6_addr *final_dst,
1109 					 bool connected)
1110 {
1111 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1112 
1113 	dst = ip6_sk_dst_check(sk, dst, fl6);
1114 	if (dst)
1115 		return dst;
1116 
1117 	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1118 	if (connected && !IS_ERR(dst))
1119 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1120 
1121 	return dst;
1122 }
1123 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1124 
1125 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1126 					       gfp_t gfp)
1127 {
1128 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1129 }
1130 
1131 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1132 						gfp_t gfp)
1133 {
1134 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1135 }
1136 
1137 static void ip6_append_data_mtu(unsigned int *mtu,
1138 				int *maxfraglen,
1139 				unsigned int fragheaderlen,
1140 				struct sk_buff *skb,
1141 				struct rt6_info *rt,
1142 				unsigned int orig_mtu)
1143 {
1144 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1145 		if (!skb) {
1146 			/* first fragment, reserve header_len */
1147 			*mtu = orig_mtu - rt->dst.header_len;
1148 
1149 		} else {
1150 			/*
1151 			 * this fragment is not the first; the header
1152 			 * space is regarded as data space.
1153 			 */
1154 			*mtu = orig_mtu;
1155 		}
1156 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1157 			      + fragheaderlen - sizeof(struct frag_hdr);
1158 	}
1159 }
1160 
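/*
 * Worked example (editorial): for mtu == 1500 and fragheaderlen == 40
 * (bare IPv6 header), the recomputation above yields
 *
 *	maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1456 + 32 = 1488
 *
 * so each non-final fragment holds 1448 payload octets and, once the
 * 8-octet fragment header is pushed, occupies 1496 octets on the wire.
 */
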
1161 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1162 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1163 			  struct rt6_info *rt, struct flowi6 *fl6)
1164 {
1165 	struct ipv6_pinfo *np = inet6_sk(sk);
1166 	unsigned int mtu;
1167 	struct ipv6_txoptions *opt = ipc6->opt;
1168 
1169 	/*
1170 	 * setup for corking
1171 	 */
1172 	if (opt) {
1173 		if (WARN_ON(v6_cork->opt))
1174 			return -EINVAL;
1175 
1176 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1177 		if (unlikely(!v6_cork->opt))
1178 			return -ENOBUFS;
1179 
1180 		v6_cork->opt->tot_len = sizeof(*opt);
1181 		v6_cork->opt->opt_flen = opt->opt_flen;
1182 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1183 
1184 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1185 						    sk->sk_allocation);
1186 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1187 			return -ENOBUFS;
1188 
1189 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1190 						    sk->sk_allocation);
1191 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1192 			return -ENOBUFS;
1193 
1194 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1195 						   sk->sk_allocation);
1196 		if (opt->hopopt && !v6_cork->opt->hopopt)
1197 			return -ENOBUFS;
1198 
1199 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1200 						    sk->sk_allocation);
1201 		if (opt->srcrt && !v6_cork->opt->srcrt)
1202 			return -ENOBUFS;
1203 
1204 		/* need source address above --miyazawa */
1205 	}
1206 	dst_hold(&rt->dst);
1207 	cork->base.dst = &rt->dst;
1208 	cork->fl.u.ip6 = *fl6;
1209 	v6_cork->hop_limit = ipc6->hlimit;
1210 	v6_cork->tclass = ipc6->tclass;
1211 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1212 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1213 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1214 	else
1215 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1216 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1217 	if (np->frag_size < mtu) {
1218 		if (np->frag_size)
1219 			mtu = np->frag_size;
1220 	}
1221 	if (mtu < IPV6_MIN_MTU)
1222 		return -EINVAL;
1223 	cork->base.fragsize = mtu;
1224 	cork->base.gso_size = sk->sk_type == SOCK_DGRAM &&
1225 			      sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0;
1226 
1227 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1228 		cork->base.flags |= IPCORK_ALLFRAG;
1229 	cork->base.length = 0;
1230 
1231 	return 0;
1232 }
1233 
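/*
 * Illustrative usage (editorial, hypothetical caller): the per-socket
 * knobs consulted above are set from userspace, e.g.:
 *
 *	int probe = IPV6_PMTUDISC_PROBE;	// np->pmtudisc
 *	setsockopt(fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER, &probe, sizeof(probe));
 *
 *	int mtu = 1280;				// np->frag_size
 *	setsockopt(fd, IPPROTO_IPV6, IPV6_MTU, &mtu, sizeof(mtu));
 */
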
1234 static int __ip6_append_data(struct sock *sk,
1235 			     struct flowi6 *fl6,
1236 			     struct sk_buff_head *queue,
1237 			     struct inet_cork *cork,
1238 			     struct inet6_cork *v6_cork,
1239 			     struct page_frag *pfrag,
1240 			     int getfrag(void *from, char *to, int offset,
1241 					 int len, int odd, struct sk_buff *skb),
1242 			     void *from, int length, int transhdrlen,
1243 			     unsigned int flags, struct ipcm6_cookie *ipc6,
1244 			     const struct sockcm_cookie *sockc)
1245 {
1246 	struct sk_buff *skb, *skb_prev = NULL;
1247 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1248 	int exthdrlen = 0;
1249 	int dst_exthdrlen = 0;
1250 	int hh_len;
1251 	int copy;
1252 	int err;
1253 	int offset = 0;
1254 	__u8 tx_flags = 0;
1255 	u32 tskey = 0;
1256 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1257 	struct ipv6_txoptions *opt = v6_cork->opt;
1258 	int csummode = CHECKSUM_NONE;
1259 	unsigned int maxnonfragsize, headersize;
1260 	unsigned int wmem_alloc_delta = 0;
1261 	bool paged;
1262 
1263 	skb = skb_peek_tail(queue);
1264 	if (!skb) {
1265 		exthdrlen = opt ? opt->opt_flen : 0;
1266 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1267 	}
1268 
1269 	paged = !!cork->gso_size;
1270 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1271 	orig_mtu = mtu;
1272 
1273 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1274 
1275 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1276 			(opt ? opt->opt_nflen : 0);
1277 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1278 		     sizeof(struct frag_hdr);
1279 
1280 	headersize = sizeof(struct ipv6hdr) +
1281 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1282 		     (dst_allfrag(&rt->dst) ?
1283 		      sizeof(struct frag_hdr) : 0) +
1284 		     rt->rt6i_nfheader_len;
1285 
1286 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1287 	 * within the first fragment
1288 	 */
1289 	if (headersize + transhdrlen > mtu)
1290 		goto emsgsize;
1291 
1292 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1293 	    (sk->sk_protocol == IPPROTO_UDP ||
1294 	     sk->sk_protocol == IPPROTO_RAW)) {
1295 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1296 				sizeof(struct ipv6hdr));
1297 		goto emsgsize;
1298 	}
1299 
1300 	if (ip6_sk_ignore_df(sk))
1301 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1302 	else
1303 		maxnonfragsize = mtu;
1304 
1305 	if (cork->length + length > maxnonfragsize - headersize) {
1306 emsgsize:
1307 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1308 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1309 		return -EMSGSIZE;
1310 	}
1311 
1312 	/* CHECKSUM_PARTIAL only with no extension headers and when
1313 	 * we are not going to fragment
1314 	 */
1315 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1316 	    headersize == sizeof(struct ipv6hdr) &&
1317 	    length <= mtu - headersize &&
1318 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1319 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1320 		csummode = CHECKSUM_PARTIAL;
1321 
1322 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1323 		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1324 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1325 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1326 			tskey = sk->sk_tskey++;
1327 	}
1328 
1329 	/*
1330 	 * Let's try using as much space as possible.
1331 	 * Use MTU if total length of the message fits into the MTU.
1332 	 * fragment alignment (= 8-15 octets, in total).
1333 	 * fragment alignment (= 8-15 octects, in total).
1334 	 * Note that we may need to "move" the data from the tail
1335 	 * of the buffer to the new fragment when we split
1336 	 * of the buffer to the new fragment when we split
1337 	 * the message.
1338 	 *
1339 	 * FIXME: It may be fragmented into multiple chunks
1340 	 *        at once if non-fragmentable extension headers
1341 	 *        are too large.
1342 	 * --yoshfuji
1343 	 */
1344 
1345 	cork->length += length;
1346 	if (!skb)
1347 		goto alloc_new_skb;
1348 
1349 	while (length > 0) {
1350 		/* Check if the remaining data fits into current packet. */
1351 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1352 		if (copy < length)
1353 			copy = maxfraglen - skb->len;
1354 
1355 		if (copy <= 0) {
1356 			char *data;
1357 			unsigned int datalen;
1358 			unsigned int fraglen;
1359 			unsigned int fraggap;
1360 			unsigned int alloclen;
1361 			unsigned int pagedlen = 0;
1362 alloc_new_skb:
1363 			/* There's no room in the current skb */
1364 			if (skb)
1365 				fraggap = skb->len - maxfraglen;
1366 			else
1367 				fraggap = 0;
1368 			/* update mtu and maxfraglen if necessary */
1369 			if (!skb || !skb_prev)
1370 				ip6_append_data_mtu(&mtu, &maxfraglen,
1371 						    fragheaderlen, skb, rt,
1372 						    orig_mtu);
1373 
1374 			skb_prev = skb;
1375 
1376 			/*
1377 			 * If remaining data exceeds the mtu,
1378 			 * we know we need more fragment(s).
1379 			 */
1380 			datalen = length + fraggap;
1381 
1382 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1383 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1384 			fraglen = datalen + fragheaderlen;
1385 
1386 			if ((flags & MSG_MORE) &&
1387 			    !(rt->dst.dev->features&NETIF_F_SG))
1388 				alloclen = mtu;
1389 			else if (!paged)
1390 				alloclen = fraglen;
1391 			else {
1392 				alloclen = min_t(int, fraglen, MAX_HEADER);
1393 				pagedlen = fraglen - alloclen;
1394 			}
1395 
1396 			alloclen += dst_exthdrlen;
1397 
1398 			if (datalen != length + fraggap) {
1399 				/*
1400 				 * this is not the last fragment; the trailer
1401 				 * space is regarded as data space.
1402 				 */
1403 				datalen += rt->dst.trailer_len;
1404 			}
1405 
1406 			alloclen += rt->dst.trailer_len;
1407 			fraglen = datalen + fragheaderlen;
1408 
1409 			/*
1410 			 * We just reserve space for the fragment header.
1411 			 * Note: this may be an overallocation if the message
1412 			 * (without MSG_MORE) fits into the MTU.
1413 			 */
1414 			alloclen += sizeof(struct frag_hdr);
1415 
1416 			copy = datalen - transhdrlen - fraggap - pagedlen;
1417 			if (copy < 0) {
1418 				err = -EINVAL;
1419 				goto error;
1420 			}
1421 			if (transhdrlen) {
1422 				skb = sock_alloc_send_skb(sk,
1423 						alloclen + hh_len,
1424 						(flags & MSG_DONTWAIT), &err);
1425 			} else {
1426 				skb = NULL;
1427 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1428 				    2 * sk->sk_sndbuf)
1429 					skb = alloc_skb(alloclen + hh_len,
1430 							sk->sk_allocation);
1431 				if (unlikely(!skb))
1432 					err = -ENOBUFS;
1433 			}
1434 			if (!skb)
1435 				goto error;
1436 			/*
1437 			 *	Fill in the control structures
1438 			 */
1439 			skb->protocol = htons(ETH_P_IPV6);
1440 			skb->ip_summed = csummode;
1441 			skb->csum = 0;
1442 			/* reserve for fragmentation and ipsec header */
1443 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1444 				    dst_exthdrlen);
1445 
1446 			/* Only the initial fragment is time stamped */
1447 			skb_shinfo(skb)->tx_flags = tx_flags;
1448 			tx_flags = 0;
1449 			skb_shinfo(skb)->tskey = tskey;
1450 			tskey = 0;
1451 
1452 			/*
1453 			 *	Find where to start putting bytes
1454 			 */
1455 			data = skb_put(skb, fraglen - pagedlen);
1456 			skb_set_network_header(skb, exthdrlen);
1457 			data += fragheaderlen;
1458 			skb->transport_header = (skb->network_header +
1459 						 fragheaderlen);
1460 			if (fraggap) {
1461 				skb->csum = skb_copy_and_csum_bits(
1462 					skb_prev, maxfraglen,
1463 					data + transhdrlen, fraggap, 0);
1464 				skb_prev->csum = csum_sub(skb_prev->csum,
1465 							  skb->csum);
1466 				data += fraggap;
1467 				pskb_trim_unique(skb_prev, maxfraglen);
1468 			}
1469 			if (copy > 0 &&
1470 			    getfrag(from, data + transhdrlen, offset,
1471 				    copy, fraggap, skb) < 0) {
1472 				err = -EFAULT;
1473 				kfree_skb(skb);
1474 				goto error;
1475 			}
1476 
1477 			offset += copy;
1478 			length -= copy + transhdrlen;
1479 			transhdrlen = 0;
1480 			exthdrlen = 0;
1481 			dst_exthdrlen = 0;
1482 
1483 			if ((flags & MSG_CONFIRM) && !skb_prev)
1484 				skb_set_dst_pending_confirm(skb, 1);
1485 
1486 			/*
1487 			 * Put the packet on the pending queue
1488 			 */
1489 			if (!skb->destructor) {
1490 				skb->destructor = sock_wfree;
1491 				skb->sk = sk;
1492 				wmem_alloc_delta += skb->truesize;
1493 			}
1494 			__skb_queue_tail(queue, skb);
1495 			continue;
1496 		}
1497 
1498 		if (copy > length)
1499 			copy = length;
1500 
1501 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1502 		    skb_tailroom(skb) >= copy) {
1503 			unsigned int off;
1504 
1505 			off = skb->len;
1506 			if (getfrag(from, skb_put(skb, copy),
1507 						offset, copy, off, skb) < 0) {
1508 				__skb_trim(skb, off);
1509 				err = -EFAULT;
1510 				goto error;
1511 			}
1512 		} else {
1513 			int i = skb_shinfo(skb)->nr_frags;
1514 
1515 			err = -ENOMEM;
1516 			if (!sk_page_frag_refill(sk, pfrag))
1517 				goto error;
1518 
1519 			if (!skb_can_coalesce(skb, i, pfrag->page,
1520 					      pfrag->offset)) {
1521 				err = -EMSGSIZE;
1522 				if (i == MAX_SKB_FRAGS)
1523 					goto error;
1524 
1525 				__skb_fill_page_desc(skb, i, pfrag->page,
1526 						     pfrag->offset, 0);
1527 				skb_shinfo(skb)->nr_frags = ++i;
1528 				get_page(pfrag->page);
1529 			}
1530 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1531 			if (getfrag(from,
1532 				    page_address(pfrag->page) + pfrag->offset,
1533 				    offset, copy, skb->len, skb) < 0)
1534 				goto error_efault;
1535 
1536 			pfrag->offset += copy;
1537 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1538 			skb->len += copy;
1539 			skb->data_len += copy;
1540 			skb->truesize += copy;
1541 			wmem_alloc_delta += copy;
1542 		}
1543 		offset += copy;
1544 		length -= copy;
1545 	}
1546 
1547 	if (wmem_alloc_delta)
1548 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1549 	return 0;
1550 
1551 error_efault:
1552 	err = -EFAULT;
1553 error:
1554 	cork->length -= length;
1555 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1556 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1557 	return err;
1558 }
1559 
1560 int ip6_append_data(struct sock *sk,
1561 		    int getfrag(void *from, char *to, int offset, int len,
1562 				int odd, struct sk_buff *skb),
1563 		    void *from, int length, int transhdrlen,
1564 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1565 		    struct rt6_info *rt, unsigned int flags,
1566 		    const struct sockcm_cookie *sockc)
1567 {
1568 	struct inet_sock *inet = inet_sk(sk);
1569 	struct ipv6_pinfo *np = inet6_sk(sk);
1570 	int exthdrlen;
1571 	int err;
1572 
1573 	if (flags&MSG_PROBE)
1574 		return 0;
1575 	if (skb_queue_empty(&sk->sk_write_queue)) {
1576 		/*
1577 		 * setup for corking
1578 		 */
1579 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1580 				     ipc6, rt, fl6);
1581 		if (err)
1582 			return err;
1583 
1584 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1585 		length += exthdrlen;
1586 		transhdrlen += exthdrlen;
1587 	} else {
1588 		fl6 = &inet->cork.fl.u.ip6;
1589 		transhdrlen = 0;
1590 	}
1591 
1592 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1593 				 &np->cork, sk_page_frag(sk), getfrag,
1594 				 from, length, transhdrlen, flags, ipc6, sockc);
1595 }
1596 EXPORT_SYMBOL_GPL(ip6_append_data);
1597 
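/*
 * Illustrative corking pattern (editorial sketch of a typical datagram
 * sender; locking shown, protocol details and error paths elided):
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, getfrag, msg, len,
 *			      sizeof(struct udphdr),	// transhdrlen
 *			      &ipc6, &fl6, rt, flags, &sockc);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 */
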
1598 static void ip6_cork_release(struct inet_cork_full *cork,
1599 			     struct inet6_cork *v6_cork)
1600 {
1601 	if (v6_cork->opt) {
1602 		kfree(v6_cork->opt->dst0opt);
1603 		kfree(v6_cork->opt->dst1opt);
1604 		kfree(v6_cork->opt->hopopt);
1605 		kfree(v6_cork->opt->srcrt);
1606 		kfree(v6_cork->opt);
1607 		v6_cork->opt = NULL;
1608 	}
1609 
1610 	if (cork->base.dst) {
1611 		dst_release(cork->base.dst);
1612 		cork->base.dst = NULL;
1613 		cork->base.flags &= ~IPCORK_ALLFRAG;
1614 	}
1615 	memset(&cork->fl, 0, sizeof(cork->fl));
1616 }
1617 
1618 struct sk_buff *__ip6_make_skb(struct sock *sk,
1619 			       struct sk_buff_head *queue,
1620 			       struct inet_cork_full *cork,
1621 			       struct inet6_cork *v6_cork)
1622 {
1623 	struct sk_buff *skb, *tmp_skb;
1624 	struct sk_buff **tail_skb;
1625 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1626 	struct ipv6_pinfo *np = inet6_sk(sk);
1627 	struct net *net = sock_net(sk);
1628 	struct ipv6hdr *hdr;
1629 	struct ipv6_txoptions *opt = v6_cork->opt;
1630 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1631 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1632 	unsigned char proto = fl6->flowi6_proto;
1633 
1634 	skb = __skb_dequeue(queue);
1635 	if (!skb)
1636 		goto out;
1637 	tail_skb = &(skb_shinfo(skb)->frag_list);
1638 
1639 	/* move skb->data from the ext header to the ip header */
1640 	if (skb->data < skb_network_header(skb))
1641 		__skb_pull(skb, skb_network_offset(skb));
1642 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1643 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1644 		*tail_skb = tmp_skb;
1645 		tail_skb = &(tmp_skb->next);
1646 		skb->len += tmp_skb->len;
1647 		skb->data_len += tmp_skb->len;
1648 		skb->truesize += tmp_skb->truesize;
1649 		tmp_skb->destructor = NULL;
1650 		tmp_skb->sk = NULL;
1651 	}
1652 
1653 	/* Allow local fragmentation. */
1654 	skb->ignore_df = ip6_sk_ignore_df(sk);
1655 
1656 	*final_dst = fl6->daddr;
1657 	__skb_pull(skb, skb_network_header_len(skb));
1658 	if (opt && opt->opt_flen)
1659 		ipv6_push_frag_opts(skb, opt, &proto);
1660 	if (opt && opt->opt_nflen)
1661 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1662 
1663 	skb_push(skb, sizeof(struct ipv6hdr));
1664 	skb_reset_network_header(skb);
1665 	hdr = ipv6_hdr(skb);
1666 
1667 	ip6_flow_hdr(hdr, v6_cork->tclass,
1668 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1669 					ip6_autoflowlabel(net, np), fl6));
1670 	hdr->hop_limit = v6_cork->hop_limit;
1671 	hdr->nexthdr = proto;
1672 	hdr->saddr = fl6->saddr;
1673 	hdr->daddr = *final_dst;
1674 
1675 	skb->priority = sk->sk_priority;
1676 	skb->mark = sk->sk_mark;
1677 
1678 	skb_dst_set(skb, dst_clone(&rt->dst));
1679 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1680 	if (proto == IPPROTO_ICMPV6) {
1681 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1682 
1683 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1684 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1685 	}
1686 
1687 	ip6_cork_release(cork, v6_cork);
1688 out:
1689 	return skb;
1690 }
1691 
1692 int ip6_send_skb(struct sk_buff *skb)
1693 {
1694 	struct net *net = sock_net(skb->sk);
1695 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1696 	int err;
1697 
1698 	err = ip6_local_out(net, skb->sk, skb);
1699 	if (err) {
1700 		if (err > 0)
1701 			err = net_xmit_errno(err);
1702 		if (err)
1703 			IP6_INC_STATS(net, rt->rt6i_idev,
1704 				      IPSTATS_MIB_OUTDISCARDS);
1705 	}
1706 
1707 	return err;
1708 }
1709 
1710 int ip6_push_pending_frames(struct sock *sk)
1711 {
1712 	struct sk_buff *skb;
1713 
1714 	skb = ip6_finish_skb(sk);
1715 	if (!skb)
1716 		return 0;
1717 
1718 	return ip6_send_skb(skb);
1719 }
1720 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1721 
1722 static void __ip6_flush_pending_frames(struct sock *sk,
1723 				       struct sk_buff_head *queue,
1724 				       struct inet_cork_full *cork,
1725 				       struct inet6_cork *v6_cork)
1726 {
1727 	struct sk_buff *skb;
1728 
1729 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1730 		if (skb_dst(skb))
1731 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1732 				      IPSTATS_MIB_OUTDISCARDS);
1733 		kfree_skb(skb);
1734 	}
1735 
1736 	ip6_cork_release(cork, v6_cork);
1737 }
1738 
1739 void ip6_flush_pending_frames(struct sock *sk)
1740 {
1741 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1742 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1743 }
1744 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1745 
1746 struct sk_buff *ip6_make_skb(struct sock *sk,
1747 			     int getfrag(void *from, char *to, int offset,
1748 					 int len, int odd, struct sk_buff *skb),
1749 			     void *from, int length, int transhdrlen,
1750 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1751 			     struct rt6_info *rt, unsigned int flags,
1752 			     struct inet_cork_full *cork,
1753 			     const struct sockcm_cookie *sockc)
1754 {
1755 	struct inet6_cork v6_cork;
1756 	struct sk_buff_head queue;
1757 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1758 	int err;
1759 
1760 	if (flags & MSG_PROBE)
1761 		return NULL;
1762 
1763 	__skb_queue_head_init(&queue);
1764 
1765 	cork->base.flags = 0;
1766 	cork->base.addr = 0;
1767 	cork->base.opt = NULL;
1768 	cork->base.dst = NULL;
1769 	v6_cork.opt = NULL;
1770 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1771 	if (err) {
1772 		ip6_cork_release(cork, &v6_cork);
1773 		return ERR_PTR(err);
1774 	}
1775 	if (ipc6->dontfrag < 0)
1776 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1777 
1778 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1779 				&current->task_frag, getfrag, from,
1780 				length + exthdrlen, transhdrlen + exthdrlen,
1781 				flags, ipc6, sockc);
1782 	if (err) {
1783 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1784 		return ERR_PTR(err);
1785 	}
1786 
1787 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1788 }
1789