xref: /linux/net/ipv6/ip6_output.c (revision fee6d4c777a125e56de9370db3b2bf359bf958d6)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	airthmetics in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
/*
 * Final IPv6 output step: resolve the next hop and hand the packet to the
 * neighbour layer.
 *
 * For a multicast destination a clone may first be looped back through the
 * POST_ROUTING hook, and the original is dropped here when its hop limit is
 * zero or when its multicast scope is node-local (or less) and the output
 * device is not a loopback device.
 *
 * The skb is either passed on to dst_neigh_output() or freed here.
 *
 * Returns the result of dst_neigh_output(), 0 when the packet was
 * deliberately discarded, or -EINVAL when no neighbour entry could be
 * looked up or created.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a clone back when the device is not loopback, the
		 * socket permits multicast loopback, and either
		 * mroute6_socket() is set for a packet that was not forwarded
		 * or ipv6_chk_mcast_addr() matches on the output device.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* A zero hop limit means the packet must not leave
			 * this node: the looped-back clone (if any) is all
			 * that gets delivered.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		/* Node-local (or narrower) scope never goes out on a
		 * non-loopback device.
		 */
		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* The neighbour is looked up without taking a reference, so it is
	 * only used inside this RCU-bh read section.
	 */
	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	/* Neither lookup nor creation produced a neighbour entry. */
	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
122 
123 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
124 {
125 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
126 	    dst_allfrag(skb_dst(skb)) ||
127 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
128 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
129 	else
130 		return ip6_finish_output2(net, sk, skb);
131 }
132 
133 int ip6_output(struct sock *sk, struct sk_buff *skb)
134 {
135 	struct net_device *dev = skb_dst(skb)->dev;
136 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
137 	struct net *net = dev_net(dev);
138 
139 	if (unlikely(idev->cnf.disable_ipv6)) {
140 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
141 		kfree_skb(skb);
142 		return 0;
143 	}
144 
145 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
146 			    net, sk, skb, NULL, dev,
147 			    ip6_finish_output,
148 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
149 }
150 
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * @sk:     sending socket (priority, mark and IPv6 per-socket info are read)
 * @skb:    packet whose data starts at the transport header
 * @fl6:    flow providing addresses, protocol and flow label
 * @opt:    optional extension headers to push in front of the payload
 * @tclass: traffic class for the IPv6 header
 *
 * Pushes the extension headers (if any) and the IPv6 header, then sends
 * the packet through the LOCAL_OUT hook.  A packet that is longer than the
 * route MTU, is not GSO and does not have ignore_df set is not sent:
 * the socket gets an EMSGSIZE local error and -EMSGSIZE is returned.
 * Returns -ENOBUFS when more headroom is needed and cannot be allocated.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		/* Extension headers count towards the IPv6 payload length. */
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			/* The reallocated copy replaces the original skb. */
			consume_skb(skb);
			skb = skb2;
			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
			 * it is safe to call in our context (socket lock not held)
			 */
			skb_set_owner_w(skb, (struct sock *)sk);
		}
		/* Headers are pushed innermost first; both helpers may
		 * update proto, and the nfrag helper may update first_hop.
		 */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	/* A negative per-socket value means "use the route's hop limit". */
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* NOTE(review): np is NULL-checked above but dereferenced
	 * unconditionally here - confirm that every caller passes a socket
	 * for which inet6_sk() is non-NULL.
	 */
	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     np->autoflowlabel, fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output_okfn);
	}

	/* Too big and not allowed to exceed the MTU: report and drop. */
	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
250 
251 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
252 {
253 	struct ip6_ra_chain *ra;
254 	struct sock *last = NULL;
255 
256 	read_lock(&ip6_ra_lock);
257 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
258 		struct sock *sk = ra->sk;
259 		if (sk && ra->sel == sel &&
260 		    (!sk->sk_bound_dev_if ||
261 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
262 			if (last) {
263 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
264 				if (skb2)
265 					rawv6_rcv(last, skb2);
266 			}
267 			last = sk;
268 		}
269 	}
270 
271 	if (last) {
272 		rawv6_rcv(last, skb);
273 		read_unlock(&ip6_ra_lock);
274 		return 1;
275 	}
276 	read_unlock(&ip6_ra_lock);
277 	return 0;
278 }
279 
280 static int ip6_forward_proxy_check(struct sk_buff *skb)
281 {
282 	struct ipv6hdr *hdr = ipv6_hdr(skb);
283 	u8 nexthdr = hdr->nexthdr;
284 	__be16 frag_off;
285 	int offset;
286 
287 	if (ipv6_ext_hdr(nexthdr)) {
288 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
289 		if (offset < 0)
290 			return 0;
291 	} else
292 		offset = sizeof(struct ipv6hdr);
293 
294 	if (nexthdr == IPPROTO_ICMPV6) {
295 		struct icmp6hdr *icmp6;
296 
297 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
298 					 offset + 1 - skb->data)))
299 			return 0;
300 
301 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
302 
303 		switch (icmp6->icmp6_type) {
304 		case NDISC_ROUTER_SOLICITATION:
305 		case NDISC_ROUTER_ADVERTISEMENT:
306 		case NDISC_NEIGHBOUR_SOLICITATION:
307 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
308 		case NDISC_REDIRECT:
309 			/* For reaction involving unicast neighbor discovery
310 			 * message destined to the proxied address, pass it to
311 			 * input function.
312 			 */
313 			return 1;
314 		default:
315 			break;
316 		}
317 	}
318 
319 	/*
320 	 * The proxying router can't forward traffic sent to a link-local
321 	 * address, so signal the sender and discard the packet. This
322 	 * behavior is clarified by the MIPv6 specification.
323 	 */
324 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
325 		dst_link_failure(skb);
326 		return -1;
327 	}
328 
329 	return 0;
330 }
331 
/*
 * Continuation of ip6_forward() once the FORWARD hook has accepted the
 * packet: clear the recorded sender CPU and pass the skb to the route's
 * output handler.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	int rc;

	skb_sender_cpu_clear(skb);
	rc = dst_output(sk, skb);

	return rc;
}
338 
339 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
340 {
341 	unsigned int mtu;
342 	struct inet6_dev *idev;
343 
344 	if (dst_metric_locked(dst, RTAX_MTU)) {
345 		mtu = dst_metric_raw(dst, RTAX_MTU);
346 		if (mtu)
347 			return mtu;
348 	}
349 
350 	mtu = IPV6_MIN_MTU;
351 	rcu_read_lock();
352 	idev = __in6_dev_get(dst->dev);
353 	if (idev)
354 		mtu = idev->cnf.mtu6;
355 	rcu_read_unlock();
356 
357 	return mtu;
358 }
359 
360 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
361 {
362 	if (skb->len <= mtu)
363 		return false;
364 
365 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
366 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
367 		return true;
368 
369 	if (skb->ignore_df)
370 		return false;
371 
372 	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
373 		return false;
374 
375 	return true;
376 }
377 
378 int ip6_forward(struct sk_buff *skb)
379 {
380 	struct dst_entry *dst = skb_dst(skb);
381 	struct ipv6hdr *hdr = ipv6_hdr(skb);
382 	struct inet6_skb_parm *opt = IP6CB(skb);
383 	struct net *net = dev_net(dst->dev);
384 	u32 mtu;
385 
386 	if (net->ipv6.devconf_all->forwarding == 0)
387 		goto error;
388 
389 	if (skb->pkt_type != PACKET_HOST)
390 		goto drop;
391 
392 	if (skb_warn_if_lro(skb))
393 		goto drop;
394 
395 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
396 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
397 				 IPSTATS_MIB_INDISCARDS);
398 		goto drop;
399 	}
400 
401 	skb_forward_csum(skb);
402 
403 	/*
404 	 *	We DO NOT make any processing on
405 	 *	RA packets, pushing them to user level AS IS
406 	 *	without ane WARRANTY that application will be able
407 	 *	to interpret them. The reason is that we
408 	 *	cannot make anything clever here.
409 	 *
410 	 *	We are not end-node, so that if packet contains
411 	 *	AH/ESP, we cannot make anything.
412 	 *	Defragmentation also would be mistake, RA packets
413 	 *	cannot be fragmented, because there is no warranty
414 	 *	that different fragments will go along one path. --ANK
415 	 */
416 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
417 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
418 			return 0;
419 	}
420 
421 	/*
422 	 *	check and decrement ttl
423 	 */
424 	if (hdr->hop_limit <= 1) {
425 		/* Force OUTPUT device used as source address */
426 		skb->dev = dst->dev;
427 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
428 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
429 				 IPSTATS_MIB_INHDRERRORS);
430 
431 		kfree_skb(skb);
432 		return -ETIMEDOUT;
433 	}
434 
435 	/* XXX: idev->cnf.proxy_ndp? */
436 	if (net->ipv6.devconf_all->proxy_ndp &&
437 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
438 		int proxied = ip6_forward_proxy_check(skb);
439 		if (proxied > 0)
440 			return ip6_input(skb);
441 		else if (proxied < 0) {
442 			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
443 					 IPSTATS_MIB_INDISCARDS);
444 			goto drop;
445 		}
446 	}
447 
448 	if (!xfrm6_route_forward(skb)) {
449 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
450 				 IPSTATS_MIB_INDISCARDS);
451 		goto drop;
452 	}
453 	dst = skb_dst(skb);
454 
455 	/* IPv6 specs say nothing about it, but it is clear that we cannot
456 	   send redirects to source routed frames.
457 	   We don't send redirects to frames decapsulated from IPsec.
458 	 */
459 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
460 		struct in6_addr *target = NULL;
461 		struct inet_peer *peer;
462 		struct rt6_info *rt;
463 
464 		/*
465 		 *	incoming and outgoing devices are the same
466 		 *	send a redirect.
467 		 */
468 
469 		rt = (struct rt6_info *) dst;
470 		if (rt->rt6i_flags & RTF_GATEWAY)
471 			target = &rt->rt6i_gateway;
472 		else
473 			target = &hdr->daddr;
474 
475 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
476 
477 		/* Limit redirects both by destination (here)
478 		   and by source (inside ndisc_send_redirect)
479 		 */
480 		if (inet_peer_xrlim_allow(peer, 1*HZ))
481 			ndisc_send_redirect(skb, target);
482 		if (peer)
483 			inet_putpeer(peer);
484 	} else {
485 		int addrtype = ipv6_addr_type(&hdr->saddr);
486 
487 		/* This check is security critical. */
488 		if (addrtype == IPV6_ADDR_ANY ||
489 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
490 			goto error;
491 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
492 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
493 				    ICMPV6_NOT_NEIGHBOUR, 0);
494 			goto error;
495 		}
496 	}
497 
498 	mtu = ip6_dst_mtu_forward(dst);
499 	if (mtu < IPV6_MIN_MTU)
500 		mtu = IPV6_MIN_MTU;
501 
502 	if (ip6_pkt_too_big(skb, mtu)) {
503 		/* Again, force OUTPUT device used as source address */
504 		skb->dev = dst->dev;
505 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
506 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
507 				 IPSTATS_MIB_INTOOBIGERRORS);
508 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
509 				 IPSTATS_MIB_FRAGFAILS);
510 		kfree_skb(skb);
511 		return -EMSGSIZE;
512 	}
513 
514 	if (skb_cow(skb, dst->dev->hard_header_len)) {
515 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
516 				 IPSTATS_MIB_OUTDISCARDS);
517 		goto drop;
518 	}
519 
520 	hdr = ipv6_hdr(skb);
521 
522 	/* Mangling hops number delayed to point after skb COW */
523 
524 	hdr->hop_limit--;
525 
526 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
527 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
528 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
529 		       net, NULL, skb, skb->dev, dst->dev,
530 		       ip6_forward_finish);
531 
532 error:
533 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
534 drop:
535 	kfree_skb(skb);
536 	return -EINVAL;
537 }
538 
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
540 {
541 	to->pkt_type = from->pkt_type;
542 	to->priority = from->priority;
543 	to->protocol = from->protocol;
544 	skb_dst_drop(to);
545 	skb_dst_set(to, dst_clone(skb_dst(from)));
546 	to->dev = from->dev;
547 	to->mark = from->mark;
548 
549 #ifdef CONFIG_NET_SCHED
550 	to->tc_index = from->tc_index;
551 #endif
552 	nf_copy(to, from);
553 	skb_copy_secmark(to, from);
554 }
555 
556 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
557 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
558 {
559 	struct sk_buff *frag;
560 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
561 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
562 				inet6_sk(skb->sk) : NULL;
563 	struct ipv6hdr *tmp_hdr;
564 	struct frag_hdr *fh;
565 	unsigned int mtu, hlen, left, len;
566 	int hroom, troom;
567 	__be32 frag_id;
568 	int ptr, offset = 0, err = 0;
569 	u8 *prevhdr, nexthdr = 0;
570 
571 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
572 	nexthdr = *prevhdr;
573 
574 	mtu = ip6_skb_dst_mtu(skb);
575 
576 	/* We must not fragment if the socket is set to force MTU discovery
577 	 * or if the skb it not generated by a local socket.
578 	 */
579 	if (unlikely(!skb->ignore_df && skb->len > mtu))
580 		goto fail_toobig;
581 
582 	if (IP6CB(skb)->frag_max_size) {
583 		if (IP6CB(skb)->frag_max_size > mtu)
584 			goto fail_toobig;
585 
586 		/* don't send fragments larger than what we received */
587 		mtu = IP6CB(skb)->frag_max_size;
588 		if (mtu < IPV6_MIN_MTU)
589 			mtu = IPV6_MIN_MTU;
590 	}
591 
592 	if (np && np->frag_size < mtu) {
593 		if (np->frag_size)
594 			mtu = np->frag_size;
595 	}
596 	mtu -= hlen + sizeof(struct frag_hdr);
597 
598 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
599 				    &ipv6_hdr(skb)->saddr);
600 
601 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
602 	if (skb_has_frag_list(skb)) {
603 		int first_len = skb_pagelen(skb);
604 		struct sk_buff *frag2;
605 
606 		if (first_len - hlen > mtu ||
607 		    ((first_len - hlen) & 7) ||
608 		    skb_cloned(skb) ||
609 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
610 			goto slow_path;
611 
612 		skb_walk_frags(skb, frag) {
613 			/* Correct geometry. */
614 			if (frag->len > mtu ||
615 			    ((frag->len & 7) && frag->next) ||
616 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
617 				goto slow_path_clean;
618 
619 			/* Partially cloned skb? */
620 			if (skb_shared(frag))
621 				goto slow_path_clean;
622 
623 			BUG_ON(frag->sk);
624 			if (skb->sk) {
625 				frag->sk = skb->sk;
626 				frag->destructor = sock_wfree;
627 			}
628 			skb->truesize -= frag->truesize;
629 		}
630 
631 		err = 0;
632 		offset = 0;
633 		/* BUILD HEADER */
634 
635 		*prevhdr = NEXTHDR_FRAGMENT;
636 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
637 		if (!tmp_hdr) {
638 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
639 				      IPSTATS_MIB_FRAGFAILS);
640 			err = -ENOMEM;
641 			goto fail;
642 		}
643 		frag = skb_shinfo(skb)->frag_list;
644 		skb_frag_list_init(skb);
645 
646 		__skb_pull(skb, hlen);
647 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
648 		__skb_push(skb, hlen);
649 		skb_reset_network_header(skb);
650 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
651 
652 		fh->nexthdr = nexthdr;
653 		fh->reserved = 0;
654 		fh->frag_off = htons(IP6_MF);
655 		fh->identification = frag_id;
656 
657 		first_len = skb_pagelen(skb);
658 		skb->data_len = first_len - skb_headlen(skb);
659 		skb->len = first_len;
660 		ipv6_hdr(skb)->payload_len = htons(first_len -
661 						   sizeof(struct ipv6hdr));
662 
663 		dst_hold(&rt->dst);
664 
665 		for (;;) {
666 			/* Prepare header of the next frame,
667 			 * before previous one went down. */
668 			if (frag) {
669 				frag->ip_summed = CHECKSUM_NONE;
670 				skb_reset_transport_header(frag);
671 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
672 				__skb_push(frag, hlen);
673 				skb_reset_network_header(frag);
674 				memcpy(skb_network_header(frag), tmp_hdr,
675 				       hlen);
676 				offset += skb->len - hlen - sizeof(struct frag_hdr);
677 				fh->nexthdr = nexthdr;
678 				fh->reserved = 0;
679 				fh->frag_off = htons(offset);
680 				if (frag->next)
681 					fh->frag_off |= htons(IP6_MF);
682 				fh->identification = frag_id;
683 				ipv6_hdr(frag)->payload_len =
684 						htons(frag->len -
685 						      sizeof(struct ipv6hdr));
686 				ip6_copy_metadata(frag, skb);
687 			}
688 
689 			err = output(net, sk, skb);
690 			if (!err)
691 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
692 					      IPSTATS_MIB_FRAGCREATES);
693 
694 			if (err || !frag)
695 				break;
696 
697 			skb = frag;
698 			frag = skb->next;
699 			skb->next = NULL;
700 		}
701 
702 		kfree(tmp_hdr);
703 
704 		if (err == 0) {
705 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
706 				      IPSTATS_MIB_FRAGOKS);
707 			ip6_rt_put(rt);
708 			return 0;
709 		}
710 
711 		kfree_skb_list(frag);
712 
713 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
714 			      IPSTATS_MIB_FRAGFAILS);
715 		ip6_rt_put(rt);
716 		return err;
717 
718 slow_path_clean:
719 		skb_walk_frags(skb, frag2) {
720 			if (frag2 == frag)
721 				break;
722 			frag2->sk = NULL;
723 			frag2->destructor = NULL;
724 			skb->truesize += frag2->truesize;
725 		}
726 	}
727 
728 slow_path:
729 	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
730 	    skb_checksum_help(skb))
731 		goto fail;
732 
733 	left = skb->len - hlen;		/* Space per frame */
734 	ptr = hlen;			/* Where to start from */
735 
736 	/*
737 	 *	Fragment the datagram.
738 	 */
739 
740 	*prevhdr = NEXTHDR_FRAGMENT;
741 	troom = rt->dst.dev->needed_tailroom;
742 
743 	/*
744 	 *	Keep copying data until we run out.
745 	 */
746 	while (left > 0)	{
747 		len = left;
748 		/* IF: it doesn't fit, use 'mtu' - the data space left */
749 		if (len > mtu)
750 			len = mtu;
751 		/* IF: we are not sending up to and including the packet end
752 		   then align the next start on an eight byte boundary */
753 		if (len < left)	{
754 			len &= ~7;
755 		}
756 
757 		/* Allocate buffer */
758 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
759 				 hroom + troom, GFP_ATOMIC);
760 		if (!frag) {
761 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
762 				      IPSTATS_MIB_FRAGFAILS);
763 			err = -ENOMEM;
764 			goto fail;
765 		}
766 
767 		/*
768 		 *	Set up data on packet
769 		 */
770 
771 		ip6_copy_metadata(frag, skb);
772 		skb_reserve(frag, hroom);
773 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
774 		skb_reset_network_header(frag);
775 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
776 		frag->transport_header = (frag->network_header + hlen +
777 					  sizeof(struct frag_hdr));
778 
779 		/*
780 		 *	Charge the memory for the fragment to any owner
781 		 *	it might possess
782 		 */
783 		if (skb->sk)
784 			skb_set_owner_w(frag, skb->sk);
785 
786 		/*
787 		 *	Copy the packet header into the new buffer.
788 		 */
789 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
790 
791 		/*
792 		 *	Build fragment header.
793 		 */
794 		fh->nexthdr = nexthdr;
795 		fh->reserved = 0;
796 		fh->identification = frag_id;
797 
798 		/*
799 		 *	Copy a block of the IP datagram.
800 		 */
801 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
802 				     len));
803 		left -= len;
804 
805 		fh->frag_off = htons(offset);
806 		if (left > 0)
807 			fh->frag_off |= htons(IP6_MF);
808 		ipv6_hdr(frag)->payload_len = htons(frag->len -
809 						    sizeof(struct ipv6hdr));
810 
811 		ptr += len;
812 		offset += len;
813 
814 		/*
815 		 *	Put this fragment into the sending queue.
816 		 */
817 		err = output(net, sk, frag);
818 		if (err)
819 			goto fail;
820 
821 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
822 			      IPSTATS_MIB_FRAGCREATES);
823 	}
824 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
825 		      IPSTATS_MIB_FRAGOKS);
826 	consume_skb(skb);
827 	return err;
828 
829 fail_toobig:
830 	if (skb->sk && dst_allfrag(skb_dst(skb)))
831 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
832 
833 	skb->dev = skb_dst(skb)->dev;
834 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
835 	err = -EMSGSIZE;
836 
837 fail:
838 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
839 		      IPSTATS_MIB_FRAGFAILS);
840 	kfree_skb(skb);
841 	return err;
842 }
843 
844 static inline int ip6_rt_check(const struct rt6key *rt_key,
845 			       const struct in6_addr *fl_addr,
846 			       const struct in6_addr *addr_cache)
847 {
848 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
849 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
850 }
851 
852 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
853 					  struct dst_entry *dst,
854 					  const struct flowi6 *fl6)
855 {
856 	struct ipv6_pinfo *np = inet6_sk(sk);
857 	struct rt6_info *rt;
858 
859 	if (!dst)
860 		goto out;
861 
862 	if (dst->ops->family != AF_INET6) {
863 		dst_release(dst);
864 		return NULL;
865 	}
866 
867 	rt = (struct rt6_info *)dst;
868 	/* Yes, checking route validity in not connected
869 	 * case is not very simple. Take into account,
870 	 * that we do not support routing by source, TOS,
871 	 * and MSG_DONTROUTE		--ANK (980726)
872 	 *
873 	 * 1. ip6_rt_check(): If route was host route,
874 	 *    check that cached destination is current.
875 	 *    If it is network route, we still may
876 	 *    check its validity using saved pointer
877 	 *    to the last used address: daddr_cache.
878 	 *    We do not want to save whole address now,
879 	 *    (because main consumer of this service
880 	 *    is tcp, which has not this problem),
881 	 *    so that the last trick works only on connected
882 	 *    sockets.
883 	 * 2. oif also should be the same.
884 	 */
885 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
886 #ifdef CONFIG_IPV6_SUBTREES
887 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
888 #endif
889 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
890 		dst_release(dst);
891 		dst = NULL;
892 	}
893 
894 out:
895 	return dst;
896 }
897 
/*
 * Common tail of the route lookup helpers.
 *
 * @net: network namespace to look the route up in
 * @sk:  socket supplying source address preferences; may be NULL
 * @dst: in/out; a non-NULL *dst on entry is used instead of performing a
 *       fresh lookup
 * @fl6: flow to route; fl6->saddr is filled in when it is unspecified
 *
 * Returns 0 with a route in *dst, or a negative errno, in which case the
 * reference on *dst has been released and *dst is NULL.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		/* An erroneous route contributes no preferred source. */
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}
	}

	/* Look up (again) now that the source address is known. */
	if (!*dst)
		*dst = ip6_route_output(net, sk, fl6);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/* err is only a local flag here: set when a neighbour entry exists
	 * but is not in a NUD_VALID state.
	 */
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* Same flow, but with an unspecified destination. */
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
999 
1000 /**
1001  *	ip6_dst_lookup - perform route lookup on flow
1002  *	@sk: socket which provides route info
1003  *	@dst: pointer to dst_entry * for result
1004  *	@fl6: flow to lookup
1005  *
1006  *	This function performs a route lookup on the given flow.
1007  *
1008  *	It returns zero on success, or a standard errno code on error.
1009  */
1010 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1011 		   struct flowi6 *fl6)
1012 {
1013 	*dst = NULL;
1014 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1015 }
1016 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1017 
1018 /**
1019  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1020  *	@sk: socket which provides route info
1021  *	@fl6: flow to lookup
1022  *	@final_dst: final destination address for ipsec lookup
1023  *
1024  *	This function performs a route lookup on the given flow.
1025  *
1026  *	It returns a valid dst pointer on success, or a pointer encoded
1027  *	error code.
1028  */
1029 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1030 				      const struct in6_addr *final_dst)
1031 {
1032 	struct dst_entry *dst = NULL;
1033 	int err;
1034 
1035 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1036 	if (err)
1037 		return ERR_PTR(err);
1038 	if (final_dst)
1039 		fl6->daddr = *final_dst;
1040 	if (!fl6->flowi6_oif)
1041 		fl6->flowi6_oif = dst->dev->ifindex;
1042 
1043 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1044 }
1045 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1046 
1047 /**
1048  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1049  *	@sk: socket which provides the dst cache and route info
1050  *	@fl6: flow to lookup
1051  *	@final_dst: final destination address for ipsec lookup
1052  *
1053  *	This function performs a route lookup on the given flow with the
1054  *	possibility of using the cached route in the socket if it is valid.
1055  *	It will take the socket dst lock when operating on the dst cache.
1056  *	As a result, this function can only be used in process context.
1057  *
1058  *	It returns a valid dst pointer on success, or a pointer encoded
1059  *	error code.
1060  */
1061 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1062 					 const struct in6_addr *final_dst)
1063 {
1064 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1065 	int err;
1066 
1067 	dst = ip6_sk_dst_check(sk, dst, fl6);
1068 
1069 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1070 	if (err)
1071 		return ERR_PTR(err);
1072 	if (final_dst)
1073 		fl6->daddr = *final_dst;
1074 
1075 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1076 }
1077 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1078 
1079 static inline int ip6_ufo_append_data(struct sock *sk,
1080 			struct sk_buff_head *queue,
1081 			int getfrag(void *from, char *to, int offset, int len,
1082 			int odd, struct sk_buff *skb),
1083 			void *from, int length, int hh_len, int fragheaderlen,
1084 			int transhdrlen, int mtu, unsigned int flags,
1085 			const struct flowi6 *fl6)
1086 
1087 {
1088 	struct sk_buff *skb;
1089 	int err;
1090 
1091 	/* There is support for UDP large send offload by network
1092 	 * device, so create one single skb packet containing complete
1093 	 * udp datagram
1094 	 */
1095 	skb = skb_peek_tail(queue);
1096 	if (!skb) {
1097 		skb = sock_alloc_send_skb(sk,
1098 			hh_len + fragheaderlen + transhdrlen + 20,
1099 			(flags & MSG_DONTWAIT), &err);
1100 		if (!skb)
1101 			return err;
1102 
1103 		/* reserve space for Hardware header */
1104 		skb_reserve(skb, hh_len);
1105 
1106 		/* create space for UDP/IP header */
1107 		skb_put(skb, fragheaderlen + transhdrlen);
1108 
1109 		/* initialize network header pointer */
1110 		skb_reset_network_header(skb);
1111 
1112 		/* initialize protocol header pointer */
1113 		skb->transport_header = skb->network_header + fragheaderlen;
1114 
1115 		skb->protocol = htons(ETH_P_IPV6);
1116 		skb->csum = 0;
1117 
1118 		__skb_queue_tail(queue, skb);
1119 	} else if (skb_is_gso(skb)) {
1120 		goto append;
1121 	}
1122 
1123 	skb->ip_summed = CHECKSUM_PARTIAL;
1124 	/* Specify the length of each IPv6 datagram fragment.
1125 	 * It has to be a multiple of 8.
1126 	 */
1127 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1128 				     sizeof(struct frag_hdr)) & ~7;
1129 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1130 	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1131 							 &fl6->daddr,
1132 							 &fl6->saddr);
1133 
1134 append:
1135 	return skb_append_datato_frags(sk, skb, getfrag, from,
1136 				       (length - transhdrlen));
1137 }
1138 
1139 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1140 					       gfp_t gfp)
1141 {
1142 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1143 }
1144 
1145 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1146 						gfp_t gfp)
1147 {
1148 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1149 }
1150 
1151 static void ip6_append_data_mtu(unsigned int *mtu,
1152 				int *maxfraglen,
1153 				unsigned int fragheaderlen,
1154 				struct sk_buff *skb,
1155 				struct rt6_info *rt,
1156 				unsigned int orig_mtu)
1157 {
1158 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1159 		if (!skb) {
1160 			/* first fragment, reserve header_len */
1161 			*mtu = orig_mtu - rt->dst.header_len;
1162 
1163 		} else {
1164 			/*
1165 			 * this fragment is not first, the headers
1166 			 * space is regarded as data space.
1167 			 */
1168 			*mtu = orig_mtu;
1169 		}
1170 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1171 			      + fragheaderlen - sizeof(struct frag_hdr);
1172 	}
1173 }
1174 
1175 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1176 			  struct inet6_cork *v6_cork,
1177 			  int hlimit, int tclass, struct ipv6_txoptions *opt,
1178 			  struct rt6_info *rt, struct flowi6 *fl6)
1179 {
1180 	struct ipv6_pinfo *np = inet6_sk(sk);
1181 	unsigned int mtu;
1182 
1183 	/*
1184 	 * setup for corking
1185 	 */
1186 	if (opt) {
1187 		if (WARN_ON(v6_cork->opt))
1188 			return -EINVAL;
1189 
1190 		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1191 		if (unlikely(!v6_cork->opt))
1192 			return -ENOBUFS;
1193 
1194 		v6_cork->opt->tot_len = opt->tot_len;
1195 		v6_cork->opt->opt_flen = opt->opt_flen;
1196 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1197 
1198 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1199 						    sk->sk_allocation);
1200 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1201 			return -ENOBUFS;
1202 
1203 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1204 						    sk->sk_allocation);
1205 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1206 			return -ENOBUFS;
1207 
1208 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1209 						   sk->sk_allocation);
1210 		if (opt->hopopt && !v6_cork->opt->hopopt)
1211 			return -ENOBUFS;
1212 
1213 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1214 						    sk->sk_allocation);
1215 		if (opt->srcrt && !v6_cork->opt->srcrt)
1216 			return -ENOBUFS;
1217 
1218 		/* need source address above miyazawa*/
1219 	}
1220 	dst_hold(&rt->dst);
1221 	cork->base.dst = &rt->dst;
1222 	cork->fl.u.ip6 = *fl6;
1223 	v6_cork->hop_limit = hlimit;
1224 	v6_cork->tclass = tclass;
1225 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1226 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1227 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1228 	else
1229 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1230 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1231 	if (np->frag_size < mtu) {
1232 		if (np->frag_size)
1233 			mtu = np->frag_size;
1234 	}
1235 	cork->base.fragsize = mtu;
1236 	if (dst_allfrag(rt->dst.path))
1237 		cork->base.flags |= IPCORK_ALLFRAG;
1238 	cork->base.length = 0;
1239 
1240 	return 0;
1241 }
1242 
1243 static int __ip6_append_data(struct sock *sk,
1244 			     struct flowi6 *fl6,
1245 			     struct sk_buff_head *queue,
1246 			     struct inet_cork *cork,
1247 			     struct inet6_cork *v6_cork,
1248 			     struct page_frag *pfrag,
1249 			     int getfrag(void *from, char *to, int offset,
1250 					 int len, int odd, struct sk_buff *skb),
1251 			     void *from, int length, int transhdrlen,
1252 			     unsigned int flags, int dontfrag)
1253 {
1254 	struct sk_buff *skb, *skb_prev = NULL;
1255 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1256 	int exthdrlen = 0;
1257 	int dst_exthdrlen = 0;
1258 	int hh_len;
1259 	int copy;
1260 	int err;
1261 	int offset = 0;
1262 	__u8 tx_flags = 0;
1263 	u32 tskey = 0;
1264 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1265 	struct ipv6_txoptions *opt = v6_cork->opt;
1266 	int csummode = CHECKSUM_NONE;
1267 
1268 	skb = skb_peek_tail(queue);
1269 	if (!skb) {
1270 		exthdrlen = opt ? opt->opt_flen : 0;
1271 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1272 	}
1273 
1274 	mtu = cork->fragsize;
1275 	orig_mtu = mtu;
1276 
1277 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1278 
1279 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1280 			(opt ? opt->opt_nflen : 0);
1281 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1282 		     sizeof(struct frag_hdr);
1283 
1284 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1285 		unsigned int maxnonfragsize, headersize;
1286 
1287 		headersize = sizeof(struct ipv6hdr) +
1288 			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1289 			     (dst_allfrag(&rt->dst) ?
1290 			      sizeof(struct frag_hdr) : 0) +
1291 			     rt->rt6i_nfheader_len;
1292 
1293 		if (ip6_sk_ignore_df(sk))
1294 			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1295 		else
1296 			maxnonfragsize = mtu;
1297 
1298 		/* dontfrag active */
1299 		if ((cork->length + length > mtu - headersize) && dontfrag &&
1300 		    (sk->sk_protocol == IPPROTO_UDP ||
1301 		     sk->sk_protocol == IPPROTO_RAW)) {
1302 			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1303 						   sizeof(struct ipv6hdr));
1304 			goto emsgsize;
1305 		}
1306 
1307 		if (cork->length + length > maxnonfragsize - headersize) {
1308 emsgsize:
1309 			ipv6_local_error(sk, EMSGSIZE, fl6,
1310 					 mtu - headersize +
1311 					 sizeof(struct ipv6hdr));
1312 			return -EMSGSIZE;
1313 		}
1314 	}
1315 
1316 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1317 		sock_tx_timestamp(sk, &tx_flags);
1318 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1319 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1320 			tskey = sk->sk_tskey++;
1321 	}
1322 
1323 	/* If this is the first and only packet and device
1324 	 * supports checksum offloading, let's use it.
1325 	 * Use transhdrlen, same as IPv4, because partial
1326 	 * sums only work when transhdrlen is set.
1327 	 */
1328 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1329 	    length + fragheaderlen < mtu &&
1330 	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
1331 	    !exthdrlen)
1332 		csummode = CHECKSUM_PARTIAL;
1333 	/*
1334 	 * Let's try using as much space as possible.
1335 	 * Use MTU if total length of the message fits into the MTU.
1336 	 * Otherwise, we need to reserve fragment header and
1337 	 * fragment alignment (= 8-15 octects, in total).
1338 	 *
1339 	 * Note that we may need to "move" the data from the tail of
1340 	 * of the buffer to the new fragment when we split
1341 	 * the message.
1342 	 *
1343 	 * FIXME: It may be fragmented into multiple chunks
1344 	 *        at once if non-fragmentable extension headers
1345 	 *        are too large.
1346 	 * --yoshfuji
1347 	 */
1348 
1349 	cork->length += length;
1350 	if (((length > mtu) ||
1351 	     (skb && skb_is_gso(skb))) &&
1352 	    (sk->sk_protocol == IPPROTO_UDP) &&
1353 	    (rt->dst.dev->features & NETIF_F_UFO) &&
1354 	    (sk->sk_type == SOCK_DGRAM)) {
1355 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1356 					  hh_len, fragheaderlen,
1357 					  transhdrlen, mtu, flags, fl6);
1358 		if (err)
1359 			goto error;
1360 		return 0;
1361 	}
1362 
1363 	if (!skb)
1364 		goto alloc_new_skb;
1365 
1366 	while (length > 0) {
1367 		/* Check if the remaining data fits into current packet. */
1368 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1369 		if (copy < length)
1370 			copy = maxfraglen - skb->len;
1371 
1372 		if (copy <= 0) {
1373 			char *data;
1374 			unsigned int datalen;
1375 			unsigned int fraglen;
1376 			unsigned int fraggap;
1377 			unsigned int alloclen;
1378 alloc_new_skb:
1379 			/* There's no room in the current skb */
1380 			if (skb)
1381 				fraggap = skb->len - maxfraglen;
1382 			else
1383 				fraggap = 0;
1384 			/* update mtu and maxfraglen if necessary */
1385 			if (!skb || !skb_prev)
1386 				ip6_append_data_mtu(&mtu, &maxfraglen,
1387 						    fragheaderlen, skb, rt,
1388 						    orig_mtu);
1389 
1390 			skb_prev = skb;
1391 
1392 			/*
1393 			 * If remaining data exceeds the mtu,
1394 			 * we know we need more fragment(s).
1395 			 */
1396 			datalen = length + fraggap;
1397 
1398 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1399 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1400 			if ((flags & MSG_MORE) &&
1401 			    !(rt->dst.dev->features&NETIF_F_SG))
1402 				alloclen = mtu;
1403 			else
1404 				alloclen = datalen + fragheaderlen;
1405 
1406 			alloclen += dst_exthdrlen;
1407 
1408 			if (datalen != length + fraggap) {
1409 				/*
1410 				 * this is not the last fragment, the trailer
1411 				 * space is regarded as data space.
1412 				 */
1413 				datalen += rt->dst.trailer_len;
1414 			}
1415 
1416 			alloclen += rt->dst.trailer_len;
1417 			fraglen = datalen + fragheaderlen;
1418 
1419 			/*
1420 			 * We just reserve space for fragment header.
1421 			 * Note: this may be overallocation if the message
1422 			 * (without MSG_MORE) fits into the MTU.
1423 			 */
1424 			alloclen += sizeof(struct frag_hdr);
1425 
1426 			if (transhdrlen) {
1427 				skb = sock_alloc_send_skb(sk,
1428 						alloclen + hh_len,
1429 						(flags & MSG_DONTWAIT), &err);
1430 			} else {
1431 				skb = NULL;
1432 				if (atomic_read(&sk->sk_wmem_alloc) <=
1433 				    2 * sk->sk_sndbuf)
1434 					skb = sock_wmalloc(sk,
1435 							   alloclen + hh_len, 1,
1436 							   sk->sk_allocation);
1437 				if (unlikely(!skb))
1438 					err = -ENOBUFS;
1439 			}
1440 			if (!skb)
1441 				goto error;
1442 			/*
1443 			 *	Fill in the control structures
1444 			 */
1445 			skb->protocol = htons(ETH_P_IPV6);
1446 			skb->ip_summed = csummode;
1447 			skb->csum = 0;
1448 			/* reserve for fragmentation and ipsec header */
1449 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1450 				    dst_exthdrlen);
1451 
1452 			/* Only the initial fragment is time stamped */
1453 			skb_shinfo(skb)->tx_flags = tx_flags;
1454 			tx_flags = 0;
1455 			skb_shinfo(skb)->tskey = tskey;
1456 			tskey = 0;
1457 
1458 			/*
1459 			 *	Find where to start putting bytes
1460 			 */
1461 			data = skb_put(skb, fraglen);
1462 			skb_set_network_header(skb, exthdrlen);
1463 			data += fragheaderlen;
1464 			skb->transport_header = (skb->network_header +
1465 						 fragheaderlen);
1466 			if (fraggap) {
1467 				skb->csum = skb_copy_and_csum_bits(
1468 					skb_prev, maxfraglen,
1469 					data + transhdrlen, fraggap, 0);
1470 				skb_prev->csum = csum_sub(skb_prev->csum,
1471 							  skb->csum);
1472 				data += fraggap;
1473 				pskb_trim_unique(skb_prev, maxfraglen);
1474 			}
1475 			copy = datalen - transhdrlen - fraggap;
1476 
1477 			if (copy < 0) {
1478 				err = -EINVAL;
1479 				kfree_skb(skb);
1480 				goto error;
1481 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1482 				err = -EFAULT;
1483 				kfree_skb(skb);
1484 				goto error;
1485 			}
1486 
1487 			offset += copy;
1488 			length -= datalen - fraggap;
1489 			transhdrlen = 0;
1490 			exthdrlen = 0;
1491 			dst_exthdrlen = 0;
1492 
1493 			/*
1494 			 * Put the packet on the pending queue
1495 			 */
1496 			__skb_queue_tail(queue, skb);
1497 			continue;
1498 		}
1499 
1500 		if (copy > length)
1501 			copy = length;
1502 
1503 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1504 			unsigned int off;
1505 
1506 			off = skb->len;
1507 			if (getfrag(from, skb_put(skb, copy),
1508 						offset, copy, off, skb) < 0) {
1509 				__skb_trim(skb, off);
1510 				err = -EFAULT;
1511 				goto error;
1512 			}
1513 		} else {
1514 			int i = skb_shinfo(skb)->nr_frags;
1515 
1516 			err = -ENOMEM;
1517 			if (!sk_page_frag_refill(sk, pfrag))
1518 				goto error;
1519 
1520 			if (!skb_can_coalesce(skb, i, pfrag->page,
1521 					      pfrag->offset)) {
1522 				err = -EMSGSIZE;
1523 				if (i == MAX_SKB_FRAGS)
1524 					goto error;
1525 
1526 				__skb_fill_page_desc(skb, i, pfrag->page,
1527 						     pfrag->offset, 0);
1528 				skb_shinfo(skb)->nr_frags = ++i;
1529 				get_page(pfrag->page);
1530 			}
1531 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1532 			if (getfrag(from,
1533 				    page_address(pfrag->page) + pfrag->offset,
1534 				    offset, copy, skb->len, skb) < 0)
1535 				goto error_efault;
1536 
1537 			pfrag->offset += copy;
1538 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1539 			skb->len += copy;
1540 			skb->data_len += copy;
1541 			skb->truesize += copy;
1542 			atomic_add(copy, &sk->sk_wmem_alloc);
1543 		}
1544 		offset += copy;
1545 		length -= copy;
1546 	}
1547 
1548 	return 0;
1549 
1550 error_efault:
1551 	err = -EFAULT;
1552 error:
1553 	cork->length -= length;
1554 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1555 	return err;
1556 }
1557 
1558 int ip6_append_data(struct sock *sk,
1559 		    int getfrag(void *from, char *to, int offset, int len,
1560 				int odd, struct sk_buff *skb),
1561 		    void *from, int length, int transhdrlen, int hlimit,
1562 		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1563 		    struct rt6_info *rt, unsigned int flags, int dontfrag)
1564 {
1565 	struct inet_sock *inet = inet_sk(sk);
1566 	struct ipv6_pinfo *np = inet6_sk(sk);
1567 	int exthdrlen;
1568 	int err;
1569 
1570 	if (flags&MSG_PROBE)
1571 		return 0;
1572 	if (skb_queue_empty(&sk->sk_write_queue)) {
1573 		/*
1574 		 * setup for corking
1575 		 */
1576 		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1577 				     tclass, opt, rt, fl6);
1578 		if (err)
1579 			return err;
1580 
1581 		exthdrlen = (opt ? opt->opt_flen : 0);
1582 		length += exthdrlen;
1583 		transhdrlen += exthdrlen;
1584 	} else {
1585 		fl6 = &inet->cork.fl.u.ip6;
1586 		transhdrlen = 0;
1587 	}
1588 
1589 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1590 				 &np->cork, sk_page_frag(sk), getfrag,
1591 				 from, length, transhdrlen, flags, dontfrag);
1592 }
1593 EXPORT_SYMBOL_GPL(ip6_append_data);
1594 
1595 static void ip6_cork_release(struct inet_cork_full *cork,
1596 			     struct inet6_cork *v6_cork)
1597 {
1598 	if (v6_cork->opt) {
1599 		kfree(v6_cork->opt->dst0opt);
1600 		kfree(v6_cork->opt->dst1opt);
1601 		kfree(v6_cork->opt->hopopt);
1602 		kfree(v6_cork->opt->srcrt);
1603 		kfree(v6_cork->opt);
1604 		v6_cork->opt = NULL;
1605 	}
1606 
1607 	if (cork->base.dst) {
1608 		dst_release(cork->base.dst);
1609 		cork->base.dst = NULL;
1610 		cork->base.flags &= ~IPCORK_ALLFRAG;
1611 	}
1612 	memset(&cork->fl, 0, sizeof(cork->fl));
1613 }
1614 
1615 struct sk_buff *__ip6_make_skb(struct sock *sk,
1616 			       struct sk_buff_head *queue,
1617 			       struct inet_cork_full *cork,
1618 			       struct inet6_cork *v6_cork)
1619 {
1620 	struct sk_buff *skb, *tmp_skb;
1621 	struct sk_buff **tail_skb;
1622 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1623 	struct ipv6_pinfo *np = inet6_sk(sk);
1624 	struct net *net = sock_net(sk);
1625 	struct ipv6hdr *hdr;
1626 	struct ipv6_txoptions *opt = v6_cork->opt;
1627 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1628 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1629 	unsigned char proto = fl6->flowi6_proto;
1630 
1631 	skb = __skb_dequeue(queue);
1632 	if (!skb)
1633 		goto out;
1634 	tail_skb = &(skb_shinfo(skb)->frag_list);
1635 
1636 	/* move skb->data to ip header from ext header */
1637 	if (skb->data < skb_network_header(skb))
1638 		__skb_pull(skb, skb_network_offset(skb));
1639 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1640 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1641 		*tail_skb = tmp_skb;
1642 		tail_skb = &(tmp_skb->next);
1643 		skb->len += tmp_skb->len;
1644 		skb->data_len += tmp_skb->len;
1645 		skb->truesize += tmp_skb->truesize;
1646 		tmp_skb->destructor = NULL;
1647 		tmp_skb->sk = NULL;
1648 	}
1649 
1650 	/* Allow local fragmentation. */
1651 	skb->ignore_df = ip6_sk_ignore_df(sk);
1652 
1653 	*final_dst = fl6->daddr;
1654 	__skb_pull(skb, skb_network_header_len(skb));
1655 	if (opt && opt->opt_flen)
1656 		ipv6_push_frag_opts(skb, opt, &proto);
1657 	if (opt && opt->opt_nflen)
1658 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1659 
1660 	skb_push(skb, sizeof(struct ipv6hdr));
1661 	skb_reset_network_header(skb);
1662 	hdr = ipv6_hdr(skb);
1663 
1664 	ip6_flow_hdr(hdr, v6_cork->tclass,
1665 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1666 					np->autoflowlabel, fl6));
1667 	hdr->hop_limit = v6_cork->hop_limit;
1668 	hdr->nexthdr = proto;
1669 	hdr->saddr = fl6->saddr;
1670 	hdr->daddr = *final_dst;
1671 
1672 	skb->priority = sk->sk_priority;
1673 	skb->mark = sk->sk_mark;
1674 
1675 	skb_dst_set(skb, dst_clone(&rt->dst));
1676 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1677 	if (proto == IPPROTO_ICMPV6) {
1678 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1679 
1680 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1681 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1682 	}
1683 
1684 	ip6_cork_release(cork, v6_cork);
1685 out:
1686 	return skb;
1687 }
1688 
1689 int ip6_send_skb(struct sk_buff *skb)
1690 {
1691 	struct net *net = sock_net(skb->sk);
1692 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1693 	int err;
1694 
1695 	err = ip6_local_out(skb);
1696 	if (err) {
1697 		if (err > 0)
1698 			err = net_xmit_errno(err);
1699 		if (err)
1700 			IP6_INC_STATS(net, rt->rt6i_idev,
1701 				      IPSTATS_MIB_OUTDISCARDS);
1702 	}
1703 
1704 	return err;
1705 }
1706 
/*
 * Build one skb from the socket's pending data via ip6_finish_skb() and
 * send it.  Returns 0 when ip6_finish_skb() yields no skb, otherwise the
 * result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *pending = ip6_finish_skb(sk);

	return pending ? ip6_send_skb(pending) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1718 
1719 static void __ip6_flush_pending_frames(struct sock *sk,
1720 				       struct sk_buff_head *queue,
1721 				       struct inet_cork_full *cork,
1722 				       struct inet6_cork *v6_cork)
1723 {
1724 	struct sk_buff *skb;
1725 
1726 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1727 		if (skb_dst(skb))
1728 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1729 				      IPSTATS_MIB_OUTDISCARDS);
1730 		kfree_skb(skb);
1731 	}
1732 
1733 	ip6_cork_release(cork, v6_cork);
1734 }
1735 
1736 void ip6_flush_pending_frames(struct sock *sk)
1737 {
1738 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1739 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1740 }
1741 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1742 
1743 struct sk_buff *ip6_make_skb(struct sock *sk,
1744 			     int getfrag(void *from, char *to, int offset,
1745 					 int len, int odd, struct sk_buff *skb),
1746 			     void *from, int length, int transhdrlen,
1747 			     int hlimit, int tclass,
1748 			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
1749 			     struct rt6_info *rt, unsigned int flags,
1750 			     int dontfrag)
1751 {
1752 	struct inet_cork_full cork;
1753 	struct inet6_cork v6_cork;
1754 	struct sk_buff_head queue;
1755 	int exthdrlen = (opt ? opt->opt_flen : 0);
1756 	int err;
1757 
1758 	if (flags & MSG_PROBE)
1759 		return NULL;
1760 
1761 	__skb_queue_head_init(&queue);
1762 
1763 	cork.base.flags = 0;
1764 	cork.base.addr = 0;
1765 	cork.base.opt = NULL;
1766 	v6_cork.opt = NULL;
1767 	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1768 	if (err)
1769 		return ERR_PTR(err);
1770 
1771 	if (dontfrag < 0)
1772 		dontfrag = inet6_sk(sk)->dontfrag;
1773 
1774 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1775 				&current->task_frag, getfrag, from,
1776 				length + exthdrlen, transhdrlen + exthdrlen,
1777 				flags, dontfrag);
1778 	if (err) {
1779 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1780 		return ERR_PTR(err);
1781 	}
1782 
1783 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1784 }
1785