xref: /linux/net/ipv6/ip6_output.c (revision 492c826b9facefa84995f4dea917e301b5ee0884)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO (network byte order)
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
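/*
 * __ip6_local_out - finish a locally generated packet
 *
 * Fills in payload_len (left at zero for jumbograms, whose payload
 * exceeds IPV6_MAXPLEN and is described by a jumbo payload option
 * instead) and runs the NF_INET_LOCAL_OUT netfilter hook.  A return
 * value of 1 means the hook accepted the packet and the caller must
 * continue with dst_output(), which is what ip6_local_out() does.
 */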
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 	int len;
64 
65 	len = skb->len - sizeof(struct ipv6hdr);
66 	if (len > IPV6_MAXPLEN)
67 		len = 0;
68 	ipv6_hdr(skb)->payload_len = htons(len);
69 
70 	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 		       skb_dst(skb)->dev, dst_output);
72 }
73 
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 	int err;
77 
78 	err = __ip6_local_out(skb);
79 	if (likely(err == 1))
80 		err = dst_output(skb);
81 
82 	return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85 
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89 	skb_reset_mac_header(newskb);
90 	__skb_pull(newskb, skb_network_offset(newskb));
91 	newskb->pkt_type = PACKET_LOOPBACK;
92 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 	WARN_ON(!skb_dst(newskb));
94 
95 	netif_rx_ni(newskb);
96 	return 0;
97 }
98 
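/*
 * ip6_finish_output2 - hand a packet to the link layer
 *
 * Multicast packets may first be looped back to local listeners: a
 * clone is sent through NF_INET_POST_ROUTING to
 * ip6_dev_loopback_xmit(), and the OUTMCAST counters are updated.
 * Delivery then uses the cached hardware header (dst->hh) when one
 * exists, otherwise the neighbour's output function; with neither
 * available the packet is dropped and OUTNOROUTES is bumped.
 */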
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101 	struct dst_entry *dst = skb_dst(skb);
102 	struct net_device *dev = dst->dev;
103 
104 	skb->protocol = htons(ETH_P_IPV6);
105 	skb->dev = dev;
106 
107 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
108 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
109 
110 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
111 		    ((mroute6_socket(dev_net(dev), skb) &&
112 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
113 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
114 					 &ipv6_hdr(skb)->saddr))) {
115 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
116 
117 			/* Do not check for IFF_ALLMULTI; multicast routing
118 			   is not supported in any case.
119 			 */
120 			if (newskb)
121 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
122 					newskb, NULL, newskb->dev,
123 					ip6_dev_loopback_xmit);
124 
125 			if (ipv6_hdr(skb)->hop_limit == 0) {
126 				IP6_INC_STATS(dev_net(dev), idev,
127 					      IPSTATS_MIB_OUTDISCARDS);
128 				kfree_skb(skb);
129 				return 0;
130 			}
131 		}
132 
133 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
134 				skb->len);
135 	}
136 
137 	if (dst->hh)
138 		return neigh_hh_output(dst->hh, skb);
139 	else if (dst->neighbour)
140 		return dst->neighbour->output(skb);
141 
142 	IP6_INC_STATS_BH(dev_net(dst->dev),
143 			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
144 	kfree_skb(skb);
145 	return -EINVAL;
146 }
147 
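/*
 * Fragment on output only when the packet exceeds the (socket-aware)
 * dst MTU and is not a GSO packet (GSO packets are segmented later,
 * closer to the device), or when the destination demands that every
 * packet carry a fragment header (dst_allfrag, set when the peer
 * reported a path MTU below IPV6_MIN_MTU).
 */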
148 static int ip6_finish_output(struct sk_buff *skb)
149 {
150 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
151 	    dst_allfrag(skb_dst(skb)))
152 		return ip6_fragment(skb, ip6_finish_output2);
153 	else
154 		return ip6_finish_output2(skb);
155 }
156 
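/*
 * ip6_output - common output entry point
 *
 * Discards the packet when IPv6 is administratively disabled on the
 * egress device, then runs NF_INET_POST_ROUTING (skipped for packets
 * netfilter has already rerouted, flagged IP6SKB_REROUTED) before
 * ip6_finish_output().
 */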
157 int ip6_output(struct sk_buff *skb)
158 {
159 	struct net_device *dev = skb_dst(skb)->dev;
160 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 	if (unlikely(idev->cnf.disable_ipv6)) {
162 		IP6_INC_STATS(dev_net(dev), idev,
163 			      IPSTATS_MIB_OUTDISCARDS);
164 		kfree_skb(skb);
165 		return 0;
166 	}
167 
168 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
169 			    ip6_finish_output,
170 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
171 }
172 
173 /*
174  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
175  */
176 
177 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
178 	     struct ipv6_txoptions *opt)
179 {
180 	struct net *net = sock_net(sk);
181 	struct ipv6_pinfo *np = inet6_sk(sk);
182 	struct in6_addr *first_hop = &fl6->daddr;
183 	struct dst_entry *dst = skb_dst(skb);
184 	struct ipv6hdr *hdr;
185 	u8  proto = fl6->flowi6_proto;
186 	int seg_len = skb->len;
187 	int hlimit = -1;
188 	int tclass = 0;
189 	u32 mtu;
190 
191 	if (opt) {
192 		unsigned int head_room;
193 
194 		/* First: exthdrs may take lots of space (~8K for now);
195 		   MAX_HEADER is not enough.
196 		 */
197 		head_room = opt->opt_nflen + opt->opt_flen;
198 		seg_len += head_room;
199 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
200 
201 		if (skb_headroom(skb) < head_room) {
202 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
203 			if (skb2 == NULL) {
204 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
205 					      IPSTATS_MIB_OUTDISCARDS);
206 				kfree_skb(skb);
207 				return -ENOBUFS;
208 			}
209 			kfree_skb(skb);
210 			skb = skb2;
211 			skb_set_owner_w(skb, sk);
212 		}
213 		if (opt->opt_flen)
214 			ipv6_push_frag_opts(skb, opt, &proto);
215 		if (opt->opt_nflen)
216 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
217 	}
218 
219 	skb_push(skb, sizeof(struct ipv6hdr));
220 	skb_reset_network_header(skb);
221 	hdr = ipv6_hdr(skb);
222 
223 	/*
224 	 *	Fill in the IPv6 header
225 	 */
226 	if (np) {
227 		tclass = np->tclass;
228 		hlimit = np->hop_limit;
229 	}
230 	if (hlimit < 0)
231 		hlimit = ip6_dst_hoplimit(dst);
232 
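	/*
	 * The first word packs version (6), traffic class and flow
	 * label: 0x6 << 28 | tclass << 20 | 20-bit flow label.
	 * fl6->flowlabel is already a big-endian field, so it is
	 * OR-ed in after the htonl().
	 */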
233 	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
234 
235 	hdr->payload_len = htons(seg_len);
236 	hdr->nexthdr = proto;
237 	hdr->hop_limit = hlimit;
238 
239 	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
240 	ipv6_addr_copy(&hdr->daddr, first_hop);
241 
242 	skb->priority = sk->sk_priority;
243 	skb->mark = sk->sk_mark;
244 
245 	mtu = dst_mtu(dst);
246 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
247 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
248 			      IPSTATS_MIB_OUT, skb->len);
249 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
250 			       dst->dev, dst_output);
251 	}
252 
253 	if (net_ratelimit())
254 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
255 	skb->dev = dst->dev;
256 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
257 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
258 	kfree_skb(skb);
259 	return -EMSGSIZE;
260 }
261 
262 EXPORT_SYMBOL(ip6_xmit);
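/*
 * A sketch of typical use by a connection-oriented transport (cf.
 * inet6_csk_xmit): fill in a flowi6 describing the connection, make
 * sure the skb has header room, then:
 *
 *	fl6.flowi6_proto = sk->sk_protocol;
 *	ipv6_addr_copy(&fl6.daddr, &np->daddr);
 *	...
 *	err = ip6_xmit(sk, skb, &fl6, np->opt);
 */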
263 
264 /*
265  *	To avoid extra problems, ND packets are sent through this
266  *	routine. It is code duplication, but we really want to avoid
267  *	extra checks, since ipv6_build_header is used by TCP (which
268  *	is performance-critical for us).
269  */
270 
271 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
272 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
273 	       int proto, int len)
274 {
275 	struct ipv6_pinfo *np = inet6_sk(sk);
276 	struct ipv6hdr *hdr;
277 
278 	skb->protocol = htons(ETH_P_IPV6);
279 	skb->dev = dev;
280 
281 	skb_reset_network_header(skb);
282 	skb_put(skb, sizeof(struct ipv6hdr));
283 	hdr = ipv6_hdr(skb);
284 
285 	*(__be32*)hdr = htonl(0x60000000);
286 
287 	hdr->payload_len = htons(len);
288 	hdr->nexthdr = proto;
289 	hdr->hop_limit = np->hop_limit;
290 
291 	ipv6_addr_copy(&hdr->saddr, saddr);
292 	ipv6_addr_copy(&hdr->daddr, daddr);
293 
294 	return 0;
295 }
296 
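/*
 * ip6_call_ra_chain - deliver a Router Alert packet to raw sockets
 *
 * Walks the global ip6_ra_chain of sockets registered through the
 * IPV6_ROUTER_ALERT socket option and delivers a clone to every
 * matching selector; the last match gets the original skb.  Returns
 * 1 when the packet was consumed, 0 otherwise.
 */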
297 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
298 {
299 	struct ip6_ra_chain *ra;
300 	struct sock *last = NULL;
301 
302 	read_lock(&ip6_ra_lock);
303 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
304 		struct sock *sk = ra->sk;
305 		if (sk && ra->sel == sel &&
306 		    (!sk->sk_bound_dev_if ||
307 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
308 			if (last) {
309 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
310 				if (skb2)
311 					rawv6_rcv(last, skb2);
312 			}
313 			last = sk;
314 		}
315 	}
316 
317 	if (last) {
318 		rawv6_rcv(last, skb);
319 		read_unlock(&ip6_ra_lock);
320 		return 1;
321 	}
322 	read_unlock(&ip6_ra_lock);
323 	return 0;
324 }
325 
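/*
 * Decide what to do with a forwarded packet whose destination is a
 * proxied (pneigh) address: returns 1 to hand it to local input
 * (unicast neighbour discovery), -1 when the sender has been
 * signalled and the packet must be dropped (link-local destination),
 * and 0 to forward it normally.
 */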
326 static int ip6_forward_proxy_check(struct sk_buff *skb)
327 {
328 	struct ipv6hdr *hdr = ipv6_hdr(skb);
329 	u8 nexthdr = hdr->nexthdr;
330 	int offset;
331 
332 	if (ipv6_ext_hdr(nexthdr)) {
333 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
334 		if (offset < 0)
335 			return 0;
336 	} else
337 		offset = sizeof(struct ipv6hdr);
338 
339 	if (nexthdr == IPPROTO_ICMPV6) {
340 		struct icmp6hdr *icmp6;
341 
342 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
343 					 offset + 1 - skb->data)))
344 			return 0;
345 
346 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
347 
348 		switch (icmp6->icmp6_type) {
349 		case NDISC_ROUTER_SOLICITATION:
350 		case NDISC_ROUTER_ADVERTISEMENT:
351 		case NDISC_NEIGHBOUR_SOLICITATION:
352 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
353 		case NDISC_REDIRECT:
354 			/* Unicast neighbour discovery messages destined
355 			 * to the proxied address must be passed to the
356 			 * input function.
357 			 */
358 			return 1;
359 		default:
360 			break;
361 		}
362 	}
363 
364 	/*
365 	 * The proxying router can't forward traffic sent to a link-local
366 	 * address, so signal the sender and discard the packet. This
367 	 * behavior is clarified by the MIPv6 specification.
368 	 */
369 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
370 		dst_link_failure(skb);
371 		return -1;
372 	}
373 
374 	return 0;
375 }
376 
377 static inline int ip6_forward_finish(struct sk_buff *skb)
378 {
379 	return dst_output(skb);
380 }
381 
382 int ip6_forward(struct sk_buff *skb)
383 {
384 	struct dst_entry *dst = skb_dst(skb);
385 	struct ipv6hdr *hdr = ipv6_hdr(skb);
386 	struct inet6_skb_parm *opt = IP6CB(skb);
387 	struct net *net = dev_net(dst->dev);
388 	u32 mtu;
389 
390 	if (net->ipv6.devconf_all->forwarding == 0)
391 		goto error;
392 
393 	if (skb_warn_if_lro(skb))
394 		goto drop;
395 
396 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
397 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
398 		goto drop;
399 	}
400 
401 	if (skb->pkt_type != PACKET_HOST)
402 		goto drop;
403 
404 	skb_forward_csum(skb);
405 
406 	/*
407 	 *	We do not do any processing on RA packets,
408 	 *	pushing them to user level AS IS, without any
409 	 *	guarantee that the application will be able to
410 	 *	interpret them. The reason is that we cannot
411 	 *	do anything clever here.
412 	 *
413 	 *	We are not the end node, so if the packet contains
414 	 *	AH/ESP we cannot do anything with it.
415 	 *	Defragmentation would also be a mistake; RA packets
416 	 *	must not be fragmented, because there is no guarantee
417 	 *	that different fragments will follow the same path. --ANK
418 	 */
419 	if (opt->ra) {
420 		u8 *ptr = skb_network_header(skb) + opt->ra;
421 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
422 			return 0;
423 	}
424 
425 	/*
426 	 *	check and decrement ttl
427 	 */
428 	if (hdr->hop_limit <= 1) {
429 		/* Force OUTPUT device used as source address */
430 		skb->dev = dst->dev;
431 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
432 		IP6_INC_STATS_BH(net,
433 				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
434 
435 		kfree_skb(skb);
436 		return -ETIMEDOUT;
437 	}
438 
439 	/* XXX: idev->cnf.proxy_ndp? */
440 	if (net->ipv6.devconf_all->proxy_ndp &&
441 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
442 		int proxied = ip6_forward_proxy_check(skb);
443 		if (proxied > 0)
444 			return ip6_input(skb);
445 		else if (proxied < 0) {
446 			IP6_INC_STATS(net, ip6_dst_idev(dst),
447 				      IPSTATS_MIB_INDISCARDS);
448 			goto drop;
449 		}
450 	}
451 
452 	if (!xfrm6_route_forward(skb)) {
453 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
454 		goto drop;
455 	}
456 	dst = skb_dst(skb);
457 
458 	/* IPv6 specs say nothing about it, but it is clear that we cannot
459 	   send redirects to source routed frames.
460 	   We don't send redirects to frames decapsulated from IPsec.
461 	 */
462 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
463 	    !skb_sec_path(skb)) {
464 		struct in6_addr *target = NULL;
465 		struct rt6_info *rt;
466 		struct neighbour *n = dst->neighbour;
467 
468 		/*
469 		 *	The incoming and outgoing devices are the same;
470 		 *	send a redirect.
471 		 */
472 
473 		rt = (struct rt6_info *) dst;
474 		if ((rt->rt6i_flags & RTF_GATEWAY))
475 			target = (struct in6_addr*)&n->primary_key;
476 		else
477 			target = &hdr->daddr;
478 
479 		if (!rt->rt6i_peer)
480 			rt6_bind_peer(rt, 1);
481 
482 		/* Limit redirects both by destination (here)
483 		   and by source (inside ndisc_send_redirect)
484 		 */
485 		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
486 			ndisc_send_redirect(skb, n, target);
487 	} else {
488 		int addrtype = ipv6_addr_type(&hdr->saddr);
489 
490 		/* This check is security critical. */
491 		if (addrtype == IPV6_ADDR_ANY ||
492 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493 			goto error;
494 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 				    ICMPV6_NOT_NEIGHBOUR, 0);
497 			goto error;
498 		}
499 	}
500 
501 	mtu = dst_mtu(dst);
502 	if (mtu < IPV6_MIN_MTU)
503 		mtu = IPV6_MIN_MTU;
504 
505 	if (skb->len > mtu && !skb_is_gso(skb)) {
506 		/* Again, force the OUTPUT device to supply the ICMP error's source address */
507 		skb->dev = dst->dev;
508 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 		IP6_INC_STATS_BH(net,
510 				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511 		IP6_INC_STATS_BH(net,
512 				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
513 		kfree_skb(skb);
514 		return -EMSGSIZE;
515 	}
516 
517 	if (skb_cow(skb, dst->dev->hard_header_len)) {
518 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
519 		goto drop;
520 	}
521 
522 	hdr = ipv6_hdr(skb);
523 
524 	/* Decrementing the hop limit is delayed until after the skb COW */
525 
526 	hdr->hop_limit--;
527 
528 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
530 		       ip6_forward_finish);
531 
532 error:
533 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
534 drop:
535 	kfree_skb(skb);
536 	return -EINVAL;
537 }
538 
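/*
 * Copy per-packet metadata (packet type, priority, protocol, dst,
 * device, mark, and netfilter/security state) from the original skb
 * to a freshly built fragment.
 */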
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
540 {
541 	to->pkt_type = from->pkt_type;
542 	to->priority = from->priority;
543 	to->protocol = from->protocol;
544 	skb_dst_drop(to);
545 	skb_dst_set(to, dst_clone(skb_dst(from)));
546 	to->dev = from->dev;
547 	to->mark = from->mark;
548 
549 #ifdef CONFIG_NET_SCHED
550 	to->tc_index = from->tc_index;
551 #endif
552 	nf_copy(to, from);
553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
554     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
555 	to->nf_trace = from->nf_trace;
556 #endif
557 	skb_copy_secmark(to, from);
558 }
559 
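/*
 * ip6_find_1stfragopt - find where the fragmentable part begins
 *
 * Per RFC 2460, the unfragmentable part is the IPv6 header plus any
 * extension headers up to and including a routing header (a
 * destination options header carrying a Mobile IPv6 home address
 * option also stays in front).  Returns that length and leaves
 * *nexthdr pointing at the nexthdr byte to be rewritten to
 * NEXTHDR_FRAGMENT.
 */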
560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
561 {
562 	u16 offset = sizeof(struct ipv6hdr);
563 	struct ipv6_opt_hdr *exthdr =
564 				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
565 	unsigned int packet_len = skb->tail - skb->network_header;
566 	int found_rhdr = 0;
567 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
568 
569 	while (offset + 1 <= packet_len) {
570 
571 		switch (**nexthdr) {
572 
573 		case NEXTHDR_HOP:
574 			break;
575 		case NEXTHDR_ROUTING:
576 			found_rhdr = 1;
577 			break;
578 		case NEXTHDR_DEST:
579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
580 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
581 				break;
582 #endif
583 			if (found_rhdr)
584 				return offset;
585 			break;
586 		default:
587 			return offset;
588 		}
589 
590 		offset += ipv6_optlen(exthdr);
591 		*nexthdr = &exthdr->nexthdr;
592 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
593 						 offset);
594 	}
595 
596 	return offset;
597 }
598 
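/*
 * ip6_fragment - split a too-big packet into fragments
 *
 * Fast path: when the packet already carries a frag_list of suitably
 * sized, unshared skbs, only a fragment header is prepended to each
 * piece.  Otherwise the slow path allocates a fresh skb per fragment
 * and copies the data.  Every fragment is handed to @output as soon
 * as it is built.
 */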
599 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
600 {
601 	struct sk_buff *frag;
602 	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
603 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
604 	struct ipv6hdr *tmp_hdr;
605 	struct frag_hdr *fh;
606 	unsigned int mtu, hlen, left, len;
607 	__be32 frag_id = 0;
608 	int ptr, offset = 0, err = 0;
609 	u8 *prevhdr, nexthdr = 0;
610 	struct net *net = dev_net(skb_dst(skb)->dev);
611 
612 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
613 	nexthdr = *prevhdr;
614 
615 	mtu = ip6_skb_dst_mtu(skb);
616 
617 	/* We must not fragment if the socket is set to force MTU discovery
618 	 * or if the skb was not generated by a local socket.
619 	 */
620 	if (!skb->local_df && skb->len > mtu) {
621 		skb->dev = skb_dst(skb)->dev;
622 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
623 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
624 			      IPSTATS_MIB_FRAGFAILS);
625 		kfree_skb(skb);
626 		return -EMSGSIZE;
627 	}
628 
629 	if (np && np->frag_size < mtu) {
630 		if (np->frag_size)
631 			mtu = np->frag_size;
632 	}
633 	mtu -= hlen + sizeof(struct frag_hdr);
634 
635 	if (skb_has_frag_list(skb)) {
636 		int first_len = skb_pagelen(skb);
637 		struct sk_buff *frag2;
638 
639 		if (first_len - hlen > mtu ||
640 		    ((first_len - hlen) & 7) ||
641 		    skb_cloned(skb))
642 			goto slow_path;
643 
644 		skb_walk_frags(skb, frag) {
645 			/* Correct geometry. */
646 			if (frag->len > mtu ||
647 			    ((frag->len & 7) && frag->next) ||
648 			    skb_headroom(frag) < hlen)
649 				goto slow_path_clean;
650 
651 			/* Partially cloned skb? */
652 			if (skb_shared(frag))
653 				goto slow_path_clean;
654 
655 			BUG_ON(frag->sk);
656 			if (skb->sk) {
657 				frag->sk = skb->sk;
658 				frag->destructor = sock_wfree;
659 			}
660 			skb->truesize -= frag->truesize;
661 		}
662 
663 		err = 0;
664 		offset = 0;
665 		frag = skb_shinfo(skb)->frag_list;
666 		skb_frag_list_init(skb);
667 		/* BUILD HEADER */
668 
669 		*prevhdr = NEXTHDR_FRAGMENT;
670 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
671 		if (!tmp_hdr) {
672 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
673 				      IPSTATS_MIB_FRAGFAILS);
674 			return -ENOMEM;
675 		}
676 
677 		__skb_pull(skb, hlen);
678 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
679 		__skb_push(skb, hlen);
680 		skb_reset_network_header(skb);
681 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
682 
683 		ipv6_select_ident(fh);
684 		fh->nexthdr = nexthdr;
685 		fh->reserved = 0;
686 		fh->frag_off = htons(IP6_MF);
687 		frag_id = fh->identification;
688 
689 		first_len = skb_pagelen(skb);
690 		skb->data_len = first_len - skb_headlen(skb);
691 		skb->len = first_len;
692 		ipv6_hdr(skb)->payload_len = htons(first_len -
693 						   sizeof(struct ipv6hdr));
694 
695 		dst_hold(&rt->dst);
696 
697 		for (;;) {
698 			/* Prepare the header of the next frame,
699 			 * before the previous one goes down. */
700 			if (frag) {
701 				frag->ip_summed = CHECKSUM_NONE;
702 				skb_reset_transport_header(frag);
703 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
704 				__skb_push(frag, hlen);
705 				skb_reset_network_header(frag);
706 				memcpy(skb_network_header(frag), tmp_hdr,
707 				       hlen);
708 				offset += skb->len - hlen - sizeof(struct frag_hdr);
709 				fh->nexthdr = nexthdr;
710 				fh->reserved = 0;
711 				fh->frag_off = htons(offset);
712 				if (frag->next != NULL)
713 					fh->frag_off |= htons(IP6_MF);
714 				fh->identification = frag_id;
715 				ipv6_hdr(frag)->payload_len =
716 						htons(frag->len -
717 						      sizeof(struct ipv6hdr));
718 				ip6_copy_metadata(frag, skb);
719 			}
720 
721 			err = output(skb);
722 			if (!err)
723 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
724 					      IPSTATS_MIB_FRAGCREATES);
725 
726 			if (err || !frag)
727 				break;
728 
729 			skb = frag;
730 			frag = skb->next;
731 			skb->next = NULL;
732 		}
733 
734 		kfree(tmp_hdr);
735 
736 		if (err == 0) {
737 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
738 				      IPSTATS_MIB_FRAGOKS);
739 			dst_release(&rt->dst);
740 			return 0;
741 		}
742 
743 		while (frag) {
744 			skb = frag->next;
745 			kfree_skb(frag);
746 			frag = skb;
747 		}
748 
749 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
750 			      IPSTATS_MIB_FRAGFAILS);
751 		dst_release(&rt->dst);
752 		return err;
753 
754 slow_path_clean:
755 		skb_walk_frags(skb, frag2) {
756 			if (frag2 == frag)
757 				break;
758 			frag2->sk = NULL;
759 			frag2->destructor = NULL;
760 			skb->truesize += frag2->truesize;
761 		}
762 	}
763 
764 slow_path:
765 	left = skb->len - hlen;		/* Space per frame */
766 	ptr = hlen;			/* Where to start from */
767 
768 	/*
769 	 *	Fragment the datagram.
770 	 */
771 
772 	*prevhdr = NEXTHDR_FRAGMENT;
773 
774 	/*
775 	 *	Keep copying data until we run out.
776 	 */
777 	while (left > 0) {
778 		len = left;
779 		/* IF: it doesn't fit, use 'mtu' - the data space left */
780 		if (len > mtu)
781 			len = mtu;
782 		/* IF: we are not sending up to and including the packet end,
783 		   then align the next start on an eight-byte boundary */
784 		if (len < left)	{
785 			len &= ~7;
786 		}
787 		/*
788 		 *	Allocate buffer.
789 		 */
790 
791 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
792 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
793 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
794 				      IPSTATS_MIB_FRAGFAILS);
795 			err = -ENOMEM;
796 			goto fail;
797 		}
798 
799 		/*
800 		 *	Set up data on packet
801 		 */
802 
803 		ip6_copy_metadata(frag, skb);
804 		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
805 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
806 		skb_reset_network_header(frag);
807 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
808 		frag->transport_header = (frag->network_header + hlen +
809 					  sizeof(struct frag_hdr));
810 
811 		/*
812 		 *	Charge the memory for the fragment to any owner
813 		 *	it might possess
814 		 */
815 		if (skb->sk)
816 			skb_set_owner_w(frag, skb->sk);
817 
818 		/*
819 		 *	Copy the packet header into the new buffer.
820 		 */
821 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
822 
823 		/*
824 		 *	Build fragment header.
825 		 */
826 		fh->nexthdr = nexthdr;
827 		fh->reserved = 0;
828 		if (!frag_id) {
829 			ipv6_select_ident(fh);
830 			frag_id = fh->identification;
831 		} else
832 			fh->identification = frag_id;
833 
834 		/*
835 		 *	Copy a block of the IP datagram.
836 		 */
837 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
838 			BUG();
839 		left -= len;
840 
841 		fh->frag_off = htons(offset);
842 		if (left > 0)
843 			fh->frag_off |= htons(IP6_MF);
844 		ipv6_hdr(frag)->payload_len = htons(frag->len -
845 						    sizeof(struct ipv6hdr));
846 
847 		ptr += len;
848 		offset += len;
849 
850 		/*
851 		 *	Put this fragment into the sending queue.
852 		 */
853 		err = output(frag);
854 		if (err)
855 			goto fail;
856 
857 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
858 			      IPSTATS_MIB_FRAGCREATES);
859 	}
860 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
861 		      IPSTATS_MIB_FRAGOKS);
862 	kfree_skb(skb);
863 	return err;
864 
865 fail:
866 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
867 		      IPSTATS_MIB_FRAGFAILS);
868 	kfree_skb(skb);
869 	return err;
870 }
871 
872 static inline int ip6_rt_check(const struct rt6key *rt_key,
873 			       const struct in6_addr *fl_addr,
874 			       const struct in6_addr *addr_cache)
875 {
876 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
877 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
878 }
879 
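/*
 * Validate a dst cached on the socket against the flow about to be
 * sent; the dst is released and NULL returned when the destination,
 * the source subtree or the outgoing interface no longer match.
 */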
880 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
881 					  struct dst_entry *dst,
882 					  const struct flowi6 *fl6)
883 {
884 	struct ipv6_pinfo *np = inet6_sk(sk);
885 	struct rt6_info *rt = (struct rt6_info *)dst;
886 
887 	if (!dst)
888 		goto out;
889 
890 	/* Yes, checking route validity in the unconnected
891 	 * case is not very simple. Take into account
892 	 * that we do not support routing by source, TOS,
893 	 * or MSG_DONTROUTE 		--ANK (980726)
894 	 *
895 	 * 1. ip6_rt_check(): If the route was a host route,
896 	 *    check that the cached destination is current.
897 	 *    If it is a network route, we still may
898 	 *    check its validity using a saved pointer
899 	 *    to the last used address: daddr_cache.
900 	 *    We do not want to save the whole address now
901 	 *    (because the main consumer of this service
902 	 *    is tcp, which does not have this problem),
903 	 *    so this last trick works only on connected
904 	 *    sockets.
905 	 * 2. oif should also be the same.
906 	 */
907 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
908 #ifdef CONFIG_IPV6_SUBTREES
909 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
910 #endif
911 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
912 		dst_release(dst);
913 		dst = NULL;
914 	}
915 
916 out:
917 	return dst;
918 }
919 
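/*
 * Common tail of the dst lookup helpers below: performs the route
 * lookup when no dst was supplied, selects a source address when the
 * flow lacks one and, with CONFIG_IPV6_OPTIMISTIC_DAD, retargets the
 * lookup at the default router while the chosen source address is
 * still optimistic.
 */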
920 static int ip6_dst_lookup_tail(struct sock *sk,
921 			       struct dst_entry **dst, struct flowi6 *fl6)
922 {
923 	int err;
924 	struct net *net = sock_net(sk);
925 
926 	if (*dst == NULL)
927 		*dst = ip6_route_output(net, sk, fl6);
928 
929 	if ((err = (*dst)->error))
930 		goto out_err_release;
931 
932 	if (ipv6_addr_any(&fl6->saddr)) {
933 		struct rt6_info *rt = (struct rt6_info *) *dst;
934 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
935 					  sk ? inet6_sk(sk)->srcprefs : 0,
936 					  &fl6->saddr);
937 		if (err)
938 			goto out_err_release;
939 	}
940 
941 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
942 	/*
943 	 * Here if the dst entry we've looked up
944 	 * has a neighbour entry that is in the INCOMPLETE
945 	 * state and the src address from the flow is
946 	 * marked as OPTIMISTIC, we release the found
947 	 * dst entry and replace it with the
948 	 * dst entry of the nexthop router.
949 	 */
950 	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
951 		struct inet6_ifaddr *ifp;
952 		struct flowi6 fl_gw6;
953 		int redirect;
954 
955 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
956 				      (*dst)->dev, 1);
957 
958 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
959 		if (ifp)
960 			in6_ifa_put(ifp);
961 
962 		if (redirect) {
963 			/*
964 			 * We need to get the dst entry for the
965 			 * default router instead
966 			 */
967 			dst_release(*dst);
968 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
969 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
970 			*dst = ip6_route_output(net, sk, &fl_gw6);
971 			if ((err = (*dst)->error))
972 				goto out_err_release;
973 		}
974 	}
975 #endif
976 
977 	return 0;
978 
979 out_err_release:
980 	if (err == -ENETUNREACH)
981 		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
982 	dst_release(*dst);
983 	*dst = NULL;
984 	return err;
985 }
986 
987 /**
988  *	ip6_dst_lookup - perform route lookup on flow
989  *	@sk: socket which provides route info
990  *	@dst: pointer to dst_entry * for result
991  *	@fl6: flow to lookup
992  *
993  *	This function performs a route lookup on the given flow.
994  *
995  *	It returns zero on success, or a standard errno code on error.
996  */
997 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
998 {
999 	*dst = NULL;
1000 	return ip6_dst_lookup_tail(sk, dst, fl6);
1001 }
1002 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1003 
1004 /**
1005  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1006  *	@sk: socket which provides route info
1007  *	@fl6: flow to lookup
1008  *	@final_dst: final destination address for ipsec lookup
1009  *	@can_sleep: we are in a sleepable context
1010  *
1011  *	This function performs a route lookup on the given flow.
1012  *
1013  *	It returns a valid dst pointer on success, or a pointer encoded
1014  *	error code.
1015  */
1016 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1017 				      const struct in6_addr *final_dst,
1018 				      bool can_sleep)
1019 {
1020 	struct dst_entry *dst = NULL;
1021 	int err;
1022 
1023 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1024 	if (err)
1025 		return ERR_PTR(err);
1026 	if (final_dst)
1027 		ipv6_addr_copy(&fl6->daddr, final_dst);
1028 	if (can_sleep)
1029 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1030 
1031 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1032 }
1033 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1034 
1035 /**
1036  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1037  *	@sk: socket which provides the dst cache and route info
1038  *	@fl6: flow to lookup
1039  *	@final_dst: final destination address for ipsec lookup
1040  *	@can_sleep: we are in a sleepable context
1041  *
1042  *	This function performs a route lookup on the given flow with the
1043  *	possibility of using the cached route in the socket if it is valid.
1044  *	It will take the socket dst lock when operating on the dst cache.
1045  *	As a result, this function can only be used in process context.
1046  *
1047  *	It returns a valid dst pointer on success, or a pointer encoded
1048  *	error code.
1049  */
1050 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1051 					 const struct in6_addr *final_dst,
1052 					 bool can_sleep)
1053 {
1054 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1055 	int err;
1056 
1057 	dst = ip6_sk_dst_check(sk, dst, fl6);
1058 
1059 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1060 	if (err)
1061 		return ERR_PTR(err);
1062 	if (final_dst)
1063 		ipv6_addr_copy(&fl6->daddr, final_dst);
1064 	if (can_sleep)
1065 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1066 
1067 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1068 }
1069 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1070 
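/*
 * Build one large skb for UDP fragmentation offload: the device (or
 * software GSO) later slices it into fragments using the gso_size
 * and ip6_frag_id computed here.
 */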
1071 static inline int ip6_ufo_append_data(struct sock *sk,
1072 			int getfrag(void *from, char *to, int offset, int len,
1073 			int odd, struct sk_buff *skb),
1074 			void *from, int length, int hh_len, int fragheaderlen,
1075 			int transhdrlen, int mtu, unsigned int flags)
1076 
1077 {
1078 	struct sk_buff *skb;
1079 	int err;
1080 
1081 	/* The network device supports UDP large send offload,
1082 	 * so create one single skb packet containing the complete
1083 	 * udp datagram.
1084 	 */
1085 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1086 		skb = sock_alloc_send_skb(sk,
1087 			hh_len + fragheaderlen + transhdrlen + 20,
1088 			(flags & MSG_DONTWAIT), &err);
1089 		if (skb == NULL)
1090 			return -ENOMEM;
1091 
1092 		/* reserve space for Hardware header */
1093 		skb_reserve(skb, hh_len);
1094 
1095 		/* create space for UDP/IP header */
1096 		skb_put(skb, fragheaderlen + transhdrlen);
1097 
1098 		/* initialize network header pointer */
1099 		skb_reset_network_header(skb);
1100 
1101 		/* initialize protocol header pointer */
1102 		skb->transport_header = skb->network_header + fragheaderlen;
1103 
1104 		skb->ip_summed = CHECKSUM_PARTIAL;
1105 		skb->csum = 0;
1106 	}
1107 
1108 	err = skb_append_datato_frags(sk, skb, getfrag, from,
1109 				      (length - transhdrlen));
1110 	if (!err) {
1111 		struct frag_hdr fhdr;
1112 
1113 		/* Specify the length of each IPv6 datagram fragment.
1114 		 * It has to be a multiple of 8.
1115 		 */
1116 		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1117 					     sizeof(struct frag_hdr)) & ~7;
1118 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1119 		ipv6_select_ident(&fhdr);
1120 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1121 		__skb_queue_tail(&sk->sk_write_queue, skb);
1122 
1123 		return 0;
1124 	}
1125 	/* There is not enough support to do UDP LSO,
1126 	 * so follow the normal path.
1127 	 */
1128 	kfree_skb(skb);
1129 
1130 	return err;
1131 }
1132 
1133 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1134 					       gfp_t gfp)
1135 {
1136 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1137 }
1138 
1139 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1140 						gfp_t gfp)
1141 {
1142 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1143 }
1144 
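/*
 * ip6_append_data - append data to the pending (corked) output queue
 *
 * The first call sets up the cork state (duplicated options, route
 * and fragment size); later calls reuse it.  Data is laid out in
 * mtu-sized skbs (or page frags on NETIF_F_SG devices) so that
 * ip6_push_pending_frames() can transmit the queue, already cut on
 * fragment boundaries, as a single datagram.
 */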
1145 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1146 	int offset, int len, int odd, struct sk_buff *skb),
1147 	void *from, int length, int transhdrlen,
1148 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1149 	struct rt6_info *rt, unsigned int flags, int dontfrag)
1150 {
1151 	struct inet_sock *inet = inet_sk(sk);
1152 	struct ipv6_pinfo *np = inet6_sk(sk);
1153 	struct inet_cork *cork;
1154 	struct sk_buff *skb;
1155 	unsigned int maxfraglen, fragheaderlen;
1156 	int exthdrlen;
1157 	int hh_len;
1158 	int mtu;
1159 	int copy;
1160 	int err;
1161 	int offset = 0;
1162 	int csummode = CHECKSUM_NONE;
1163 	__u8 tx_flags = 0;
1164 
1165 	if (flags & MSG_PROBE)
1166 		return 0;
1167 	cork = &inet->cork.base;
1168 	if (skb_queue_empty(&sk->sk_write_queue)) {
1169 		/*
1170 		 * setup for corking
1171 		 */
1172 		if (opt) {
1173 			if (WARN_ON(np->cork.opt))
1174 				return -EINVAL;
1175 
1176 			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1177 			if (unlikely(np->cork.opt == NULL))
1178 				return -ENOBUFS;
1179 
1180 			np->cork.opt->tot_len = opt->tot_len;
1181 			np->cork.opt->opt_flen = opt->opt_flen;
1182 			np->cork.opt->opt_nflen = opt->opt_nflen;
1183 
1184 			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1185 							    sk->sk_allocation);
1186 			if (opt->dst0opt && !np->cork.opt->dst0opt)
1187 				return -ENOBUFS;
1188 
1189 			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1190 							    sk->sk_allocation);
1191 			if (opt->dst1opt && !np->cork.opt->dst1opt)
1192 				return -ENOBUFS;
1193 
1194 			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1195 							   sk->sk_allocation);
1196 			if (opt->hopopt && !np->cork.opt->hopopt)
1197 				return -ENOBUFS;
1198 
1199 			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1200 							    sk->sk_allocation);
1201 			if (opt->srcrt && !np->cork.opt->srcrt)
1202 				return -ENOBUFS;
1203 
1204 			/* need source address above. --miyazawa */
1205 		}
1206 		dst_hold(&rt->dst);
1207 		cork->dst = &rt->dst;
1208 		inet->cork.fl.u.ip6 = *fl6;
1209 		np->cork.hop_limit = hlimit;
1210 		np->cork.tclass = tclass;
1211 		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1212 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1213 		if (np->frag_size < mtu) {
1214 			if (np->frag_size)
1215 				mtu = np->frag_size;
1216 		}
1217 		cork->fragsize = mtu;
1218 		if (dst_allfrag(rt->dst.path))
1219 			cork->flags |= IPCORK_ALLFRAG;
1220 		cork->length = 0;
1221 		sk->sk_sndmsg_page = NULL;
1222 		sk->sk_sndmsg_off = 0;
1223 		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
1224 			    rt->rt6i_nfheader_len;
1225 		length += exthdrlen;
1226 		transhdrlen += exthdrlen;
1227 	} else {
1228 		rt = (struct rt6_info *)cork->dst;
1229 		fl6 = &inet->cork.fl.u.ip6;
1230 		opt = np->cork.opt;
1231 		transhdrlen = 0;
1232 		exthdrlen = 0;
1233 		mtu = cork->fragsize;
1234 	}
1235 
1236 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1237 
1238 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1239 			(opt ? opt->opt_nflen : 0);
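	/*
	 * maxfraglen is the largest per-skb length that keeps the
	 * fragmentable part a multiple of eight octets while reserving
	 * room for the fragment header; e.g. mtu 1500 and
	 * fragheaderlen 40 give ((1500 - 40) & ~7) + 40 - 8 = 1488.
	 */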
1240 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1241 
1242 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1243 		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1244 			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1245 			return -EMSGSIZE;
1246 		}
1247 	}
1248 
1249 	/* For UDP, check if TX timestamp is enabled */
1250 	if (sk->sk_type == SOCK_DGRAM) {
1251 		err = sock_tx_timestamp(sk, &tx_flags);
1252 		if (err)
1253 			goto error;
1254 	}
1255 
1256 	/*
1257 	 * Let's try using as much space as possible.
1258 	 * Use MTU if total length of the message fits into the MTU.
1259 	 * Otherwise, we need to reserve fragment header and
1260 	 * fragment alignment (= 8-15 octets, in total).
1261 	 *
1262 	 * Note that we may need to "move" the data from the tail
1263 	 * of the buffer to the new fragment when we split
1264 	 * the message.
1265 	 *
1266 	 * FIXME: It may be fragmented into multiple chunks
1267 	 *        at once if non-fragmentable extension headers
1268 	 *        are too large.
1269 	 * --yoshfuji
1270 	 */
1271 
1272 	cork->length += length;
1273 	if (length > mtu) {
1274 		int proto = sk->sk_protocol;
1275 		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
1276 			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1277 			return -EMSGSIZE;
1278 		}
1279 
1280 		if (proto == IPPROTO_UDP &&
1281 		    (rt->dst.dev->features & NETIF_F_UFO)) {
1282 
1283 			err = ip6_ufo_append_data(sk, getfrag, from, length,
1284 						  hh_len, fragheaderlen,
1285 						  transhdrlen, mtu, flags);
1286 			if (err)
1287 				goto error;
1288 			return 0;
1289 		}
1290 	}
1291 
1292 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1293 		goto alloc_new_skb;
1294 
1295 	while (length > 0) {
1296 		/* Check if the remaining data fits into current packet. */
1297 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1298 		if (copy < length)
1299 			copy = maxfraglen - skb->len;
1300 
1301 		if (copy <= 0) {
1302 			char *data;
1303 			unsigned int datalen;
1304 			unsigned int fraglen;
1305 			unsigned int fraggap;
1306 			unsigned int alloclen;
1307 			struct sk_buff *skb_prev;
1308 alloc_new_skb:
1309 			skb_prev = skb;
1310 
1311 			/* There's no room in the current skb */
1312 			if (skb_prev)
1313 				fraggap = skb_prev->len - maxfraglen;
1314 			else
1315 				fraggap = 0;
1316 
1317 			/*
1318 			 * If remaining data exceeds the mtu,
1319 			 * we know we need more fragment(s).
1320 			 */
1321 			datalen = length + fraggap;
1322 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1323 				datalen = maxfraglen - fragheaderlen;
1324 
1325 			fraglen = datalen + fragheaderlen;
1326 			if ((flags & MSG_MORE) &&
1327 			    !(rt->dst.dev->features & NETIF_F_SG))
1328 				alloclen = mtu;
1329 			else
1330 				alloclen = datalen + fragheaderlen;
1331 
1332 			/*
1333 			 * The last fragment gets additional space at tail.
1334 			 * Note: we overallocate on fragments with MSG_MORE
1335 			 * because we have no idea if we're the last one.
1336 			 */
1337 			if (datalen == length + fraggap)
1338 				alloclen += rt->dst.trailer_len;
1339 
1340 			/*
1341 			 * We just reserve space for the fragment header.
1342 			 * Note: this may be overallocation if the message
1343 			 * (without MSG_MORE) fits into the MTU.
1344 			 */
1345 			alloclen += sizeof(struct frag_hdr);
1346 
1347 			if (transhdrlen) {
1348 				skb = sock_alloc_send_skb(sk,
1349 						alloclen + hh_len,
1350 						(flags & MSG_DONTWAIT), &err);
1351 			} else {
1352 				skb = NULL;
1353 				if (atomic_read(&sk->sk_wmem_alloc) <=
1354 				    2 * sk->sk_sndbuf)
1355 					skb = sock_wmalloc(sk,
1356 							   alloclen + hh_len, 1,
1357 							   sk->sk_allocation);
1358 				if (unlikely(skb == NULL))
1359 					err = -ENOBUFS;
1360 				else {
1361 					/* Only the initial fragment
1362 					 * is time stamped.
1363 					 */
1364 					tx_flags = 0;
1365 				}
1366 			}
1367 			if (skb == NULL)
1368 				goto error;
1369 			/*
1370 			 *	Fill in the control structures
1371 			 */
1372 			skb->ip_summed = csummode;
1373 			skb->csum = 0;
1374 			/* reserve for fragmentation */
1375 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1376 
1377 			if (sk->sk_type == SOCK_DGRAM)
1378 				skb_shinfo(skb)->tx_flags = tx_flags;
1379 
1380 			/*
1381 			 *	Find where to start putting bytes
1382 			 */
1383 			data = skb_put(skb, fraglen);
1384 			skb_set_network_header(skb, exthdrlen);
1385 			data += fragheaderlen;
1386 			skb->transport_header = (skb->network_header +
1387 						 fragheaderlen);
1388 			if (fraggap) {
1389 				skb->csum = skb_copy_and_csum_bits(
1390 					skb_prev, maxfraglen,
1391 					data + transhdrlen, fraggap, 0);
1392 				skb_prev->csum = csum_sub(skb_prev->csum,
1393 							  skb->csum);
1394 				data += fraggap;
1395 				pskb_trim_unique(skb_prev, maxfraglen);
1396 			}
1397 			copy = datalen - transhdrlen - fraggap;
1398 			if (copy < 0) {
1399 				err = -EINVAL;
1400 				kfree_skb(skb);
1401 				goto error;
1402 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1403 				err = -EFAULT;
1404 				kfree_skb(skb);
1405 				goto error;
1406 			}
1407 
1408 			offset += copy;
1409 			length -= datalen - fraggap;
1410 			transhdrlen = 0;
1411 			exthdrlen = 0;
1412 			csummode = CHECKSUM_NONE;
1413 
1414 			/*
1415 			 * Put the packet on the pending queue
1416 			 */
1417 			__skb_queue_tail(&sk->sk_write_queue, skb);
1418 			continue;
1419 		}
1420 
1421 		if (copy > length)
1422 			copy = length;
1423 
1424 		if (!(rt->dst.dev->features & NETIF_F_SG)) {
1425 			unsigned int off;
1426 
1427 			off = skb->len;
1428 			if (getfrag(from, skb_put(skb, copy),
1429 						offset, copy, off, skb) < 0) {
1430 				__skb_trim(skb, off);
1431 				err = -EFAULT;
1432 				goto error;
1433 			}
1434 		} else {
1435 			int i = skb_shinfo(skb)->nr_frags;
1436 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1437 			struct page *page = sk->sk_sndmsg_page;
1438 			int off = sk->sk_sndmsg_off;
1439 			unsigned int left;
1440 
1441 			if (page && (left = PAGE_SIZE - off) > 0) {
1442 				if (copy >= left)
1443 					copy = left;
1444 				if (page != frag->page) {
1445 					if (i == MAX_SKB_FRAGS) {
1446 						err = -EMSGSIZE;
1447 						goto error;
1448 					}
1449 					get_page(page);
1450 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1451 					frag = &skb_shinfo(skb)->frags[i];
1452 				}
1453 			} else if (i < MAX_SKB_FRAGS) {
1454 				if (copy > PAGE_SIZE)
1455 					copy = PAGE_SIZE;
1456 				page = alloc_pages(sk->sk_allocation, 0);
1457 				if (page == NULL) {
1458 					err = -ENOMEM;
1459 					goto error;
1460 				}
1461 				sk->sk_sndmsg_page = page;
1462 				sk->sk_sndmsg_off = 0;
1463 
1464 				skb_fill_page_desc(skb, i, page, 0, 0);
1465 				frag = &skb_shinfo(skb)->frags[i];
1466 			} else {
1467 				err = -EMSGSIZE;
1468 				goto error;
1469 			}
1470 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1471 				err = -EFAULT;
1472 				goto error;
1473 			}
1474 			sk->sk_sndmsg_off += copy;
1475 			frag->size += copy;
1476 			skb->len += copy;
1477 			skb->data_len += copy;
1478 			skb->truesize += copy;
1479 			atomic_add(copy, &sk->sk_wmem_alloc);
1480 		}
1481 		offset += copy;
1482 		length -= copy;
1483 	}
1484 	return 0;
1485 error:
1486 	cork->length -= length;
1487 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1488 	return err;
1489 }
1490 
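/* Release the duplicated options and the route held by the cork. */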
1491 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1492 {
1493 	if (np->cork.opt) {
1494 		kfree(np->cork.opt->dst0opt);
1495 		kfree(np->cork.opt->dst1opt);
1496 		kfree(np->cork.opt->hopopt);
1497 		kfree(np->cork.opt->srcrt);
1498 		kfree(np->cork.opt);
1499 		np->cork.opt = NULL;
1500 	}
1501 
1502 	if (inet->cork.base.dst) {
1503 		dst_release(inet->cork.base.dst);
1504 		inet->cork.base.dst = NULL;
1505 		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1506 	}
1507 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1508 }
1509 
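/*
 * ip6_push_pending_frames - send the corked queue as one datagram
 *
 * Collapses the queued skbs into one (the rest become its frag_list),
 * prepends the extension headers and the IPv6 header, and sends the
 * result through ip6_local_out(); ip6_fragment() will re-split it on
 * the boundaries ip6_append_data() prepared.
 */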
1510 int ip6_push_pending_frames(struct sock *sk)
1511 {
1512 	struct sk_buff *skb, *tmp_skb;
1513 	struct sk_buff **tail_skb;
1514 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1515 	struct inet_sock *inet = inet_sk(sk);
1516 	struct ipv6_pinfo *np = inet6_sk(sk);
1517 	struct net *net = sock_net(sk);
1518 	struct ipv6hdr *hdr;
1519 	struct ipv6_txoptions *opt = np->cork.opt;
1520 	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1521 	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1522 	unsigned char proto = fl6->flowi6_proto;
1523 	int err = 0;
1524 
1525 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1526 		goto out;
1527 	tail_skb = &(skb_shinfo(skb)->frag_list);
1528 
1529 	/* move skb->data back to the ip header from the ext header */
1530 	if (skb->data < skb_network_header(skb))
1531 		__skb_pull(skb, skb_network_offset(skb));
1532 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1533 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1534 		*tail_skb = tmp_skb;
1535 		tail_skb = &(tmp_skb->next);
1536 		skb->len += tmp_skb->len;
1537 		skb->data_len += tmp_skb->len;
1538 		skb->truesize += tmp_skb->truesize;
1539 		tmp_skb->destructor = NULL;
1540 		tmp_skb->sk = NULL;
1541 	}
1542 
1543 	/* Allow local fragmentation. */
1544 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1545 		skb->local_df = 1;
1546 
1547 	ipv6_addr_copy(final_dst, &fl6->daddr);
1548 	__skb_pull(skb, skb_network_header_len(skb));
1549 	if (opt && opt->opt_flen)
1550 		ipv6_push_frag_opts(skb, opt, &proto);
1551 	if (opt && opt->opt_nflen)
1552 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1553 
1554 	skb_push(skb, sizeof(struct ipv6hdr));
1555 	skb_reset_network_header(skb);
1556 	hdr = ipv6_hdr(skb);
1557 
1558 	*(__be32*)hdr = fl6->flowlabel |
1559 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1560 
1561 	hdr->hop_limit = np->cork.hop_limit;
1562 	hdr->nexthdr = proto;
1563 	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1564 	ipv6_addr_copy(&hdr->daddr, final_dst);
1565 
1566 	skb->priority = sk->sk_priority;
1567 	skb->mark = sk->sk_mark;
1568 
1569 	skb_dst_set(skb, dst_clone(&rt->dst));
1570 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1571 	if (proto == IPPROTO_ICMPV6) {
1572 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1573 
1574 		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1575 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1576 	}
1577 
1578 	err = ip6_local_out(skb);
1579 	if (err) {
1580 		if (err > 0)
1581 			err = net_xmit_errno(err);
1582 		if (err)
1583 			goto error;
1584 	}
1585 
1586 out:
1587 	ip6_cork_release(inet, np);
1588 	return err;
1589 error:
1590 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1591 	goto out;
1592 }
1593 
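/* Discard everything queued by ip6_append_data() and release the cork. */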
1594 void ip6_flush_pending_frames(struct sock *sk)
1595 {
1596 	struct sk_buff *skb;
1597 
1598 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1599 		if (skb_dst(skb))
1600 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1601 				      IPSTATS_MIB_OUTDISCARDS);
1602 		kfree_skb(skb);
1603 	}
1604 
1605 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1606 }
1607