xref: /linux/net/ipv6/ip6_output.c (revision a33f32244d8550da8b4a26e277ce07d5c6d158b5)
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

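/*
 * Final transmit step: use the cached hardware header if the dst has
 * one, otherwise hand the packet to the neighbour output function.
 * With neither available the packet is unroutable, so count it
 * against OUTNOROUTES and drop it.
 */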
static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

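/*
 * Post-routing output step. For multicast destinations a clone is
 * also looped back to local listeners (or to the multicast routing
 * socket for packets not yet forwarded); within that path, a packet
 * whose hop limit is zero is only delivered locally and then
 * discarded.
 */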
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev)) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

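/*
 * MTU to honour on output: the route MTU, unless the sending socket
 * asked to probe the path MTU (IPV6_PMTUDISC_PROBE), in which case
 * the device MTU is used directly.
 */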
static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

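/*
 * Standard dst output routine: drop everything if IPv6 is
 * administratively disabled on the egress device; fragment when the
 * packet exceeds the path MTU (and is not GSO) or the route demands
 * fragmentation (dst_allfrag); otherwise transmit directly.
 */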
int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: extension headers may take lots of space
		 * (~8K for now); MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);
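	/*
	 * First 32 bits of the IPv6 header: version 6 in the top
	 * nibble, then the traffic class, then the flow label.
	 */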
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
				dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

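/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * registered for that alert value (IPV6_ROUTER_ALERT), cloning the
 * skb for all but the last match.  Returns 1 if the packet was
 * consumed.
 */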
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

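/*
 * Decide what to do with a packet that matched a proxy neighbour
 * entry: 1 means hand it to local input (unicast NDISC aimed at the
 * proxied address), -1 means the sender has been signalled and the
 * packet must be dropped (link-local destination), 0 means forward
 * normally.
 */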
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Pass unicast neighbour discovery messages
			 * destined for the proxied address to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

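/*
 * Forwarding path proper: checks that forwarding is enabled, hands
 * Router Alert packets to interested sockets, enforces the hop limit
 * and path MTU (emitting ICMPv6 errors as needed), optionally proxies
 * neighbour discovery, and sends a redirect when the packet leaves
 * via the interface it arrived on.
 */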
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on RA packets; we push them
	 *	to user level AS IS without any guarantee that the
	 *	application will be able to interpret them. The reason is
	 *	that we cannot do anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it. Defragmentation
	 *	would also be a mistake: RA packets cannot be fragmented,
	 *	because there is no guarantee that different fragments
	 *	will follow the same path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	Check and decrement the hop limit.
	 */
	if (hdr->hop_limit <= 1) {
		/* Force the OUTPUT device to be used for the source
		 * address of the ICMP error */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects for source-routed frames. Nor do we
	   send redirects for frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	The incoming and outgoing devices are the same;
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu) {
		/* Again, force the OUTPUT device to be used for the
		 * source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

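/*
 * Propagate per-packet metadata (dst, device, mark, priority,
 * netfilter and security state) from the original skb to a freshly
 * built fragment.
 */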
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

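/*
 * Find the offset at which the Fragment header must be inserted:
 * hop-by-hop, routing and (pre-routing-header or HAO-carrying)
 * destination options form the unfragmentable part and are skipped.
 * *nexthdr is left pointing at the nexthdr byte that will later be
 * rewritten to NEXTHDR_FRAGMENT.
 */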
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

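/*
 * Fragment an over-sized packet.  If the skb already carries a
 * well-formed frag_list (each fragment 8-byte aligned, fitting the
 * MTU, unshared), the list is converted in place into a fragment
 * chain; otherwise the slow path below copies the payload into newly
 * allocated fragment skbs.
 */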
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (!skb->local_df) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frags(skb)) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

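	/*
	 * Slow path: allocate a fresh skb for every fragment and copy
	 * the unfragmentable headers plus an MTU-sized, 8-byte aligned
	 * slice of payload into each.
	 */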
769 	left = skb->len - hlen;		/* Space per frame */
770 	ptr = hlen;			/* Where to start from */
771 
772 	/*
773 	 *	Fragment the datagram.
774 	 */
775 
776 	*prevhdr = NEXTHDR_FRAGMENT;
777 
778 	/*
779 	 *	Keep copying data until we run out.
780 	 */
781 	while(left > 0)	{
782 		len = left;
783 		/* IF: it doesn't fit, use 'mtu' - the data space left */
784 		if (len > mtu)
785 			len = mtu;
786 		/* IF: we are not sending upto and including the packet end
787 		   then align the next start on an eight byte boundary */
788 		if (len < left)	{
789 			len &= ~7;
790 		}
791 		/*
792 		 *	Allocate buffer.
793 		 */
794 
795 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
796 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
797 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
798 				      IPSTATS_MIB_FRAGFAILS);
799 			err = -ENOMEM;
800 			goto fail;
801 		}
802 
803 		/*
804 		 *	Set up data on packet
805 		 */
806 
807 		ip6_copy_metadata(frag, skb);
808 		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
809 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
810 		skb_reset_network_header(frag);
811 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
812 		frag->transport_header = (frag->network_header + hlen +
813 					  sizeof(struct frag_hdr));
814 
815 		/*
816 		 *	Charge the memory for the fragment to any owner
817 		 *	it might possess
818 		 */
819 		if (skb->sk)
820 			skb_set_owner_w(frag, skb->sk);
821 
822 		/*
823 		 *	Copy the packet header into the new buffer.
824 		 */
825 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
826 
827 		/*
828 		 *	Build fragment header.
829 		 */
830 		fh->nexthdr = nexthdr;
831 		fh->reserved = 0;
832 		if (!frag_id) {
833 			ipv6_select_ident(fh);
834 			frag_id = fh->identification;
835 		} else
836 			fh->identification = frag_id;
837 
838 		/*
839 		 *	Copy a block of the IP datagram.
840 		 */
841 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
842 			BUG();
843 		left -= len;
844 
845 		fh->frag_off = htons(offset);
846 		if (left > 0)
847 			fh->frag_off |= htons(IP6_MF);
848 		ipv6_hdr(frag)->payload_len = htons(frag->len -
849 						    sizeof(struct ipv6hdr));
850 
851 		ptr += len;
852 		offset += len;
853 
854 		/*
855 		 *	Put this fragment into the sending queue.
856 		 */
857 		err = output(frag);
858 		if (err)
859 			goto fail;
860 
861 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
862 			      IPSTATS_MIB_FRAGCREATES);
863 	}
864 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
865 		      IPSTATS_MIB_FRAGOKS);
866 	kfree_skb(skb);
867 	return err;
868 
869 fail:
870 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
871 		      IPSTATS_MIB_FRAGFAILS);
872 	kfree_skb(skb);
873 	return err;
874 }
875 
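/*
 * Nonzero when @fl_addr can be validated neither against the route
 * key (a /128 host route) nor against the socket's cached peer
 * address, i.e. the cached route may no longer apply to this flow.
 */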
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the unconnected case is not
	 * very simple. Take into account that we do not support routing
	 * by source, TOS, or MSG_DONTROUTE	--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so this last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

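/*
 * Shared tail of the dst lookup helpers: perform the route lookup if
 * no dst was supplied, pick a source address when the flow has none,
 * and (with optimistic DAD) fall back to the default router's dst if
 * the next hop is unresolved while our source address is optimistic.
 */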
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour entry that
	 * is in the INCOMPLETE state and the source address from the
	 * flow is marked as OPTIMISTIC, we release the found dst entry
	 * and replace it instead with the dst entry of the nexthop
	 * router.
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

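/*
 * UFO path of ip6_append_data(): queue the whole datagram as a single
 * large skb and let the device segment it, recording the chosen
 * fragment size (a multiple of 8) and fragment id in the shared info.
 */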
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP large send offload, so create
	 * one single skb packet containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

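/*
 * ip6_append_data() and ip6_push_pending_frames() form a pair:
 * callers (UDP, ICMPv6, raw sockets) append data to the socket's
 * write queue, possibly over several calls while corked, then either
 * push or flush.  A sketch of the usual calling pattern (names
 * illustrative, error handling elided):
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *			      hlimit, tclass, opt, fl, rt, msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);
 */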
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above --miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

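	/*
	 * Header geometry: fragheaderlen is everything that must
	 * precede the payload in every fragment (IPv6 header plus
	 * unfragmentable extension headers); maxfraglen is the largest
	 * length a queued skb may reach so that, once the Fragment
	 * header is inserted at send time, the packet still fits the
	 * MTU with its fragmentable part 8-byte aligned.
	 */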
	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use the MTU if the total length of the message fits into it.
	 * Otherwise, we need to reserve the fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

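	/*
	 * Main copy loop: top up the tail skb of the write queue while
	 * it has room (copy > 0); once it is full, allocate the next
	 * fragment-sized skb, moving any bytes past the fragment
	 * boundary (fraggap) from the previous skb so every fragment
	 * but the last stays 8-byte aligned.
	 */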
	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at the
			 * tail. Note: we overallocate on fragments with
			 * MSG_MORE because we have no idea if we're the
			 * last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be an overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    page_address(frag->page) + frag->page_offset + frag->size,
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

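/*
 * Coalesce everything queued by ip6_append_data() into one skb (the
 * queued tail skbs become the frag_list), prepend the final IPv6
 * header and any extension headers, update the stats and hand the
 * packet to ip6_local_out().
 */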
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->u.dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
1543