xref: /linux/net/ipv6/ip6_output.c (revision 40d3057ac036f2501c1930728a6179be4fca577b)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
43 
44 #include <net/sock.h>
45 #include <net/snmp.h>
46 
47 #include <net/ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
53 #include <net/icmp.h>
54 #include <net/xfrm.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
57 
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
59 
60 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
61 {
62 	static u32 ipv6_fragmentation_id = 1;
63 	static DEFINE_SPINLOCK(ip6_id_lock);
64 
65 	spin_lock_bh(&ip6_id_lock);
66 	fhdr->identification = htonl(ipv6_fragmentation_id);
67 	if (++ipv6_fragmentation_id == 0)
68 		ipv6_fragmentation_id = 1;
69 	spin_unlock_bh(&ip6_id_lock);
70 }
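/*
 * Usage sketch (hedged; the real call sites are in ip6_fragment() and
 * ip6_ufo_append_data() below): a caller that has reserved a struct
 * frag_hdr inside the packet fills it roughly like this, with
 * ipv6_select_ident() supplying a non-zero, wrapping 32-bit
 * identification in network byte order (1, 2, ..., 0xffffffff, 1, ...):
 *
 *	fh->nexthdr = nexthdr;
 *	fh->reserved = 0;
 *	fh->frag_off = htons(IP6_MF);
 *	ipv6_select_ident(skb, fh);	(fills fh->identification)
 */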
71 
72 int __ip6_local_out(struct sk_buff *skb)
73 {
74 	int len;
75 
76 	len = skb->len - sizeof(struct ipv6hdr);
77 	if (len > IPV6_MAXPLEN)
78 		len = 0;
79 	ipv6_hdr(skb)->payload_len = htons(len);
80 
81 	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
82 		       dst_output);
83 }
84 
85 int ip6_local_out(struct sk_buff *skb)
86 {
87 	int err;
88 
89 	err = __ip6_local_out(skb);
90 	if (likely(err == 1))
91 		err = dst_output(skb);
92 
93 	return err;
94 }
95 EXPORT_SYMBOL_GPL(ip6_local_out);
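/*
 * Usage sketch (hedged, mirrors the two functions above): a caller with
 * a fully built IPv6 packet and skb->dst set hands it to ip6_local_out().
 * __ip6_local_out() rewrites payload_len and runs the LOCAL_OUT netfilter
 * hook; nf_hook() returns 1 when the packet may continue, at which point
 * ip6_local_out() invokes dst_output():
 *
 *	skb->dst = dst_clone(&rt->u.dst);
 *	err = ip6_local_out(skb);	(negative errno on failure)
 */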
96 
97 static int ip6_output_finish(struct sk_buff *skb)
98 {
99 	struct dst_entry *dst = skb->dst;
100 
101 	if (dst->hh)
102 		return neigh_hh_output(dst->hh, skb);
103 	else if (dst->neighbour)
104 		return dst->neighbour->output(skb);
105 
106 	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
107 	kfree_skb(skb);
108 	return -EINVAL;
109 
110 }
111 
112 /* dev_loopback_xmit for use with netfilter. */
113 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
114 {
115 	skb_reset_mac_header(newskb);
116 	__skb_pull(newskb, skb_network_offset(newskb));
117 	newskb->pkt_type = PACKET_LOOPBACK;
118 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
119 	WARN_ON(!newskb->dst);
120 
121 	netif_rx(newskb);
122 	return 0;
123 }
124 
125 
126 static int ip6_output2(struct sk_buff *skb)
127 {
128 	struct dst_entry *dst = skb->dst;
129 	struct net_device *dev = dst->dev;
130 
131 	skb->protocol = htons(ETH_P_IPV6);
132 	skb->dev = dev;
133 
134 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
135 		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
136 		struct inet6_dev *idev = ip6_dst_idev(skb->dst);
137 
138 		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
139 		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
140 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
141 					 &ipv6_hdr(skb)->saddr))) {
142 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
143 
144 			/* Do not check for IFF_ALLMULTI; multicast routing
145 			   is not supported in any case.
146 			 */
147 			if (newskb)
148 				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
149 					NULL, newskb->dev,
150 					ip6_dev_loopback_xmit);
151 
152 			if (ipv6_hdr(skb)->hop_limit == 0) {
153 				IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
154 				kfree_skb(skb);
155 				return 0;
156 			}
157 		}
158 
159 		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
160 	}
161 
162 	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
163 		       ip6_output_finish);
164 }
165 
166 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
167 {
168 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
169 
170 	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
171 	       skb->dst->dev->mtu : dst_mtu(skb->dst);
172 }
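/*
 * Example (hedged): under IPV6_PMTUDISC_PROBE the socket wants to probe
 * the path MTU itself, so the device MTU is used instead of the cached
 * route MTU. From userspace this mode is selected roughly as:
 *
 *	int val = IPV6_PMTUDISC_PROBE;
 *	setsockopt(fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
 *		   &val, sizeof(val));
 */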
173 
174 int ip6_output(struct sk_buff *skb)
175 {
176 	struct inet6_dev *idev = ip6_dst_idev(skb->dst);
177 	if (unlikely(idev->cnf.disable_ipv6)) {
178 		IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
179 		kfree_skb(skb);
180 		return 0;
181 	}
182 
183 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
184 				dst_allfrag(skb->dst))
185 		return ip6_fragment(skb, ip6_output2);
186 	else
187 		return ip6_output2(skb);
188 }
189 
190 /*
191  *	xmit an sk_buff (used by TCP)
192  */
193 
194 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
195 	     struct ipv6_txoptions *opt, int ipfragok)
196 {
197 	struct ipv6_pinfo *np = inet6_sk(sk);
198 	struct in6_addr *first_hop = &fl->fl6_dst;
199 	struct dst_entry *dst = skb->dst;
200 	struct ipv6hdr *hdr;
201 	u8  proto = fl->proto;
202 	int seg_len = skb->len;
203 	int hlimit, tclass;
204 	u32 mtu;
205 
206 	if (opt) {
207 		unsigned int head_room;
208 
209 		/* First: exthdrs may take lots of space (~8K for now);
210 		   MAX_HEADER is not enough.
211 		 */
212 		head_room = opt->opt_nflen + opt->opt_flen;
213 		seg_len += head_room;
214 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
215 
216 		if (skb_headroom(skb) < head_room) {
217 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
218 			if (skb2 == NULL) {
219 				IP6_INC_STATS(ip6_dst_idev(skb->dst),
220 					      IPSTATS_MIB_OUTDISCARDS);
221 				kfree_skb(skb);
222 				return -ENOBUFS;
223 			}
224 			kfree_skb(skb);
225 			skb = skb2;
226 			if (sk)
227 				skb_set_owner_w(skb, sk);
228 		}
229 		if (opt->opt_flen)
230 			ipv6_push_frag_opts(skb, opt, &proto);
231 		if (opt->opt_nflen)
232 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
233 	}
234 
235 	skb_push(skb, sizeof(struct ipv6hdr));
236 	skb_reset_network_header(skb);
237 	hdr = ipv6_hdr(skb);
238 
239 	/* Allow local fragmentation. */
240 	if (ipfragok)
241 		skb->local_df = 1;
242 
243 	/*
244 	 *	Fill in the IPv6 header
245 	 */
246 
247 	hlimit = -1;
248 	if (np)
249 		hlimit = np->hop_limit;
250 	if (hlimit < 0)
251 		hlimit = ip6_dst_hoplimit(dst);
252 
253 	tclass = -1;
254 	if (np)
255 		tclass = np->tclass;
256 	if (tclass < 0)
257 		tclass = 0;
258 
259 	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
260 
261 	hdr->payload_len = htons(seg_len);
262 	hdr->nexthdr = proto;
263 	hdr->hop_limit = hlimit;
264 
265 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
266 	ipv6_addr_copy(&hdr->daddr, first_hop);
267 
268 	skb->priority = sk->sk_priority;
269 	skb->mark = sk->sk_mark;
270 
271 	mtu = dst_mtu(dst);
272 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
273 		IP6_INC_STATS(ip6_dst_idev(skb->dst),
274 			      IPSTATS_MIB_OUTREQUESTS);
275 		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
276 				dst_output);
277 	}
278 
279 	if (net_ratelimit())
280 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
281 	skb->dev = dst->dev;
282 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
283 	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
284 	kfree_skb(skb);
285 	return -EMSGSIZE;
286 }
287 
288 EXPORT_SYMBOL(ip6_xmit);
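/*
 * Usage sketch (hedged, simplified from the TCP transmit path): a
 * transport protocol fills a struct flowi, attaches a routed dst to the
 * skb, and calls ip6_xmit() with ipfragok == 0 so that oversized packets
 * trigger a local ICMPV6_PKT_TOOBIG instead of fragmentation:
 *
 *	struct flowi fl = { .proto = IPPROTO_TCP };
 *	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
 *	ipv6_addr_copy(&fl.fl6_src, &np->saddr);
 *	skb->dst = dst_clone(dst);
 *	err = ip6_xmit(sk, skb, &fl, np->opt, 0);
 */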
289 
290 /*
291  *	To avoid extra problems, ND packets are sent through this
292  *	routine. It is code duplication, but I really want to avoid
293  *	extra checks, since ipv6_build_header is used by TCP (which
294  *	is performance-critical for us)
295  */
296 
297 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
298 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
299 	       int proto, int len)
300 {
301 	struct ipv6_pinfo *np = inet6_sk(sk);
302 	struct ipv6hdr *hdr;
303 	int totlen;
304 
305 	skb->protocol = htons(ETH_P_IPV6);
306 	skb->dev = dev;
307 
308 	totlen = len + sizeof(struct ipv6hdr);
309 
310 	skb_reset_network_header(skb);
311 	skb_put(skb, sizeof(struct ipv6hdr));
312 	hdr = ipv6_hdr(skb);
313 
314 	*(__be32*)hdr = htonl(0x60000000);
315 
316 	hdr->payload_len = htons(len);
317 	hdr->nexthdr = proto;
318 	hdr->hop_limit = np->hop_limit;
319 
320 	ipv6_addr_copy(&hdr->saddr, saddr);
321 	ipv6_addr_copy(&hdr->daddr, daddr);
322 
323 	return 0;
324 }
325 
326 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
327 {
328 	struct ip6_ra_chain *ra;
329 	struct sock *last = NULL;
330 
331 	read_lock(&ip6_ra_lock);
332 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
333 		struct sock *sk = ra->sk;
334 		if (sk && ra->sel == sel &&
335 		    (!sk->sk_bound_dev_if ||
336 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
337 			if (last) {
338 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
339 				if (skb2)
340 					rawv6_rcv(last, skb2);
341 			}
342 			last = sk;
343 		}
344 	}
345 
346 	if (last) {
347 		rawv6_rcv(last, skb);
348 		read_unlock(&ip6_ra_lock);
349 		return 1;
350 	}
351 	read_unlock(&ip6_ra_lock);
352 	return 0;
353 }
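/*
 * Note on the loop above (editor's reading): each matching Router Alert
 * listener except the last receives its own clone, and the original skb
 * goes to the final listener. This "clone for all but the last receiver"
 * idiom saves one skb_clone() in the common single-listener case.
 */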
354 
355 static int ip6_forward_proxy_check(struct sk_buff *skb)
356 {
357 	struct ipv6hdr *hdr = ipv6_hdr(skb);
358 	u8 nexthdr = hdr->nexthdr;
359 	int offset;
360 
361 	if (ipv6_ext_hdr(nexthdr)) {
362 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
363 		if (offset < 0)
364 			return 0;
365 	} else
366 		offset = sizeof(struct ipv6hdr);
367 
368 	if (nexthdr == IPPROTO_ICMPV6) {
369 		struct icmp6hdr *icmp6;
370 
371 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
372 					 offset + 1 - skb->data)))
373 			return 0;
374 
375 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
376 
377 		switch (icmp6->icmp6_type) {
378 		case NDISC_ROUTER_SOLICITATION:
379 		case NDISC_ROUTER_ADVERTISEMENT:
380 		case NDISC_NEIGHBOUR_SOLICITATION:
381 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
382 		case NDISC_REDIRECT:
383 			/* For a unicast neighbour discovery message
384 			 * destined to the proxied address, pass it to
385 			 * the input function.
386 			 */
387 			return 1;
388 		default:
389 			break;
390 		}
391 	}
392 
393 	/*
394 	 * The proxying router can't forward traffic sent to a link-local
395 	 * address, so signal the sender and discard the packet. This
396 	 * behavior is clarified by the MIPv6 specification.
397 	 */
398 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
399 		dst_link_failure(skb);
400 		return -1;
401 	}
402 
403 	return 0;
404 }
405 
406 static inline int ip6_forward_finish(struct sk_buff *skb)
407 {
408 	return dst_output(skb);
409 }
410 
411 int ip6_forward(struct sk_buff *skb)
412 {
413 	struct dst_entry *dst = skb->dst;
414 	struct ipv6hdr *hdr = ipv6_hdr(skb);
415 	struct inet6_skb_parm *opt = IP6CB(skb);
416 	struct net *net = dev_net(dst->dev);
417 
418 	if (net->ipv6.devconf_all->forwarding == 0)
419 		goto error;
420 
421 	if (skb_warn_if_lro(skb))
422 		goto drop;
423 
424 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
425 		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
426 		goto drop;
427 	}
428 
429 	skb_forward_csum(skb);
430 
431 	/*
432 	 *	We do NOT do any processing on
433 	 *	RA packets; we push them to user level AS IS
434 	 *	without any WARRANTY that the application will be able
435 	 *	to interpret them. The reason is that we
436 	 *	cannot do anything clever here.
437 	 *
438 	 *	We are not the end node, so if a packet contains
439 	 *	AH/ESP, we cannot do anything.
440 	 *	Defragmentation would also be a mistake; RA packets
441 	 *	cannot be fragmented, because there is no guarantee
442 	 *	that different fragments will go along one path. --ANK
443 	 */
444 	if (opt->ra) {
445 		u8 *ptr = skb_network_header(skb) + opt->ra;
446 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
447 			return 0;
448 	}
449 
450 	/*
451 	 *	check and decrement ttl
452 	 */
453 	if (hdr->hop_limit <= 1) {
454 		/* Force the OUTPUT device to be used for the source address */
455 		skb->dev = dst->dev;
456 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
457 			    0, skb->dev);
458 		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
459 
460 		kfree_skb(skb);
461 		return -ETIMEDOUT;
462 	}
463 
464 	/* XXX: idev->cnf.proxy_ndp? */
465 	if (net->ipv6.devconf_all->proxy_ndp &&
466 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
467 		int proxied = ip6_forward_proxy_check(skb);
468 		if (proxied > 0)
469 			return ip6_input(skb);
470 		else if (proxied < 0) {
471 			IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
472 			goto drop;
473 		}
474 	}
475 
476 	if (!xfrm6_route_forward(skb)) {
477 		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
478 		goto drop;
479 	}
480 	dst = skb->dst;
481 
482 	/* IPv6 specs say nothing about it, but it is clear that we cannot
483 	   send redirects to source-routed frames.
484 	   We don't send redirects to frames decapsulated from IPsec.
485 	 */
486 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
487 	    !skb->sp) {
488 		struct in6_addr *target = NULL;
489 		struct rt6_info *rt;
490 		struct neighbour *n = dst->neighbour;
491 
492 		/*
493 		 *	incoming and outgoing devices are the same;
494 		 *	send a redirect.
495 		 */
496 
497 		rt = (struct rt6_info *) dst;
498 		if ((rt->rt6i_flags & RTF_GATEWAY))
499 			target = (struct in6_addr*)&n->primary_key;
500 		else
501 			target = &hdr->daddr;
502 
503 		/* Limit redirects both by destination (here)
504 		   and by source (inside ndisc_send_redirect)
505 		 */
506 		if (xrlim_allow(dst, 1*HZ))
507 			ndisc_send_redirect(skb, n, target);
508 	} else {
509 		int addrtype = ipv6_addr_type(&hdr->saddr);
510 
511 		/* This check is security critical. */
512 		if (addrtype == IPV6_ADDR_ANY ||
513 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
514 			goto error;
515 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
516 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
517 				ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
518 			goto error;
519 		}
520 	}
521 
522 	if (skb->len > dst_mtu(dst)) {
523 		/* Again, force the OUTPUT device to be used for the source address */
524 		skb->dev = dst->dev;
525 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
526 		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
527 		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
528 		kfree_skb(skb);
529 		return -EMSGSIZE;
530 	}
531 
532 	if (skb_cow(skb, dst->dev->hard_header_len)) {
533 		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
534 		goto drop;
535 	}
536 
537 	hdr = ipv6_hdr(skb);
538 
539 	/* Mangling the hop limit is delayed until after the skb COW */
540 
541 	hdr->hop_limit--;
542 
543 	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
544 	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
545 		       ip6_forward_finish);
546 
547 error:
548 	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
549 drop:
550 	kfree_skb(skb);
551 	return -EINVAL;
552 }
553 
554 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
555 {
556 	to->pkt_type = from->pkt_type;
557 	to->priority = from->priority;
558 	to->protocol = from->protocol;
559 	dst_release(to->dst);
560 	to->dst = dst_clone(from->dst);
561 	to->dev = from->dev;
562 	to->mark = from->mark;
563 
564 #ifdef CONFIG_NET_SCHED
565 	to->tc_index = from->tc_index;
566 #endif
567 	nf_copy(to, from);
568 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
569     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
570 	to->nf_trace = from->nf_trace;
571 #endif
572 	skb_copy_secmark(to, from);
573 }
574 
575 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
576 {
577 	u16 offset = sizeof(struct ipv6hdr);
578 	struct ipv6_opt_hdr *exthdr =
579 				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
580 	unsigned int packet_len = skb->tail - skb->network_header;
581 	int found_rhdr = 0;
582 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
583 
584 	while (offset + 1 <= packet_len) {
585 
586 		switch (**nexthdr) {
587 
588 		case NEXTHDR_HOP:
589 			break;
590 		case NEXTHDR_ROUTING:
591 			found_rhdr = 1;
592 			break;
593 		case NEXTHDR_DEST:
594 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
595 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
596 				break;
597 #endif
598 			if (found_rhdr)
599 				return offset;
600 			break;
601 		default:
602 			return offset;
603 		}
604 
605 		offset += ipv6_optlen(exthdr);
606 		*nexthdr = &exthdr->nexthdr;
607 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
608 						 offset);
609 	}
610 
611 	return offset;
612 }
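/*
 * Worked example (hedged): for a packet laid out as
 *
 *	IPv6 (40) | Hop-by-Hop (8) | Routing (24) | Dest opts | TCP
 *
 * the walk skips NEXTHDR_HOP, sets found_rhdr at NEXTHDR_ROUTING, and
 * stops at the destination options header, returning offset
 * 40 + 8 + 24 = 72 with *nexthdr pointing at the routing header's
 * nexthdr byte. The Fragment header is inserted there, so the
 * unfragmentable part retains the hop-by-hop and routing headers.
 */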
613 
614 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
615 {
616 	struct net_device *dev;
617 	struct sk_buff *frag;
618 	struct rt6_info *rt = (struct rt6_info*)skb->dst;
619 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
620 	struct ipv6hdr *tmp_hdr;
621 	struct frag_hdr *fh;
622 	unsigned int mtu, hlen, left, len;
623 	__be32 frag_id = 0;
624 	int ptr, offset = 0, err = 0;
625 	u8 *prevhdr, nexthdr = 0;
626 
627 	dev = rt->u.dst.dev;
628 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
629 	nexthdr = *prevhdr;
630 
631 	mtu = ip6_skb_dst_mtu(skb);
632 
633 	/* We must not fragment if the socket is set to force MTU discovery
634 	 * or if the skb was not generated by a local socket.  (This last
635 	 * check should be redundant, but it's free.)
636 	 */
637 	if (!skb->local_df) {
638 		skb->dev = skb->dst->dev;
639 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
640 		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
641 		kfree_skb(skb);
642 		return -EMSGSIZE;
643 	}
644 
645 	if (np && np->frag_size < mtu) {
646 		if (np->frag_size)
647 			mtu = np->frag_size;
648 	}
649 	mtu -= hlen + sizeof(struct frag_hdr);
650 
651 	if (skb_shinfo(skb)->frag_list) {
652 		int first_len = skb_pagelen(skb);
653 		int truesizes = 0;
654 
655 		if (first_len - hlen > mtu ||
656 		    ((first_len - hlen) & 7) ||
657 		    skb_cloned(skb))
658 			goto slow_path;
659 
660 		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
661 			/* Correct geometry. */
662 			if (frag->len > mtu ||
663 			    ((frag->len & 7) && frag->next) ||
664 			    skb_headroom(frag) < hlen)
665 			    goto slow_path;
666 
667 			/* Partially cloned skb? */
668 			if (skb_shared(frag))
669 				goto slow_path;
670 
671 			BUG_ON(frag->sk);
672 			if (skb->sk) {
673 				sock_hold(skb->sk);
674 				frag->sk = skb->sk;
675 				frag->destructor = sock_wfree;
676 				truesizes += frag->truesize;
677 			}
678 		}
679 
680 		err = 0;
681 		offset = 0;
682 		frag = skb_shinfo(skb)->frag_list;
683 		skb_shinfo(skb)->frag_list = NULL;
684 		/* BUILD HEADER */
685 
686 		*prevhdr = NEXTHDR_FRAGMENT;
687 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
688 		if (!tmp_hdr) {
689 			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
690 			return -ENOMEM;
691 		}
692 
693 		__skb_pull(skb, hlen);
694 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
695 		__skb_push(skb, hlen);
696 		skb_reset_network_header(skb);
697 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
698 
699 		ipv6_select_ident(skb, fh);
700 		fh->nexthdr = nexthdr;
701 		fh->reserved = 0;
702 		fh->frag_off = htons(IP6_MF);
703 		frag_id = fh->identification;
704 
705 		first_len = skb_pagelen(skb);
706 		skb->data_len = first_len - skb_headlen(skb);
707 		skb->truesize -= truesizes;
708 		skb->len = first_len;
709 		ipv6_hdr(skb)->payload_len = htons(first_len -
710 						   sizeof(struct ipv6hdr));
711 
712 		dst_hold(&rt->u.dst);
713 
714 		for (;;) {
715 			/* Prepare the header of the next frame
716 			 * before the previous one goes down. */
717 			if (frag) {
718 				frag->ip_summed = CHECKSUM_NONE;
719 				skb_reset_transport_header(frag);
720 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
721 				__skb_push(frag, hlen);
722 				skb_reset_network_header(frag);
723 				memcpy(skb_network_header(frag), tmp_hdr,
724 				       hlen);
725 				offset += skb->len - hlen - sizeof(struct frag_hdr);
726 				fh->nexthdr = nexthdr;
727 				fh->reserved = 0;
728 				fh->frag_off = htons(offset);
729 				if (frag->next != NULL)
730 					fh->frag_off |= htons(IP6_MF);
731 				fh->identification = frag_id;
732 				ipv6_hdr(frag)->payload_len =
733 						htons(frag->len -
734 						      sizeof(struct ipv6hdr));
735 				ip6_copy_metadata(frag, skb);
736 			}
737 
738 			err = output(skb);
739 			if (!err)
740 				IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
741 
742 			if (err || !frag)
743 				break;
744 
745 			skb = frag;
746 			frag = skb->next;
747 			skb->next = NULL;
748 		}
749 
750 		kfree(tmp_hdr);
751 
752 		if (err == 0) {
753 			IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
754 			dst_release(&rt->u.dst);
755 			return 0;
756 		}
757 
758 		while (frag) {
759 			skb = frag->next;
760 			kfree_skb(frag);
761 			frag = skb;
762 		}
763 
764 		IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
765 		dst_release(&rt->u.dst);
766 		return err;
767 	}
768 
769 slow_path:
770 	left = skb->len - hlen;		/* Space per frame */
771 	ptr = hlen;			/* Where to start from */
772 
773 	/*
774 	 *	Fragment the datagram.
775 	 */
776 
777 	*prevhdr = NEXTHDR_FRAGMENT;
778 
779 	/*
780 	 *	Keep copying data until we run out.
781 	 */
782 	while (left > 0) {
783 		len = left;
784 		/* IF: it doesn't fit, use 'mtu' - the data space left */
785 		if (len > mtu)
786 			len = mtu;
787 		/* IF: we are not sending up to and including the packet end,
788 		   then align the next start on an eight-byte boundary */
789 		if (len < left)	{
790 			len &= ~7;
791 		}
792 		/*
793 		 *	Allocate buffer.
794 		 */
795 
796 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
797 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
798 			IP6_INC_STATS(ip6_dst_idev(skb->dst),
799 				      IPSTATS_MIB_FRAGFAILS);
800 			err = -ENOMEM;
801 			goto fail;
802 		}
803 
804 		/*
805 		 *	Set up data on packet
806 		 */
807 
808 		ip6_copy_metadata(frag, skb);
809 		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
810 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
811 		skb_reset_network_header(frag);
812 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
813 		frag->transport_header = (frag->network_header + hlen +
814 					  sizeof(struct frag_hdr));
815 
816 		/*
817 		 *	Charge the memory for the fragment to any owner
818 		 *	it might possess
819 		 */
820 		if (skb->sk)
821 			skb_set_owner_w(frag, skb->sk);
822 
823 		/*
824 		 *	Copy the packet header into the new buffer.
825 		 */
826 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
827 
828 		/*
829 		 *	Build fragment header.
830 		 */
831 		fh->nexthdr = nexthdr;
832 		fh->reserved = 0;
833 		if (!frag_id) {
834 			ipv6_select_ident(skb, fh);
835 			frag_id = fh->identification;
836 		} else
837 			fh->identification = frag_id;
838 
839 		/*
840 		 *	Copy a block of the IP datagram.
841 		 */
842 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
843 			BUG();
844 		left -= len;
845 
846 		fh->frag_off = htons(offset);
847 		if (left > 0)
848 			fh->frag_off |= htons(IP6_MF);
849 		ipv6_hdr(frag)->payload_len = htons(frag->len -
850 						    sizeof(struct ipv6hdr));
851 
852 		ptr += len;
853 		offset += len;
854 
855 		/*
856 		 *	Put this fragment into the sending queue.
857 		 */
858 		err = output(frag);
859 		if (err)
860 			goto fail;
861 
862 		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
863 	}
864 	IP6_INC_STATS(ip6_dst_idev(skb->dst),
865 		      IPSTATS_MIB_FRAGOKS);
866 	kfree_skb(skb);
867 	return err;
868 
869 fail:
870 	IP6_INC_STATS(ip6_dst_idev(skb->dst),
871 		      IPSTATS_MIB_FRAGFAILS);
872 	kfree_skb(skb);
873 	return err;
874 }
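/*
 * Arithmetic example for the code above (hedged): with a device MTU of
 * 1500 and an unfragmentable part of hlen = 40 (bare IPv6 header), the
 * payload budget per fragment is
 *
 *	mtu - hlen - sizeof(struct frag_hdr) = 1500 - 40 - 8 = 1452
 *
 * and the slow path rounds non-final fragments down to a multiple of
 * eight (len &= ~7), so each carries 1448 bytes and frag_off advances
 * by 1448 per fragment.
 */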
875 
876 static inline int ip6_rt_check(struct rt6key *rt_key,
877 			       struct in6_addr *fl_addr,
878 			       struct in6_addr *addr_cache)
879 {
880 	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
881 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
882 }
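/*
 * Note (editor's reading): ip6_rt_check() returns nonzero when the
 * cached route can NOT be revalidated, i.e. the route is neither a /128
 * host route matching fl_addr nor backed by a matching cached peer
 * address (daddr_cache/saddr_cache). Zero means "still plausibly
 * valid", and ip6_sk_dst_check() below keeps the dst in that case.
 */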
883 
884 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
885 					  struct dst_entry *dst,
886 					  struct flowi *fl)
887 {
888 	struct ipv6_pinfo *np = inet6_sk(sk);
889 	struct rt6_info *rt = (struct rt6_info *)dst;
890 
891 	if (!dst)
892 		goto out;
893 
894 	/* Yes, checking route validity in the non-connected
895 	 * case is not very simple. Take into account
896 	 * that we do not support routing by source, TOS,
897 	 * or MSG_DONTROUTE 		--ANK (980726)
898 	 *
899 	 * 1. ip6_rt_check(): If the route was a host route,
900 	 *    check that the cached destination is current.
901 	 *    If it is a network route, we still may
902 	 *    check its validity using a saved pointer
903 	 *    to the last used address: daddr_cache.
904 	 *    We do not want to save the whole address now
905 	 *    (because the main consumer of this service
906 	 *    is TCP, which does not have this problem),
907 	 *    so the last trick works only on connected
908 	 *    sockets.
909 	 * 2. oif also should be the same.
910 	 */
911 	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
912 #ifdef CONFIG_IPV6_SUBTREES
913 	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
914 #endif
915 	    (fl->oif && fl->oif != dst->dev->ifindex)) {
916 		dst_release(dst);
917 		dst = NULL;
918 	}
919 
920 out:
921 	return dst;
922 }
923 
924 static int ip6_dst_lookup_tail(struct sock *sk,
925 			       struct dst_entry **dst, struct flowi *fl)
926 {
927 	int err;
928 	struct net *net = sock_net(sk);
929 
930 	if (*dst == NULL)
931 		*dst = ip6_route_output(net, sk, fl);
932 
933 	if ((err = (*dst)->error))
934 		goto out_err_release;
935 
936 	if (ipv6_addr_any(&fl->fl6_src)) {
937 		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
938 					 &fl->fl6_dst,
939 					 sk ? inet6_sk(sk)->srcprefs : 0,
940 					 &fl->fl6_src);
941 		if (err)
942 			goto out_err_release;
943 	}
944 
945 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
946 		/*
947 		 * Here, if the dst entry we've looked up
948 		 * has a neighbour entry that is in the INCOMPLETE
949 		 * state and the src address from the flow is
950 		 * marked as OPTIMISTIC, we release the found
951 		 * dst entry and replace it with the dst entry
952 		 * of the nexthop router.
953 		 */
954 		if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
955 			struct inet6_ifaddr *ifp;
956 			struct flowi fl_gw;
957 			int redirect;
958 
959 			ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
960 					      (*dst)->dev, 1);
961 
962 			redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
963 			if (ifp)
964 				in6_ifa_put(ifp);
965 
966 			if (redirect) {
967 				/*
968 				 * We need to get the dst entry for the
969 				 * default router instead
970 				 */
971 				dst_release(*dst);
972 				memcpy(&fl_gw, fl, sizeof(struct flowi));
973 				memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
974 				*dst = ip6_route_output(net, sk, &fl_gw);
975 				if ((err = (*dst)->error))
976 					goto out_err_release;
977 			}
978 		}
979 #endif
980 
981 	return 0;
982 
983 out_err_release:
984 	if (err == -ENETUNREACH)
985 		IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
986 	dst_release(*dst);
987 	*dst = NULL;
988 	return err;
989 }
990 
991 /**
992  *	ip6_dst_lookup - perform route lookup on flow
993  *	@sk: socket which provides route info
994  *	@dst: pointer to dst_entry * for result
995  *	@fl: flow to lookup
996  *
997  *	This function performs a route lookup on the given flow.
998  *
999  *	It returns zero on success, or a standard errno code on error.
1000  */
1001 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1002 {
1003 	*dst = NULL;
1004 	return ip6_dst_lookup_tail(sk, dst, fl);
1005 }
1006 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1007 
1008 /**
1009  *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
1010  *	@sk: socket which provides the dst cache and route info
1011  *	@dst: pointer to dst_entry * for result
1012  *	@fl: flow to lookup
1013  *
1014  *	This function performs a route lookup on the given flow with the
1015  *	possibility of using the cached route in the socket if it is valid.
1016  *	It will take the socket dst lock when operating on the dst cache.
1017  *	As a result, this function can only be used in process context.
1018  *
1019  *	It returns zero on success, or a standard errno code on error.
1020  */
1021 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1022 {
1023 	*dst = NULL;
1024 	if (sk) {
1025 		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1026 		*dst = ip6_sk_dst_check(sk, *dst, fl);
1027 	}
1028 
1029 	return ip6_dst_lookup_tail(sk, dst, fl);
1030 }
1031 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
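/*
 * Usage sketch (hedged; names such as getfrag/msg/hlimit stand for
 * whatever the caller, e.g. a datagram sendmsg path, has prepared):
 *
 *	err = ip6_sk_dst_lookup(sk, &dst, &fl);
 *	if (err)
 *		goto out;
 *	err = ip6_append_data(sk, getfrag, msg, len,
 *			      sizeof(struct udphdr), hlimit, tclass,
 *			      opt, &fl, (struct rt6_info *)dst, flags);
 *	if (!err)
 *		err = ip6_push_pending_frames(sk);
 */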
1032 
1033 static inline int ip6_ufo_append_data(struct sock *sk,
1034 			int getfrag(void *from, char *to, int offset, int len,
1035 			int odd, struct sk_buff *skb),
1036 			void *from, int length, int hh_len, int fragheaderlen,
1037 			int transhdrlen, int mtu, unsigned int flags)
1038 
1039 {
1040 	struct sk_buff *skb;
1041 	int err;
1042 
1043 	/* The network device supports UDP large send offload, so
1044 	 * create one single skb containing the complete UDP
1045 	 * datagram
1046 	 */
1047 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1048 		skb = sock_alloc_send_skb(sk,
1049 			hh_len + fragheaderlen + transhdrlen + 20,
1050 			(flags & MSG_DONTWAIT), &err);
1051 		if (skb == NULL)
1052 			return -ENOMEM;
1053 
1054 		/* reserve space for Hardware header */
1055 		skb_reserve(skb, hh_len);
1056 
1057 		/* create space for UDP/IP header */
1058 		skb_put(skb, fragheaderlen + transhdrlen);
1059 
1060 		/* initialize network header pointer */
1061 		skb_reset_network_header(skb);
1062 
1063 		/* initialize protocol header pointer */
1064 		skb->transport_header = skb->network_header + fragheaderlen;
1065 
1066 		skb->ip_summed = CHECKSUM_PARTIAL;
1067 		skb->csum = 0;
1068 		sk->sk_sndmsg_off = 0;
1069 	}
1070 
1071 	err = skb_append_datato_frags(sk, skb, getfrag, from,
1072 				      (length - transhdrlen));
1073 	if (!err) {
1074 		struct frag_hdr fhdr;
1075 
1076 		/* specify the length of each IP datagram fragment */
1077 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1078 					    sizeof(struct frag_hdr);
1079 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1080 		ipv6_select_ident(skb, &fhdr);
1081 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1082 		__skb_queue_tail(&sk->sk_write_queue, skb);
1083 
1084 		return 0;
1085 	}
1086 	/* There is not enough support to do UDP LSO,
1087 	 * so follow the normal path
1088 	 */
1089 	kfree_skb(skb);
1090 
1091 	return err;
1092 }
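/*
 * Example (hedged): for UFO the device segments one large skb, so
 * gso_size is the data carried per on-wire fragment. With mtu = 1500
 * and fragheaderlen = 40 (IPv6 header only):
 *
 *	gso_size = 1500 - 40 - 8 = 1452
 *
 * and the ip6_frag_id chosen once via ipv6_select_ident() becomes the
 * Identification value of every fragment the device emits.
 */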
1093 
1094 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1095 	int offset, int len, int odd, struct sk_buff *skb),
1096 	void *from, int length, int transhdrlen,
1097 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1098 	struct rt6_info *rt, unsigned int flags)
1099 {
1100 	struct inet_sock *inet = inet_sk(sk);
1101 	struct ipv6_pinfo *np = inet6_sk(sk);
1102 	struct sk_buff *skb;
1103 	unsigned int maxfraglen, fragheaderlen;
1104 	int exthdrlen;
1105 	int hh_len;
1106 	int mtu;
1107 	int copy;
1108 	int err;
1109 	int offset = 0;
1110 	int csummode = CHECKSUM_NONE;
1111 
1112 	if (flags&MSG_PROBE)
1113 		return 0;
1114 	if (skb_queue_empty(&sk->sk_write_queue)) {
1115 		/*
1116 		 * setup for corking
1117 		 */
1118 		if (opt) {
1119 			if (np->cork.opt == NULL) {
1120 				np->cork.opt = kmalloc(opt->tot_len,
1121 						       sk->sk_allocation);
1122 				if (unlikely(np->cork.opt == NULL))
1123 					return -ENOBUFS;
1124 			} else if (np->cork.opt->tot_len < opt->tot_len) {
1125 				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1126 				return -EINVAL;
1127 			}
1128 			memcpy(np->cork.opt, opt, opt->tot_len);
1129 			inet->cork.flags |= IPCORK_OPT;
1130 			/* need source address above --miyazawa */
1131 		}
1132 		dst_hold(&rt->u.dst);
1133 		inet->cork.dst = &rt->u.dst;
1134 		inet->cork.fl = *fl;
1135 		np->cork.hop_limit = hlimit;
1136 		np->cork.tclass = tclass;
1137 		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1138 		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1139 		if (np->frag_size < mtu) {
1140 			if (np->frag_size)
1141 				mtu = np->frag_size;
1142 		}
1143 		inet->cork.fragsize = mtu;
1144 		if (dst_allfrag(rt->u.dst.path))
1145 			inet->cork.flags |= IPCORK_ALLFRAG;
1146 		inet->cork.length = 0;
1147 		sk->sk_sndmsg_page = NULL;
1148 		sk->sk_sndmsg_off = 0;
1149 		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1150 			    rt->rt6i_nfheader_len;
1151 		length += exthdrlen;
1152 		transhdrlen += exthdrlen;
1153 	} else {
1154 		rt = (struct rt6_info *)inet->cork.dst;
1155 		fl = &inet->cork.fl;
1156 		if (inet->cork.flags & IPCORK_OPT)
1157 			opt = np->cork.opt;
1158 		transhdrlen = 0;
1159 		exthdrlen = 0;
1160 		mtu = inet->cork.fragsize;
1161 	}
1162 
1163 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1164 
1165 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1166 			(opt ? opt->opt_nflen : 0);
1167 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1168 
1169 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1170 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1171 			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1172 			return -EMSGSIZE;
1173 		}
1174 	}
1175 
1176 	/*
1177 	 * Let's try using as much space as possible.
1178 	 * Use MTU if total length of the message fits into the MTU.
1179 	 * Otherwise, we need to reserve fragment header and
1180 	 * fragment alignment (= 8-15 octets, in total).
1181 	 *
1182 	 * Note that we may need to "move" the data from the tail
1183 	 * of the buffer to the new fragment when we split
1184 	 * the message.
1185 	 *
1186 	 * FIXME: It may be fragmented into multiple chunks
1187 	 *        at once if non-fragmentable extension headers
1188 	 *        are too large.
1189 	 * --yoshfuji
1190 	 */
1191 
1192 	inet->cork.length += length;
1193 	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1194 	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
1195 
1196 		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1197 					  fragheaderlen, transhdrlen, mtu,
1198 					  flags);
1199 		if (err)
1200 			goto error;
1201 		return 0;
1202 	}
1203 
1204 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1205 		goto alloc_new_skb;
1206 
1207 	while (length > 0) {
1208 		/* Check if the remaining data fits into current packet. */
1209 		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1210 		if (copy < length)
1211 			copy = maxfraglen - skb->len;
1212 
1213 		if (copy <= 0) {
1214 			char *data;
1215 			unsigned int datalen;
1216 			unsigned int fraglen;
1217 			unsigned int fraggap;
1218 			unsigned int alloclen;
1219 			struct sk_buff *skb_prev;
1220 alloc_new_skb:
1221 			skb_prev = skb;
1222 
1223 			/* There's no room in the current skb */
1224 			if (skb_prev)
1225 				fraggap = skb_prev->len - maxfraglen;
1226 			else
1227 				fraggap = 0;
1228 
1229 			/*
1230 			 * If remaining data exceeds the mtu,
1231 			 * we know we need more fragment(s).
1232 			 */
1233 			datalen = length + fraggap;
1234 			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1235 				datalen = maxfraglen - fragheaderlen;
1236 
1237 			fraglen = datalen + fragheaderlen;
1238 			if ((flags & MSG_MORE) &&
1239 			    !(rt->u.dst.dev->features&NETIF_F_SG))
1240 				alloclen = mtu;
1241 			else
1242 				alloclen = datalen + fragheaderlen;
1243 
1244 			/*
1245 			 * The last fragment gets additional space at tail.
1246 			 * Note: we overallocate on fragments with MSG_MORE
1247 			 * because we have no idea if we're the last one.
1248 			 */
1249 			if (datalen == length + fraggap)
1250 				alloclen += rt->u.dst.trailer_len;
1251 
1252 			/*
1253 			 * We just reserve space for fragment header.
1254 			 * Note: this may be overallocation if the message
1255 			 * (without MSG_MORE) fits into the MTU.
1256 			 */
1257 			alloclen += sizeof(struct frag_hdr);
1258 
1259 			if (transhdrlen) {
1260 				skb = sock_alloc_send_skb(sk,
1261 						alloclen + hh_len,
1262 						(flags & MSG_DONTWAIT), &err);
1263 			} else {
1264 				skb = NULL;
1265 				if (atomic_read(&sk->sk_wmem_alloc) <=
1266 				    2 * sk->sk_sndbuf)
1267 					skb = sock_wmalloc(sk,
1268 							   alloclen + hh_len, 1,
1269 							   sk->sk_allocation);
1270 				if (unlikely(skb == NULL))
1271 					err = -ENOBUFS;
1272 			}
1273 			if (skb == NULL)
1274 				goto error;
1275 			/*
1276 			 *	Fill in the control structures
1277 			 */
1278 			skb->ip_summed = csummode;
1279 			skb->csum = 0;
1280 			/* reserve for fragmentation */
1281 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1282 
1283 			/*
1284 			 *	Find where to start putting bytes
1285 			 */
1286 			data = skb_put(skb, fraglen);
1287 			skb_set_network_header(skb, exthdrlen);
1288 			data += fragheaderlen;
1289 			skb->transport_header = (skb->network_header +
1290 						 fragheaderlen);
1291 			if (fraggap) {
1292 				skb->csum = skb_copy_and_csum_bits(
1293 					skb_prev, maxfraglen,
1294 					data + transhdrlen, fraggap, 0);
1295 				skb_prev->csum = csum_sub(skb_prev->csum,
1296 							  skb->csum);
1297 				data += fraggap;
1298 				pskb_trim_unique(skb_prev, maxfraglen);
1299 			}
1300 			copy = datalen - transhdrlen - fraggap;
1301 			if (copy < 0) {
1302 				err = -EINVAL;
1303 				kfree_skb(skb);
1304 				goto error;
1305 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1306 				err = -EFAULT;
1307 				kfree_skb(skb);
1308 				goto error;
1309 			}
1310 
1311 			offset += copy;
1312 			length -= datalen - fraggap;
1313 			transhdrlen = 0;
1314 			exthdrlen = 0;
1315 			csummode = CHECKSUM_NONE;
1316 
1317 			/*
1318 			 * Put the packet on the pending queue
1319 			 */
1320 			__skb_queue_tail(&sk->sk_write_queue, skb);
1321 			continue;
1322 		}
1323 
1324 		if (copy > length)
1325 			copy = length;
1326 
1327 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1328 			unsigned int off;
1329 
1330 			off = skb->len;
1331 			if (getfrag(from, skb_put(skb, copy),
1332 						offset, copy, off, skb) < 0) {
1333 				__skb_trim(skb, off);
1334 				err = -EFAULT;
1335 				goto error;
1336 			}
1337 		} else {
1338 			int i = skb_shinfo(skb)->nr_frags;
1339 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1340 			struct page *page = sk->sk_sndmsg_page;
1341 			int off = sk->sk_sndmsg_off;
1342 			unsigned int left;
1343 
1344 			if (page && (left = PAGE_SIZE - off) > 0) {
1345 				if (copy >= left)
1346 					copy = left;
1347 				if (page != frag->page) {
1348 					if (i == MAX_SKB_FRAGS) {
1349 						err = -EMSGSIZE;
1350 						goto error;
1351 					}
1352 					get_page(page);
1353 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1354 					frag = &skb_shinfo(skb)->frags[i];
1355 				}
1356 			} else if (i < MAX_SKB_FRAGS) {
1357 				if (copy > PAGE_SIZE)
1358 					copy = PAGE_SIZE;
1359 				page = alloc_pages(sk->sk_allocation, 0);
1360 				if (page == NULL) {
1361 					err = -ENOMEM;
1362 					goto error;
1363 				}
1364 				sk->sk_sndmsg_page = page;
1365 				sk->sk_sndmsg_off = 0;
1366 
1367 				skb_fill_page_desc(skb, i, page, 0, 0);
1368 				frag = &skb_shinfo(skb)->frags[i];
1369 			} else {
1370 				err = -EMSGSIZE;
1371 				goto error;
1372 			}
1373 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1374 				err = -EFAULT;
1375 				goto error;
1376 			}
1377 			sk->sk_sndmsg_off += copy;
1378 			frag->size += copy;
1379 			skb->len += copy;
1380 			skb->data_len += copy;
1381 			skb->truesize += copy;
1382 			atomic_add(copy, &sk->sk_wmem_alloc);
1383 		}
1384 		offset += copy;
1385 		length -= copy;
1386 	}
1387 	return 0;
1388 error:
1389 	inet->cork.length -= length;
1390 	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1391 	return err;
1392 }
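/*
 * Corking sketch (hedged): with MSG_MORE set on all but the last call,
 * successive ip6_append_data() invocations only queue data on
 * sk->sk_write_queue; nothing reaches the wire until
 * ip6_push_pending_frames() below merges the queue into one packet and
 * sends it:
 *
 *	ip6_append_data(sk, getfrag, buf1, len1, ..., flags | MSG_MORE);
 *	ip6_append_data(sk, getfrag, buf2, len2, ..., flags);
 *	ip6_push_pending_frames(sk);
 *
 * ip6_flush_pending_frames() is the error-path counterpart that drops
 * the queued skbs instead of transmitting them.
 */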
1393 
1394 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1395 {
1396 	inet->cork.flags &= ~IPCORK_OPT;
1397 	kfree(np->cork.opt);
1398 	np->cork.opt = NULL;
1399 	if (inet->cork.dst) {
1400 		dst_release(inet->cork.dst);
1401 		inet->cork.dst = NULL;
1402 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1403 	}
1404 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1405 }
1406 
1407 int ip6_push_pending_frames(struct sock *sk)
1408 {
1409 	struct sk_buff *skb, *tmp_skb;
1410 	struct sk_buff **tail_skb;
1411 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1412 	struct inet_sock *inet = inet_sk(sk);
1413 	struct ipv6_pinfo *np = inet6_sk(sk);
1414 	struct ipv6hdr *hdr;
1415 	struct ipv6_txoptions *opt = np->cork.opt;
1416 	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1417 	struct flowi *fl = &inet->cork.fl;
1418 	unsigned char proto = fl->proto;
1419 	int err = 0;
1420 
1421 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1422 		goto out;
1423 	tail_skb = &(skb_shinfo(skb)->frag_list);
1424 
1425 	/* move skb->data from the ext header to the IP header */
1426 	if (skb->data < skb_network_header(skb))
1427 		__skb_pull(skb, skb_network_offset(skb));
1428 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1429 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1430 		*tail_skb = tmp_skb;
1431 		tail_skb = &(tmp_skb->next);
1432 		skb->len += tmp_skb->len;
1433 		skb->data_len += tmp_skb->len;
1434 		skb->truesize += tmp_skb->truesize;
1435 		__sock_put(tmp_skb->sk);
1436 		tmp_skb->destructor = NULL;
1437 		tmp_skb->sk = NULL;
1438 	}
1439 
1440 	/* Allow local fragmentation. */
1441 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1442 		skb->local_df = 1;
1443 
1444 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
1445 	__skb_pull(skb, skb_network_header_len(skb));
1446 	if (opt && opt->opt_flen)
1447 		ipv6_push_frag_opts(skb, opt, &proto);
1448 	if (opt && opt->opt_nflen)
1449 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1450 
1451 	skb_push(skb, sizeof(struct ipv6hdr));
1452 	skb_reset_network_header(skb);
1453 	hdr = ipv6_hdr(skb);
1454 
1455 	*(__be32*)hdr = fl->fl6_flowlabel |
1456 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1457 
1458 	hdr->hop_limit = np->cork.hop_limit;
1459 	hdr->nexthdr = proto;
1460 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1461 	ipv6_addr_copy(&hdr->daddr, final_dst);
1462 
1463 	skb->priority = sk->sk_priority;
1464 	skb->mark = sk->sk_mark;
1465 
1466 	skb->dst = dst_clone(&rt->u.dst);
1467 	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1468 	if (proto == IPPROTO_ICMPV6) {
1469 		struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1470 
1471 		ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1472 		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1473 	}
1474 
1475 	err = ip6_local_out(skb);
1476 	if (err) {
1477 		if (err > 0)
1478 			err = np->recverr ? net_xmit_errno(err) : 0;
1479 		if (err)
1480 			goto error;
1481 	}
1482 
1483 out:
1484 	ip6_cork_release(inet, np);
1485 	return err;
1486 error:
1487 	goto out;
1488 }
1489 
1490 void ip6_flush_pending_frames(struct sock *sk)
1491 {
1492 	struct sk_buff *skb;
1493 
1494 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1495 		if (skb->dst)
1496 			IP6_INC_STATS(ip6_dst_idev(skb->dst),
1497 				      IPSTATS_MIB_OUTDISCARDS);
1498 		kfree_skb(skb);
1499 	}
1500 
1501 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1502 }
1503