xref: /linux/net/ipv6/ip6_output.c (revision 092e0e7e520a1fca03e13c9f2d157432a8657ff2)
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

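/*
 * nf_hook() returns 1 when the netfilter verdict is "accept, caller should
 * continue", so ip6_local_out() below only invokes dst_output() for a
 * return value of 1; anything else means the packet was stolen, queued or
 * dropped by a hook.
 */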
int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

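/*
 * For multicast that must also be heard locally, ip6_finish_output2()
 * below queues a clone through the POST_ROUTING hook with
 * ip6_dev_loopback_xmit() as the output function; netif_rx_ni() then
 * re-injects that clone into the receive path as if it had arrived on
 * the wire.
 */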
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

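/*
 * With IPV6_PMTUDISC_PROBE the socket intentionally bypasses the cached
 * path MTU and uses the device MTU instead, so that packets larger than
 * the current path MTU can be sent to probe for a bigger one.
 */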
static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

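	/*
	 * The first 32 bits of the IPv6 header: 4-bit version (6), 8-bit
	 * traffic class (shifted to bits 20-27 here) and the 20-bit flow
	 * label in the low bits; fl->fl6_flowlabel is already in network
	 * byte order.
	 */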
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

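/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * registered for this alert value.  Every matching socket but the last
 * gets a clone; the original skb goes to the final match, which saves one
 * copy in the common single-listener case.
 */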
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do NOT do any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any warranty that the application will be able
	 *	to interpret them. The reason is that we
	 *	cannot do anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.
	 *	Defragmentation would also be a mistake: RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
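	/*
	 * opt->ra is the offset of the Router Alert option (RFC 2711:
	 * type 5, length 2) within the hop-by-hop header, so ptr[2] and
	 * ptr[3] below carry the 16-bit alert value (0 = MLD, 1 = RSVP).
	 */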
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force the OUTPUT device to be used for the source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force the OUTPUT device to be used for the source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

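/*
 * Return the length of the unfragmentable part of the packet and leave
 * *nexthdr pointing at the nexthdr byte that will later be overwritten
 * with NEXTHDR_FRAGMENT.  Per RFC 2460 the unfragmentable part is the
 * IPv6 header plus the extension headers that every node on the path has
 * to process: hop-by-hop, routing, and destination options that precede
 * a routing header (or that carry a Home Address option, for MIPv6).
 */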
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

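	/*
	 * mtu is now the payload budget of each fragment: e.g. a 1500-byte
	 * device MTU with a bare 40-byte IPv6 header leaves
	 * 1500 - 40 - 8 = 1452 bytes, trimmed further to a multiple of 8
	 * for all but the last fragment.
	 *
	 * Fast path: if the packet already sits in a frag_list whose
	 * members match the fragment geometry, each list member is turned
	 * into a fragment in place; otherwise we fall through to the
	 * copying slow path below.
	 */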
	if (skb_has_frags(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

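	/*
	 * Non-final fragments carry a multiple of 8 bytes, so the byte
	 * offset always has its low three bits clear; htons(offset)
	 * therefore forms the frag_off field directly, with the low bits
	 * free for the flags (IP6_MF is OR'ed in below).
	 */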
	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * or MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is TCP, which does not have this problem),
	 *    so this last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here, if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
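
/*
 * A minimal usage sketch (hypothetical caller; the address variable is a
 * placeholder):
 *
 *	struct flowi fl;
 *	struct dst_entry *dst;
 *	int err;
 *
 *	memset(&fl, 0, sizeof(fl));
 *	fl.proto = IPPROTO_UDP;
 *	ipv6_addr_copy(&fl.fl6_dst, &daddr);
 *	err = ip6_dst_lookup(sk, &dst, &fl);
 *	if (err)
 *		return err;	(*dst has been set to NULL on failure)
 *	... use dst, then dst_release(dst) ...
 *
 * ip6_sk_dst_lookup() below is the variant that first tries the route
 * cached in the socket.
 */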

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb containing the complete
	 * UDP datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
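		/* e.g. mtu 1500, fragheaderlen 40:
		 * (1500 - 40 - 8) & ~7 = 1448 bytes of payload per fragment.
		 */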
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above. --miyazawa */
		}
		dst_hold(&rt->dst);
		inet->cork.dst = &rt->dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
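	/*
	 * e.g. mtu 1500 and fragheaderlen 40 (plain IPv6 header):
	 * maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488, the largest skb
	 * length (counted from the network header) that still leaves room
	 * for the 8-byte fragment header and keeps fragment payloads
	 * aligned to 8 bytes.
	 */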

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if the total length of the message fits into the MTU.
	 * Otherwise, we need to reserve the fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at the tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
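
/*
 * A minimal sketch of how the corking API fits together, modelled on the
 * datagram senders (udpv6_sendmsg is the canonical caller; the names here
 * are illustrative, not an exact copy of that code):
 *
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen,
 *			      transhdrlen, hlimit, tclass, opt, &fl,
 *			      rt, msg->msg_flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corkreq)
 *		err = ip6_push_pending_frames(sk);
 *
 * ip6_append_data() may be called repeatedly while the socket stays
 * corked; ip6_push_pending_frames() below then coalesces the queued skbs
 * into one packet, prepends the IPv6 header and hands the result to
 * ip6_local_out().
 */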

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to the ip header from the ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}