xref: /linux/net/ipv6/ip6_output.c (revision a115bc070b1fc57ab23f3972401425927b5b465c)
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	: 	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

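/*
 * Set the IPv6 payload length from skb->len (zero if it would exceed
 * IPV6_MAXPLEN, i.e. a jumbogram) and run the netfilter LOCAL_OUT hook.
 * Returns 1 when the caller should go on to transmit via dst_output().
 */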
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}

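/*
 * Finish a local output: run __ip6_local_out() and, unless netfilter
 * stole or dropped the packet, hand it to dst_output().
 */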
int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

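/*
 * Final transmission step: use the cached hardware header if one
 * exists, otherwise ask the neighbour entry to resolve and transmit.
 * Packets without a usable neighbour are counted and dropped.
 */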
static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx(newskb);
	return 0;
}

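/*
 * Post-routing output. Multicast packets may additionally be cloned
 * and looped back to local listeners (and, in that case, discarded
 * once their hop limit is zero) before the packet passes the
 * POST_ROUTING hook.
 */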
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev)) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

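/*
 * Route MTU for this skb: the raw device MTU when the owning socket
 * probes path MTU itself (IPV6_PMTUDISC_PROBE), else the dst MTU.
 */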
static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

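/*
 * Output entry point installed in the dst. Drops everything when IPv6
 * is administratively disabled on the device, and fragments packets
 * that exceed the path MTU (or when the route demands fragmentation).
 */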
int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
				dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: extension headers may take lots of space
		   (~8K for now); MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
				dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication, but I really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

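/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered for this alert value (and matches the bound device,
 * if any). Returns 1 if the packet was consumed by at least one socket.
 */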
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

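/*
 * Decide what to do with a packet for a proxied (NDP proxy) address:
 * 1 means hand it to local input (unicast neighbour discovery),
 * -1 means drop after signalling link failure (link-local destination),
 * 0 means forward as usual.
 */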
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Unicast neighbour discovery messages destined
			 * to the proxied address are passed to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

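/*
 * Forwarding path: validate the packet (hop limit, xfrm policy, source
 * address sanity), handle Router Alert and proxy NDP special cases,
 * emit redirects or "packet too big" errors where required, then
 * decrement the hop limit and pass the packet to the FORWARD hook.
 */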
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do not process RA packets; we push them to user level
	 *	AS IS, without any warranty that an application will be
	 *	able to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP, we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	Check and decrement the hop limit.
	 */
	if (hdr->hop_limit <= 1) {
		/* Force the OUTPUT device to be used for the source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects for source-routed frames.
	   We also don't send redirects for frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	The incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu) {
		/* Again, force the OUTPUT device to be used for the
		   source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

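/*
 * Copy per-packet metadata (type, priority, dst, marks, netfilter and
 * security state) from the original skb to a freshly built fragment.
 */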
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

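/*
 * Walk the extension header chain to find the byte offset at which a
 * fragment header must be inserted; *nexthdr is left pointing at the
 * "next header" field to patch. Only hop-by-hop, routing and certain
 * destination option headers belong to the unfragmentable part.
 */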
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

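/*
 * Fragment an oversized packet. The fast path re-uses an existing
 * frag_list when its geometry already matches the MTU; otherwise the
 * slow path allocates and fills a fresh skb per fragment. Each
 * fragment goes out through the supplied output callback.
 */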
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (!skb->local_df) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frags(skb)) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

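/*
 * Helper for ip6_sk_dst_check(): returns nonzero when neither the
 * route key (for host routes) nor the cached address still matches
 * the flow address, i.e. the cached route can no longer be trusted.
 */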
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

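/*
 * Validate a dst cached in the socket against the current flow;
 * releases it and returns NULL when the destination, source (with
 * subtrees) or outgoing interface no longer match.
 */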
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the unconnected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using a saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is TCP, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif should also be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

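/*
 * Common tail of the dst lookup helpers: perform the route lookup if
 * needed and pick a source address when the flow leaves it
 * unspecified. With optimistic DAD, an unresolved next hop combined
 * with an optimistic source address makes us redo the lookup towards
 * the default router instead.
 */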
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour
	 * entry that is not in the VALID state and the
	 * source address from the flow is marked as
	 * OPTIMISTIC, we release the found dst entry and
	 * replace it instead with the dst entry of the
	 * nexthop router.
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the
	 * complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

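/*
 * Queue data on the socket's write queue, building packets of up to
 * (path) MTU size and leaving room for a fragment header. The first
 * call on an empty queue corks the socket: it pins the route, copies
 * the options and records the cork parameters used by later calls and
 * by ip6_push_pending_frames(). UDP may take the UFO path instead and
 * build one large GSO packet.
 */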
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above. --miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use the MTU if the total length of the message fits into the MTU.
	 * Otherwise, we need to reserve a fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at the tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    page_address(frag->page) +
				    frag->page_offset + frag->size,
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

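/*
 * Undo the corking state set up by ip6_append_data(): free the copied
 * options, drop the route reference and clear the cached flow.
 */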
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

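/*
 * Collapse the queued skbs into one packet (tail skbs become the
 * frag_list of the first), push the extension headers and the IPv6
 * header, account the output and send via ip6_local_out().
 */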
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to the IP header, away from the extension headers */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->u.dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

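/*
 * Discard everything queued by ip6_append_data() and release the
 * corking state, counting each dropped packet as an output discard.
 */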
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}