xref: /linux/net/ipv6/ip6_output.c (revision dfff0fa65ab15db45acd64b3189787d37ab163cd)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO (network byte order)
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
43 
44 #include <net/sock.h>
45 #include <net/snmp.h>
46 
47 #include <net/ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
53 #include <net/icmp.h>
54 #include <net/xfrm.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
57 
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
59 
60 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
61 {
62 	static u32 ipv6_fragmentation_id = 1;
63 	static DEFINE_SPINLOCK(ip6_id_lock);
64 
65 	spin_lock_bh(&ip6_id_lock);
66 	fhdr->identification = htonl(ipv6_fragmentation_id);
67 	if (++ipv6_fragmentation_id == 0)
68 		ipv6_fragmentation_id = 1;
69 	spin_unlock_bh(&ip6_id_lock);
70 }
71 
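/*
 * Illustrative sketch (not part of this file): ipv6_select_ident() above
 * hands out fragment IDs from one global counter and skips 0 on wrap, so
 * 0 never appears on the wire as an identification value.  A minimal,
 * single-threaded model (the kernel serializes with ip6_id_lock); the
 * sketch_* name is hypothetical and the block is not compiled in.
 */
#if 0
#include <stdint.h>

static uint32_t sketch_next_frag_id(void)
{
	static uint32_t id = 1;		/* same seed as ipv6_fragmentation_id */
	uint32_t ret = id;

	if (++id == 0)			/* skip 0 after the 32-bit wrap */
		id = 1;
	return ret;			/* stored on the wire as htonl(ret) */
}
#endif
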
72 int __ip6_local_out(struct sk_buff *skb)
73 {
74 	int len;
75 
76 	len = skb->len - sizeof(struct ipv6hdr);
77 	if (len > IPV6_MAXPLEN)
78 		len = 0;
79 	ipv6_hdr(skb)->payload_len = htons(len);
80 
81 	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
82 		       dst_output);
83 }
84 
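/*
 * Illustrative sketch (not part of this file): payload_len, as set by
 * __ip6_local_out() above, counts only the bytes after the fixed 40-byte
 * IPv6 header and is clamped to 0 when it would not fit in 16 bits;
 * 0 is also the value a jumbogram (RFC 2675) carries in this field.
 * The sketch_* names are hypothetical and the block is not compiled in.
 */
#if 0
#include <stdint.h>

#define SKETCH_IPV6_HDRLEN	40
#define SKETCH_IPV6_MAXPLEN	65535

static uint16_t sketch_payload_len(uint32_t skb_len)
{
	uint32_t len = skb_len - SKETCH_IPV6_HDRLEN;

	if (len > SKETCH_IPV6_MAXPLEN)	/* oversized: jumbogram encoding */
		len = 0;
	return (uint16_t)len;		/* written as htons(len) on the wire */
}
#endif
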
85 int ip6_local_out(struct sk_buff *skb)
86 {
87 	int err;
88 
89 	err = __ip6_local_out(skb);
90 	if (likely(err == 1))
91 		err = dst_output(skb);
92 
93 	return err;
94 }
95 EXPORT_SYMBOL_GPL(ip6_local_out);
96 
97 static int ip6_output_finish(struct sk_buff *skb)
98 {
99 	struct dst_entry *dst = skb_dst(skb);
100 
101 	if (dst->hh)
102 		return neigh_hh_output(dst->hh, skb);
103 	else if (dst->neighbour)
104 		return dst->neighbour->output(skb);
105 
106 	IP6_INC_STATS_BH(dev_net(dst->dev),
107 			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
108 	kfree_skb(skb);
109 	return -EINVAL;
110 
111 }
112 
113 /* dev_loopback_xmit for use with netfilter. */
114 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
115 {
116 	skb_reset_mac_header(newskb);
117 	__skb_pull(newskb, skb_network_offset(newskb));
118 	newskb->pkt_type = PACKET_LOOPBACK;
119 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
120 	WARN_ON(!skb_dst(newskb));
121 
122 	netif_rx(newskb);
123 	return 0;
124 }
125 
126 
127 static int ip6_output2(struct sk_buff *skb)
128 {
129 	struct dst_entry *dst = skb_dst(skb);
130 	struct net_device *dev = dst->dev;
131 
132 	skb->protocol = htons(ETH_P_IPV6);
133 	skb->dev = dev;
134 
135 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
136 		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
137 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
138 
139 		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
140 		    ((mroute6_socket(dev_net(dev)) &&
141 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
142 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
143 					 &ipv6_hdr(skb)->saddr))) {
144 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
145 
146 			/* Do not check for IFF_ALLMULTI; multicast routing
147 			   is not supported in any case.
148 			 */
149 			if (newskb)
150 				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
151 					NULL, newskb->dev,
152 					ip6_dev_loopback_xmit);
153 
154 			if (ipv6_hdr(skb)->hop_limit == 0) {
155 				IP6_INC_STATS(dev_net(dev), idev,
156 					      IPSTATS_MIB_OUTDISCARDS);
157 				kfree_skb(skb);
158 				return 0;
159 			}
160 		}
161 
162 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
163 				skb->len);
164 	}
165 
166 	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
167 		       ip6_output_finish);
168 }
169 
170 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
171 {
172 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
173 
174 	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
175 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
176 }
177 
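/*
 * Illustrative sketch (not part of this file): ip6_skb_dst_mtu() above
 * prefers the raw device MTU over the cached path MTU when the socket
 * selected IPV6_PMTUDISC_PROBE, so probing sockets can deliberately send
 * packets larger than the learned path MTU.  Hypothetical sketch_* name;
 * not compiled in.
 */
#if 0
static unsigned int sketch_mtu_choice(int pmtudisc_probe,
				      unsigned int dev_mtu,
				      unsigned int path_mtu)
{
	/* probing ignores the learned path MTU on purpose */
	return pmtudisc_probe ? dev_mtu : path_mtu;
}
#endif
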
178 int ip6_output(struct sk_buff *skb)
179 {
180 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
181 	if (unlikely(idev->cnf.disable_ipv6)) {
182 		IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
183 			      IPSTATS_MIB_OUTDISCARDS);
184 		kfree_skb(skb);
185 		return 0;
186 	}
187 
188 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
189 				dst_allfrag(skb_dst(skb)))
190 		return ip6_fragment(skb, ip6_output2);
191 	else
192 		return ip6_output2(skb);
193 }
194 
195 /*
196  *	xmit an sk_buff (used by TCP)
197  */
198 
199 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
200 	     struct ipv6_txoptions *opt, int ipfragok)
201 {
202 	struct net *net = sock_net(sk);
203 	struct ipv6_pinfo *np = inet6_sk(sk);
204 	struct in6_addr *first_hop = &fl->fl6_dst;
205 	struct dst_entry *dst = skb_dst(skb);
206 	struct ipv6hdr *hdr;
207 	u8  proto = fl->proto;
208 	int seg_len = skb->len;
209 	int hlimit, tclass;
210 	u32 mtu;
211 
212 	if (opt) {
213 		unsigned int head_room;
214 
215 		/* First: exthdrs may take lots of space (~8K for now);
216 		   MAX_HEADER is not enough.
217 		 */
218 		head_room = opt->opt_nflen + opt->opt_flen;
219 		seg_len += head_room;
220 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
221 
222 		if (skb_headroom(skb) < head_room) {
223 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
224 			if (skb2 == NULL) {
225 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
226 					      IPSTATS_MIB_OUTDISCARDS);
227 				kfree_skb(skb);
228 				return -ENOBUFS;
229 			}
230 			kfree_skb(skb);
231 			skb = skb2;
232 			if (sk)
233 				skb_set_owner_w(skb, sk);
234 		}
235 		if (opt->opt_flen)
236 			ipv6_push_frag_opts(skb, opt, &proto);
237 		if (opt->opt_nflen)
238 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
239 	}
240 
241 	skb_push(skb, sizeof(struct ipv6hdr));
242 	skb_reset_network_header(skb);
243 	hdr = ipv6_hdr(skb);
244 
245 	/* Allow local fragmentation. */
246 	if (ipfragok)
247 		skb->local_df = 1;
248 
249 	/*
250 	 *	Fill in the IPv6 header
251 	 */
252 
253 	hlimit = -1;
254 	if (np)
255 		hlimit = np->hop_limit;
256 	if (hlimit < 0)
257 		hlimit = ip6_dst_hoplimit(dst);
258 
259 	tclass = -1;
260 	if (np)
261 		tclass = np->tclass;
262 	if (tclass < 0)
263 		tclass = 0;
264 
265 	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
266 
267 	hdr->payload_len = htons(seg_len);
268 	hdr->nexthdr = proto;
269 	hdr->hop_limit = hlimit;
270 
271 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
272 	ipv6_addr_copy(&hdr->daddr, first_hop);
273 
274 	skb->priority = sk->sk_priority;
275 	skb->mark = sk->sk_mark;
276 
277 	mtu = dst_mtu(dst);
278 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
279 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
280 			      IPSTATS_MIB_OUT, skb->len);
281 		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
282 				dst_output);
283 	}
284 
285 	if (net_ratelimit())
286 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
287 	skb->dev = dst->dev;
288 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
289 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
290 	kfree_skb(skb);
291 	return -EMSGSIZE;
292 }
293 
294 EXPORT_SYMBOL(ip6_xmit);
295 
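/*
 * Illustrative sketch (not part of this file): the first 32 bits of the
 * IPv6 header, as assembled in ip6_xmit() above, pack
 *
 *   | 4 bits  | 8 bits        | 20 bits    |
 *   | version | traffic class | flow label |
 *
 * 0x60000000 is version 6 in the top nibble; the traffic class sits at
 * bits 27..20.  (In the kernel code fl6_flowlabel is already big-endian
 * and is OR-ed in after htonl(); here the whole word is built in host
 * order first.)  Hypothetical sketch_* name; not compiled in.
 */
#if 0
#include <stdint.h>
#include <arpa/inet.h>	/* htonl() */

static uint32_t sketch_ip6_word0(uint8_t tclass, uint32_t flowlabel)
{
	return htonl(0x60000000u |
		     ((uint32_t)tclass << 20) |
		     (flowlabel & 0xFFFFFu));
}
#endif
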
296 /*
297  *	To avoid extra problems, ND packets are sent through this
298  *	routine. It's code duplication, but I really want to avoid
299  *	extra checks, since ipv6_build_header is used by TCP (which
300  *	is performance-critical for us)
301  */
302 
303 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
304 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
305 	       int proto, int len)
306 {
307 	struct ipv6_pinfo *np = inet6_sk(sk);
308 	struct ipv6hdr *hdr;
309 	int totlen;
310 
311 	skb->protocol = htons(ETH_P_IPV6);
312 	skb->dev = dev;
313 
314 	totlen = len + sizeof(struct ipv6hdr);
315 
316 	skb_reset_network_header(skb);
317 	skb_put(skb, sizeof(struct ipv6hdr));
318 	hdr = ipv6_hdr(skb);
319 
320 	*(__be32*)hdr = htonl(0x60000000);
321 
322 	hdr->payload_len = htons(len);
323 	hdr->nexthdr = proto;
324 	hdr->hop_limit = np->hop_limit;
325 
326 	ipv6_addr_copy(&hdr->saddr, saddr);
327 	ipv6_addr_copy(&hdr->daddr, daddr);
328 
329 	return 0;
330 }
331 
332 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
333 {
334 	struct ip6_ra_chain *ra;
335 	struct sock *last = NULL;
336 
337 	read_lock(&ip6_ra_lock);
338 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
339 		struct sock *sk = ra->sk;
340 		if (sk && ra->sel == sel &&
341 		    (!sk->sk_bound_dev_if ||
342 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
343 			if (last) {
344 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
345 				if (skb2)
346 					rawv6_rcv(last, skb2);
347 			}
348 			last = sk;
349 		}
350 	}
351 
352 	if (last) {
353 		rawv6_rcv(last, skb);
354 		read_unlock(&ip6_ra_lock);
355 		return 1;
356 	}
357 	read_unlock(&ip6_ra_lock);
358 	return 0;
359 }
360 
361 static int ip6_forward_proxy_check(struct sk_buff *skb)
362 {
363 	struct ipv6hdr *hdr = ipv6_hdr(skb);
364 	u8 nexthdr = hdr->nexthdr;
365 	int offset;
366 
367 	if (ipv6_ext_hdr(nexthdr)) {
368 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
369 		if (offset < 0)
370 			return 0;
371 	} else
372 		offset = sizeof(struct ipv6hdr);
373 
374 	if (nexthdr == IPPROTO_ICMPV6) {
375 		struct icmp6hdr *icmp6;
376 
377 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
378 					 offset + 1 - skb->data)))
379 			return 0;
380 
381 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
382 
383 		switch (icmp6->icmp6_type) {
384 		case NDISC_ROUTER_SOLICITATION:
385 		case NDISC_ROUTER_ADVERTISEMENT:
386 		case NDISC_NEIGHBOUR_SOLICITATION:
387 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
388 		case NDISC_REDIRECT:
389 			/* Pass unicast neighbor discovery messages destined
390 			 * for the proxied address to the input function, so
391 			 * that the proxy can react to them.
392 			 */
393 			return 1;
394 		default:
395 			break;
396 		}
397 	}
398 
399 	/*
400 	 * The proxying router can't forward traffic sent to a link-local
401 	 * address, so signal the sender and discard the packet. This
402 	 * behavior is clarified by the MIPv6 specification.
403 	 */
404 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
405 		dst_link_failure(skb);
406 		return -1;
407 	}
408 
409 	return 0;
410 }
411 
412 static inline int ip6_forward_finish(struct sk_buff *skb)
413 {
414 	return dst_output(skb);
415 }
416 
417 int ip6_forward(struct sk_buff *skb)
418 {
419 	struct dst_entry *dst = skb_dst(skb);
420 	struct ipv6hdr *hdr = ipv6_hdr(skb);
421 	struct inet6_skb_parm *opt = IP6CB(skb);
422 	struct net *net = dev_net(dst->dev);
423 
424 	if (net->ipv6.devconf_all->forwarding == 0)
425 		goto error;
426 
427 	if (skb_warn_if_lro(skb))
428 		goto drop;
429 
430 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
431 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
432 		goto drop;
433 	}
434 
435 	skb_forward_csum(skb);
436 
437 	/*
438 	 *	We do NOT do any processing on
439 	 *	RA packets, pushing them to user level AS IS
440 	 *	without any warranty that the application will be able
441 	 *	to interpret them. The reason is that we
442 	 *	cannot do anything clever here.
443 	 *
444 	 *	We are not an end node, so if the packet contains
445 	 *	AH/ESP, we cannot do anything with it.
446 	 *	Defragmentation would also be a mistake: RA packets
447 	 *	cannot be fragmented, because there is no warranty
448 	 *	that different fragments will travel along one path. --ANK
449 	 */
450 	if (opt->ra) {
451 		u8 *ptr = skb_network_header(skb) + opt->ra;
452 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
453 			return 0;
454 	}
455 
456 	/*
457 	 *	check and decrement the hop limit
458 	 */
459 	if (hdr->hop_limit <= 1) {
460 		/* Force the OUTPUT device to be used for the source address */
461 		skb->dev = dst->dev;
462 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
463 			    0, skb->dev);
464 		IP6_INC_STATS_BH(net,
465 				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
466 
467 		kfree_skb(skb);
468 		return -ETIMEDOUT;
469 	}
470 
471 	/* XXX: idev->cnf.proxy_ndp? */
472 	if (net->ipv6.devconf_all->proxy_ndp &&
473 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
474 		int proxied = ip6_forward_proxy_check(skb);
475 		if (proxied > 0)
476 			return ip6_input(skb);
477 		else if (proxied < 0) {
478 			IP6_INC_STATS(net, ip6_dst_idev(dst),
479 				      IPSTATS_MIB_INDISCARDS);
480 			goto drop;
481 		}
482 	}
483 
484 	if (!xfrm6_route_forward(skb)) {
485 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
486 		goto drop;
487 	}
488 	dst = skb_dst(skb);
489 
490 	/* IPv6 specs say nothing about it, but it is clear that we cannot
491 	   send redirects to source routed frames.
492 	   We don't send redirects to frames decapsulated from IPsec.
493 	 */
494 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
495 	    !skb_sec_path(skb)) {
496 		struct in6_addr *target = NULL;
497 		struct rt6_info *rt;
498 		struct neighbour *n = dst->neighbour;
499 
500 		/*
501 		 *	incoming and outgoing devices are the same;
502 		 *	send a redirect.
503 		 */
504 
505 		rt = (struct rt6_info *) dst;
506 		if ((rt->rt6i_flags & RTF_GATEWAY))
507 			target = (struct in6_addr*)&n->primary_key;
508 		else
509 			target = &hdr->daddr;
510 
511 		/* Limit redirects both by destination (here)
512 		   and by source (inside ndisc_send_redirect)
513 		 */
514 		if (xrlim_allow(dst, 1*HZ))
515 			ndisc_send_redirect(skb, n, target);
516 	} else {
517 		int addrtype = ipv6_addr_type(&hdr->saddr);
518 
519 		/* This check is security critical. */
520 		if (addrtype == IPV6_ADDR_ANY ||
521 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
522 			goto error;
523 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
524 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
525 				ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
526 			goto error;
527 		}
528 	}
529 
530 	if (skb->len > dst_mtu(dst)) {
531 		/* Again, force the OUTPUT device to be used for the source address */
532 		skb->dev = dst->dev;
533 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
534 		IP6_INC_STATS_BH(net,
535 				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
536 		IP6_INC_STATS_BH(net,
537 				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
538 		kfree_skb(skb);
539 		return -EMSGSIZE;
540 	}
541 
542 	if (skb_cow(skb, dst->dev->hard_header_len)) {
543 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
544 		goto drop;
545 	}
546 
547 	hdr = ipv6_hdr(skb);
548 
549 	/* Mangling hops number delayed to point after skb COW */
550 
551 	hdr->hop_limit--;
552 
553 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
554 	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
555 		       ip6_forward_finish);
556 
557 error:
558 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
559 drop:
560 	kfree_skb(skb);
561 	return -EINVAL;
562 }
563 
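/*
 * Illustrative sketch (not part of this file): the Router Alert
 * hop-by-hop option (RFC 2711) that ip6_forward() above hands to
 * ip6_call_ra_chain() is a 4-byte TLV:
 *
 *   opt[0] = type (5), opt[1] = length (2),
 *   opt[2..3] = 16-bit value, big-endian (0 = MLD).
 *
 * (ptr[2] << 8) + ptr[3] in the code above is exactly this big-endian
 * read.  Hypothetical sketch_* name; not compiled in.
 */
#if 0
#include <stdint.h>

static uint16_t sketch_ra_value(const uint8_t *opt)
{
	return (uint16_t)((opt[2] << 8) | opt[3]);
}
#endif
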
564 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
565 {
566 	to->pkt_type = from->pkt_type;
567 	to->priority = from->priority;
568 	to->protocol = from->protocol;
569 	skb_dst_drop(to);
570 	skb_dst_set(to, dst_clone(skb_dst(from)));
571 	to->dev = from->dev;
572 	to->mark = from->mark;
573 
574 #ifdef CONFIG_NET_SCHED
575 	to->tc_index = from->tc_index;
576 #endif
577 	nf_copy(to, from);
578 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
579     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
580 	to->nf_trace = from->nf_trace;
581 #endif
582 	skb_copy_secmark(to, from);
583 }
584 
585 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
586 {
587 	u16 offset = sizeof(struct ipv6hdr);
588 	struct ipv6_opt_hdr *exthdr =
589 				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
590 	unsigned int packet_len = skb->tail - skb->network_header;
591 	int found_rhdr = 0;
592 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
593 
594 	while (offset + 1 <= packet_len) {
595 
596 		switch (**nexthdr) {
597 
598 		case NEXTHDR_HOP:
599 			break;
600 		case NEXTHDR_ROUTING:
601 			found_rhdr = 1;
602 			break;
603 		case NEXTHDR_DEST:
604 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
605 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
606 				break;
607 #endif
608 			if (found_rhdr)
609 				return offset;
610 			break;
611 		default:
612 			return offset;
613 		}
614 
615 		offset += ipv6_optlen(exthdr);
616 		*nexthdr = &exthdr->nexthdr;
617 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
618 						 offset);
619 	}
620 
621 	return offset;
622 }
623 
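/*
 * Illustrative sketch (not part of this file): each extension header
 * walked by ip6_find_1stfragopt() above starts with the same two bytes
 * and advertises its total size as (hdrlen + 1) * 8 octets -- the same
 * formula ipv6_optlen() uses in the loop, and that ip6_opt_dup() uses
 * further below when copying options.  The sketch_* names are
 * hypothetical and the block is not compiled in.
 */
#if 0
#include <stdint.h>
#include <stddef.h>

struct sketch_ext_hdr {
	uint8_t nexthdr;	/* protocol of the following header */
	uint8_t hdrlen;		/* length in 8-octet units, minus 1 */
};

static size_t sketch_ext_hdr_len(const struct sketch_ext_hdr *h)
{
	return ((size_t)h->hdrlen + 1) * 8;
}
#endif
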
624 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
625 {
626 	struct sk_buff *frag;
627 	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
628 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
629 	struct ipv6hdr *tmp_hdr;
630 	struct frag_hdr *fh;
631 	unsigned int mtu, hlen, left, len;
632 	__be32 frag_id = 0;
633 	int ptr, offset = 0, err=0;
634 	u8 *prevhdr, nexthdr = 0;
635 	struct net *net = dev_net(skb_dst(skb)->dev);
636 
637 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
638 	nexthdr = *prevhdr;
639 
640 	mtu = ip6_skb_dst_mtu(skb);
641 
642 	/* We must not fragment if the socket is set to force MTU discovery
643 	 * or if the skb was not generated by a local socket.  (This last
644 	 * check should be redundant, but it's free.)
645 	 */
646 	if (!skb->local_df) {
647 		skb->dev = skb_dst(skb)->dev;
648 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
649 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
650 			      IPSTATS_MIB_FRAGFAILS);
651 		kfree_skb(skb);
652 		return -EMSGSIZE;
653 	}
654 
655 	if (np && np->frag_size < mtu) {
656 		if (np->frag_size)
657 			mtu = np->frag_size;
658 	}
659 	mtu -= hlen + sizeof(struct frag_hdr);
660 
661 	if (skb_has_frags(skb)) {
662 		int first_len = skb_pagelen(skb);
663 		int truesizes = 0;
664 
665 		if (first_len - hlen > mtu ||
666 		    ((first_len - hlen) & 7) ||
667 		    skb_cloned(skb))
668 			goto slow_path;
669 
670 		skb_walk_frags(skb, frag) {
671 			/* Correct geometry. */
672 			if (frag->len > mtu ||
673 			    ((frag->len & 7) && frag->next) ||
674 			    skb_headroom(frag) < hlen)
675 			    goto slow_path;
676 
677 			/* Partially cloned skb? */
678 			if (skb_shared(frag))
679 				goto slow_path;
680 
681 			BUG_ON(frag->sk);
682 			if (skb->sk) {
683 				frag->sk = skb->sk;
684 				frag->destructor = sock_wfree;
685 				truesizes += frag->truesize;
686 			}
687 		}
688 
689 		err = 0;
690 		offset = 0;
691 		frag = skb_shinfo(skb)->frag_list;
692 		skb_frag_list_init(skb);
693 		/* BUILD HEADER */
694 
695 		*prevhdr = NEXTHDR_FRAGMENT;
696 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
697 		if (!tmp_hdr) {
698 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
699 				      IPSTATS_MIB_FRAGFAILS);
700 			return -ENOMEM;
701 		}
702 
703 		__skb_pull(skb, hlen);
704 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
705 		__skb_push(skb, hlen);
706 		skb_reset_network_header(skb);
707 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
708 
709 		ipv6_select_ident(skb, fh);
710 		fh->nexthdr = nexthdr;
711 		fh->reserved = 0;
712 		fh->frag_off = htons(IP6_MF);
713 		frag_id = fh->identification;
714 
715 		first_len = skb_pagelen(skb);
716 		skb->data_len = first_len - skb_headlen(skb);
717 		skb->truesize -= truesizes;
718 		skb->len = first_len;
719 		ipv6_hdr(skb)->payload_len = htons(first_len -
720 						   sizeof(struct ipv6hdr));
721 
722 		dst_hold(&rt->u.dst);
723 
724 		for (;;) {
725 			/* Prepare the header of the next fragment,
726 			 * before the previous one goes down. */
727 			if (frag) {
728 				frag->ip_summed = CHECKSUM_NONE;
729 				skb_reset_transport_header(frag);
730 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
731 				__skb_push(frag, hlen);
732 				skb_reset_network_header(frag);
733 				memcpy(skb_network_header(frag), tmp_hdr,
734 				       hlen);
735 				offset += skb->len - hlen - sizeof(struct frag_hdr);
736 				fh->nexthdr = nexthdr;
737 				fh->reserved = 0;
738 				fh->frag_off = htons(offset);
739 				if (frag->next != NULL)
740 					fh->frag_off |= htons(IP6_MF);
741 				fh->identification = frag_id;
742 				ipv6_hdr(frag)->payload_len =
743 						htons(frag->len -
744 						      sizeof(struct ipv6hdr));
745 				ip6_copy_metadata(frag, skb);
746 			}
747 
748 			err = output(skb);
749 			if (!err)
750 				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
751 					      IPSTATS_MIB_FRAGCREATES);
752 
753 			if (err || !frag)
754 				break;
755 
756 			skb = frag;
757 			frag = skb->next;
758 			skb->next = NULL;
759 		}
760 
761 		kfree(tmp_hdr);
762 
763 		if (err == 0) {
764 			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
765 				      IPSTATS_MIB_FRAGOKS);
766 			dst_release(&rt->u.dst);
767 			return 0;
768 		}
769 
770 		while (frag) {
771 			skb = frag->next;
772 			kfree_skb(frag);
773 			frag = skb;
774 		}
775 
776 		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
777 			      IPSTATS_MIB_FRAGFAILS);
778 		dst_release(&rt->u.dst);
779 		return err;
780 	}
781 
782 slow_path:
783 	left = skb->len - hlen;		/* Space per frame */
784 	ptr = hlen;			/* Where to start from */
785 
786 	/*
787 	 *	Fragment the datagram.
788 	 */
789 
790 	*prevhdr = NEXTHDR_FRAGMENT;
791 
792 	/*
793 	 *	Keep copying data until we run out.
794 	 */
795 	while (left > 0) {
796 		len = left;
797 		/* IF: it doesn't fit, use 'mtu' - the data space left */
798 		if (len > mtu)
799 			len = mtu;
800 		/* IF: we are not sending up to and including the packet end,
801 		   then align the next start on an eight byte boundary */
802 		if (len < left)	{
803 			len &= ~7;
804 		}
805 		/*
806 		 *	Allocate buffer.
807 		 */
808 
809 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
810 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
811 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
812 				      IPSTATS_MIB_FRAGFAILS);
813 			err = -ENOMEM;
814 			goto fail;
815 		}
816 
817 		/*
818 		 *	Set up data on packet
819 		 */
820 
821 		ip6_copy_metadata(frag, skb);
822 		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
823 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
824 		skb_reset_network_header(frag);
825 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
826 		frag->transport_header = (frag->network_header + hlen +
827 					  sizeof(struct frag_hdr));
828 
829 		/*
830 		 *	Charge the memory for the fragment to any owner
831 		 *	it might possess
832 		 */
833 		if (skb->sk)
834 			skb_set_owner_w(frag, skb->sk);
835 
836 		/*
837 		 *	Copy the packet header into the new buffer.
838 		 */
839 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
840 
841 		/*
842 		 *	Build fragment header.
843 		 */
844 		fh->nexthdr = nexthdr;
845 		fh->reserved = 0;
846 		if (!frag_id) {
847 			ipv6_select_ident(skb, fh);
848 			frag_id = fh->identification;
849 		} else
850 			fh->identification = frag_id;
851 
852 		/*
853 		 *	Copy a block of the IP datagram.
854 		 */
855 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
856 			BUG();
857 		left -= len;
858 
859 		fh->frag_off = htons(offset);
860 		if (left > 0)
861 			fh->frag_off |= htons(IP6_MF);
862 		ipv6_hdr(frag)->payload_len = htons(frag->len -
863 						    sizeof(struct ipv6hdr));
864 
865 		ptr += len;
866 		offset += len;
867 
868 		/*
869 		 *	Put this fragment into the sending queue.
870 		 */
871 		err = output(frag);
872 		if (err)
873 			goto fail;
874 
875 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
876 			      IPSTATS_MIB_FRAGCREATES);
877 	}
878 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
879 		      IPSTATS_MIB_FRAGOKS);
880 	kfree_skb(skb);
881 	return err;
882 
883 fail:
884 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
885 		      IPSTATS_MIB_FRAGFAILS);
886 	kfree_skb(skb);
887 	return err;
888 }
889 
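/*
 * Illustrative sketch (not part of this file): slow-path fragment sizing
 * in ip6_fragment() above.  'mtu' has already had the unfragmentable
 * headers plus the 8-byte fragment header subtracted, and every fragment
 * except the last must carry a multiple of 8 payload bytes so the next
 * fragment's offset stays 8-aligned.  Because offsets are always
 * multiples of 8, the byte offset can be stored directly in frag_off:
 * its low 3 bits are free for flags (IP6_MF, more-fragments, is bit 0).
 *
 * Example: left = 2000 payload bytes, usable mtu = 1448
 *   fragment 1: 1448 bytes at offset 0, MF set
 *   fragment 2:  552 bytes at offset 1448, MF clear
 *
 * Hypothetical sketch_* name; not compiled in.
 */
#if 0
static unsigned int sketch_frag_len(unsigned int left, unsigned int mtu)
{
	unsigned int len = left < mtu ? left : mtu;

	if (len < left)		/* not the final fragment: 8-align it */
		len &= ~7u;
	return len;
}
#endif
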
890 static inline int ip6_rt_check(struct rt6key *rt_key,
891 			       struct in6_addr *fl_addr,
892 			       struct in6_addr *addr_cache)
893 {
894 	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
895 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
896 }
897 
898 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
899 					  struct dst_entry *dst,
900 					  struct flowi *fl)
901 {
902 	struct ipv6_pinfo *np = inet6_sk(sk);
903 	struct rt6_info *rt = (struct rt6_info *)dst;
904 
905 	if (!dst)
906 		goto out;
907 
908 	/* Yes, checking route validity in the not-connected
909 	 * case is not very simple. Take into account that
910 	 * we do not support routing by source, TOS,
911 	 * and MSG_DONTROUTE 		--ANK (980726)
912 	 *
913 	 * 1. ip6_rt_check(): If route was host route,
914 	 *    check that cached destination is current.
915 	 *    If it is network route, we still may
916 	 *    check its validity using saved pointer
917 	 *    to the last used address: daddr_cache.
918 	 *    We do not want to save whole address now,
919 	 *    (because the main consumer of this service
920 	 *    is TCP, which does not have this problem),
921 	 *    so that the last trick works only on connected
922 	 *    sockets.
923 	 * 2. oif also should be the same.
924 	 */
925 	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
926 #ifdef CONFIG_IPV6_SUBTREES
927 	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
928 #endif
929 	    (fl->oif && fl->oif != dst->dev->ifindex)) {
930 		dst_release(dst);
931 		dst = NULL;
932 	}
933 
934 out:
935 	return dst;
936 }
937 
938 static int ip6_dst_lookup_tail(struct sock *sk,
939 			       struct dst_entry **dst, struct flowi *fl)
940 {
941 	int err;
942 	struct net *net = sock_net(sk);
943 
944 	if (*dst == NULL)
945 		*dst = ip6_route_output(net, sk, fl);
946 
947 	if ((err = (*dst)->error))
948 		goto out_err_release;
949 
950 	if (ipv6_addr_any(&fl->fl6_src)) {
951 		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
952 					 &fl->fl6_dst,
953 					 sk ? inet6_sk(sk)->srcprefs : 0,
954 					 &fl->fl6_src);
955 		if (err)
956 			goto out_err_release;
957 	}
958 
959 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
960 	/*
961 	 * Here if the dst entry we've looked up
962 	 * has a neighbour entry that is in the INCOMPLETE
963 	 * state and the src address from the flow is
964 	 * marked as OPTIMISTIC, we release the found
965 	 * dst entry and replace it with the
966 	 * dst entry of the nexthop router
967 	 */
968 	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
969 		struct inet6_ifaddr *ifp;
970 		struct flowi fl_gw;
971 		int redirect;
972 
973 		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
974 				      (*dst)->dev, 1);
975 
976 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
977 		if (ifp)
978 			in6_ifa_put(ifp);
979 
980 		if (redirect) {
981 			/*
982 			 * We need to get the dst entry for the
983 			 * default router instead
984 			 */
985 			dst_release(*dst);
986 			memcpy(&fl_gw, fl, sizeof(struct flowi));
987 			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
988 			*dst = ip6_route_output(net, sk, &fl_gw);
989 			if ((err = (*dst)->error))
990 				goto out_err_release;
991 		}
992 	}
993 #endif
994 
995 	return 0;
996 
997 out_err_release:
998 	if (err == -ENETUNREACH)
999 		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1000 	dst_release(*dst);
1001 	*dst = NULL;
1002 	return err;
1003 }
1004 
1005 /**
1006  *	ip6_dst_lookup - perform route lookup on flow
1007  *	@sk: socket which provides route info
1008  *	@dst: pointer to dst_entry * for result
1009  *	@fl: flow to lookup
1010  *
1011  *	This function performs a route lookup on the given flow.
1012  *
1013  *	It returns zero on success, or a standard errno code on error.
1014  */
1015 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1016 {
1017 	*dst = NULL;
1018 	return ip6_dst_lookup_tail(sk, dst, fl);
1019 }
1020 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1021 
1022 /**
1023  *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
1024  *	@sk: socket which provides the dst cache and route info
1025  *	@dst: pointer to dst_entry * for result
1026  *	@fl: flow to lookup
1027  *
1028  *	This function performs a route lookup on the given flow with the
1029  *	possibility of using the cached route in the socket if it is valid.
1030  *	It will take the socket dst lock when operating on the dst cache.
1031  *	As a result, this function can only be used in process context.
1032  *
1033  *	It returns zero on success, or a standard errno code on error.
1034  */
1035 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1036 {
1037 	*dst = NULL;
1038 	if (sk) {
1039 		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1040 		*dst = ip6_sk_dst_check(sk, *dst, fl);
1041 	}
1042 
1043 	return ip6_dst_lookup_tail(sk, dst, fl);
1044 }
1045 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1046 
1047 static inline int ip6_ufo_append_data(struct sock *sk,
1048 			int getfrag(void *from, char *to, int offset, int len,
1049 			int odd, struct sk_buff *skb),
1050 			void *from, int length, int hh_len, int fragheaderlen,
1051 			int transhdrlen, int mtu, unsigned int flags
1052 
1053 {
1054 	struct sk_buff *skb;
1055 	int err;
1056 
1057 	/* The network device supports UDP large send offload,
1058 	 * so create a single skb containing the complete
1059 	 * UDP datagram.
1060 	 */
1061 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1062 		skb = sock_alloc_send_skb(sk,
1063 			hh_len + fragheaderlen + transhdrlen + 20,
1064 			(flags & MSG_DONTWAIT), &err);
1065 		if (skb == NULL)
1066 			return -ENOMEM;
1067 
1068 		/* reserve space for Hardware header */
1069 		skb_reserve(skb, hh_len);
1070 
1071 		/* create space for UDP/IP header */
1072 		skb_put(skb, fragheaderlen + transhdrlen);
1073 
1074 		/* initialize network header pointer */
1075 		skb_reset_network_header(skb);
1076 
1077 		/* initialize protocol header pointer */
1078 		skb->transport_header = skb->network_header + fragheaderlen;
1079 
1080 		skb->ip_summed = CHECKSUM_PARTIAL;
1081 		skb->csum = 0;
1082 		sk->sk_sndmsg_off = 0;
1083 	}
1084 
1085 	err = skb_append_datato_frags(sk, skb, getfrag, from,
1086 				      (length - transhdrlen));
1087 	if (!err) {
1088 		struct frag_hdr fhdr;
1089 
1090 		/* specify the length of each IP datagram fragment */
1091 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1092 					    sizeof(struct frag_hdr);
1093 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1094 		ipv6_select_ident(skb, &fhdr);
1095 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1096 		__skb_queue_tail(&sk->sk_write_queue, skb);
1097 
1098 		return 0;
1099 	}
1100 	/* There is not enough support to do UDP LSO,
1101 	 * so follow the normal path
1102 	 */
1103 	kfree_skb(skb);
1104 
1105 	return err;
1106 }
1107 
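/*
 * Illustrative sketch (not part of this file): gso_size, as set in
 * ip6_ufo_append_data() above, is the per-fragment payload budget --
 * everything in the MTU that is not IPv6 header(s) or fragment header.
 * For a plain 1500-byte MTU with no extension headers this yields
 * 1500 - 40 - 8 = 1452 bytes of UDP data per fragment.  Hypothetical
 * sketch_* name; not compiled in.
 */
#if 0
static unsigned int sketch_ufo_gso_size(unsigned int mtu,
					unsigned int fragheaderlen)
{
	return mtu - fragheaderlen - 8;	/* 8 == sizeof(struct frag_hdr) */
}
#endif
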
1108 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1109 					       gfp_t gfp)
1110 {
1111 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1112 }
1113 
1114 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1115 						gfp_t gfp)
1116 {
1117 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1118 }
1119 
1120 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1121 	int offset, int len, int odd, struct sk_buff *skb),
1122 	void *from, int length, int transhdrlen,
1123 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1124 	struct rt6_info *rt, unsigned int flags)
1125 {
1126 	struct inet_sock *inet = inet_sk(sk);
1127 	struct ipv6_pinfo *np = inet6_sk(sk);
1128 	struct sk_buff *skb;
1129 	unsigned int maxfraglen, fragheaderlen;
1130 	int exthdrlen;
1131 	int hh_len;
1132 	int mtu;
1133 	int copy;
1134 	int err;
1135 	int offset = 0;
1136 	int csummode = CHECKSUM_NONE;
1137 
1138 	if (flags&MSG_PROBE)
1139 		return 0;
1140 	if (skb_queue_empty(&sk->sk_write_queue)) {
1141 		/*
1142 		 * setup for corking
1143 		 */
1144 		if (opt) {
1145 			if (WARN_ON(np->cork.opt))
1146 				return -EINVAL;
1147 
1148 			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1149 			if (unlikely(np->cork.opt == NULL))
1150 				return -ENOBUFS;
1151 
1152 			np->cork.opt->tot_len = opt->tot_len;
1153 			np->cork.opt->opt_flen = opt->opt_flen;
1154 			np->cork.opt->opt_nflen = opt->opt_nflen;
1155 
1156 			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1157 							    sk->sk_allocation);
1158 			if (opt->dst0opt && !np->cork.opt->dst0opt)
1159 				return -ENOBUFS;
1160 
1161 			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1162 							    sk->sk_allocation);
1163 			if (opt->dst1opt && !np->cork.opt->dst1opt)
1164 				return -ENOBUFS;
1165 
1166 			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1167 							   sk->sk_allocation);
1168 			if (opt->hopopt && !np->cork.opt->hopopt)
1169 				return -ENOBUFS;
1170 
1171 			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1172 							    sk->sk_allocation);
1173 			if (opt->srcrt && !np->cork.opt->srcrt)
1174 				return -ENOBUFS;
1175 
1176 			/* need source address above --miyazawa */
1177 		}
1178 		dst_hold(&rt->u.dst);
1179 		inet->cork.dst = &rt->u.dst;
1180 		inet->cork.fl = *fl;
1181 		np->cork.hop_limit = hlimit;
1182 		np->cork.tclass = tclass;
1183 		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1184 		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1185 		if (np->frag_size < mtu) {
1186 			if (np->frag_size)
1187 				mtu = np->frag_size;
1188 		}
1189 		inet->cork.fragsize = mtu;
1190 		if (dst_allfrag(rt->u.dst.path))
1191 			inet->cork.flags |= IPCORK_ALLFRAG;
1192 		inet->cork.length = 0;
1193 		sk->sk_sndmsg_page = NULL;
1194 		sk->sk_sndmsg_off = 0;
1195 		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1196 			    rt->rt6i_nfheader_len;
1197 		length += exthdrlen;
1198 		transhdrlen += exthdrlen;
1199 	} else {
1200 		rt = (struct rt6_info *)inet->cork.dst;
1201 		fl = &inet->cork.fl;
1202 		opt = np->cork.opt;
1203 		transhdrlen = 0;
1204 		exthdrlen = 0;
1205 		mtu = inet->cork.fragsize;
1206 	}
1207 
1208 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1209 
1210 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1211 			(opt ? opt->opt_nflen : 0);
1212 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1213 
1214 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1215 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1216 			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1217 			return -EMSGSIZE;
1218 		}
1219 	}
1220 
1221 	/*
1222 	 * Let's try using as much space as possible.
1223 	 * Use MTU if total length of the message fits into the MTU.
1224 	 * Otherwise, we need to reserve fragment header and
1225 	 * fragment alignment (= 8-15 octets, in total).
1226 	 *
1227 	 * Note that we may need to "move" the data from the tail
1228 	 * of the buffer to the new fragment when we split
1229 	 * the message.
1230 	 *
1231 	 * FIXME: It may be fragmented into multiple chunks
1232 	 *        at once if non-fragmentable extension headers
1233 	 *        are too large.
1234 	 * --yoshfuji
1235 	 */
1236 
1237 	inet->cork.length += length;
1238 	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1239 	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
1240 
1241 		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1242 					  fragheaderlen, transhdrlen, mtu,
1243 					  flags);
1244 		if (err)
1245 			goto error;
1246 		return 0;
1247 	}
1248 
1249 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1250 		goto alloc_new_skb;
1251 
1252 	while (length > 0) {
1253 		/* Check if the remaining data fits into current packet. */
1254 		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1255 		if (copy < length)
1256 			copy = maxfraglen - skb->len;
1257 
1258 		if (copy <= 0) {
1259 			char *data;
1260 			unsigned int datalen;
1261 			unsigned int fraglen;
1262 			unsigned int fraggap;
1263 			unsigned int alloclen;
1264 			struct sk_buff *skb_prev;
1265 alloc_new_skb:
1266 			skb_prev = skb;
1267 
1268 			/* There's no room in the current skb */
1269 			if (skb_prev)
1270 				fraggap = skb_prev->len - maxfraglen;
1271 			else
1272 				fraggap = 0;
1273 
1274 			/*
1275 			 * If remaining data exceeds the mtu,
1276 			 * we know we need more fragment(s).
1277 			 */
1278 			datalen = length + fraggap;
1279 			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1280 				datalen = maxfraglen - fragheaderlen;
1281 
1282 			fraglen = datalen + fragheaderlen;
1283 			if ((flags & MSG_MORE) &&
1284 			    !(rt->u.dst.dev->features&NETIF_F_SG))
1285 				alloclen = mtu;
1286 			else
1287 				alloclen = datalen + fragheaderlen;
1288 
1289 			/*
1290 			 * The last fragment gets additional space at tail.
1291 			 * Note: we overallocate on fragments with MSG_MORE
1292 			 * because we have no idea if we're the last one.
1293 			 */
1294 			if (datalen == length + fraggap)
1295 				alloclen += rt->u.dst.trailer_len;
1296 
1297 			/*
1298 			 * We just reserve space for fragment header.
1299 			 * Note: this may be overallocation if the message
1300 			 * (without MSG_MORE) fits into the MTU.
1301 			 */
1302 			alloclen += sizeof(struct frag_hdr);
1303 
1304 			if (transhdrlen) {
1305 				skb = sock_alloc_send_skb(sk,
1306 						alloclen + hh_len,
1307 						(flags & MSG_DONTWAIT), &err);
1308 			} else {
1309 				skb = NULL;
1310 				if (atomic_read(&sk->sk_wmem_alloc) <=
1311 				    2 * sk->sk_sndbuf)
1312 					skb = sock_wmalloc(sk,
1313 							   alloclen + hh_len, 1,
1314 							   sk->sk_allocation);
1315 				if (unlikely(skb == NULL))
1316 					err = -ENOBUFS;
1317 			}
1318 			if (skb == NULL)
1319 				goto error;
1320 			/*
1321 			 *	Fill in the control structures
1322 			 */
1323 			skb->ip_summed = csummode;
1324 			skb->csum = 0;
1325 			/* reserve for fragmentation */
1326 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1327 
1328 			/*
1329 			 *	Find where to start putting bytes
1330 			 */
1331 			data = skb_put(skb, fraglen);
1332 			skb_set_network_header(skb, exthdrlen);
1333 			data += fragheaderlen;
1334 			skb->transport_header = (skb->network_header +
1335 						 fragheaderlen);
1336 			if (fraggap) {
1337 				skb->csum = skb_copy_and_csum_bits(
1338 					skb_prev, maxfraglen,
1339 					data + transhdrlen, fraggap, 0);
1340 				skb_prev->csum = csum_sub(skb_prev->csum,
1341 							  skb->csum);
1342 				data += fraggap;
1343 				pskb_trim_unique(skb_prev, maxfraglen);
1344 			}
1345 			copy = datalen - transhdrlen - fraggap;
1346 			if (copy < 0) {
1347 				err = -EINVAL;
1348 				kfree_skb(skb);
1349 				goto error;
1350 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1351 				err = -EFAULT;
1352 				kfree_skb(skb);
1353 				goto error;
1354 			}
1355 
1356 			offset += copy;
1357 			length -= datalen - fraggap;
1358 			transhdrlen = 0;
1359 			exthdrlen = 0;
1360 			csummode = CHECKSUM_NONE;
1361 
1362 			/*
1363 			 * Put the packet on the pending queue
1364 			 */
1365 			__skb_queue_tail(&sk->sk_write_queue, skb);
1366 			continue;
1367 		}
1368 
1369 		if (copy > length)
1370 			copy = length;
1371 
1372 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1373 			unsigned int off;
1374 
1375 			off = skb->len;
1376 			if (getfrag(from, skb_put(skb, copy),
1377 						offset, copy, off, skb) < 0) {
1378 				__skb_trim(skb, off);
1379 				err = -EFAULT;
1380 				goto error;
1381 			}
1382 		} else {
1383 			int i = skb_shinfo(skb)->nr_frags;
1384 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1385 			struct page *page = sk->sk_sndmsg_page;
1386 			int off = sk->sk_sndmsg_off;
1387 			unsigned int left;
1388 
1389 			if (page && (left = PAGE_SIZE - off) > 0) {
1390 				if (copy >= left)
1391 					copy = left;
1392 				if (page != frag->page) {
1393 					if (i == MAX_SKB_FRAGS) {
1394 						err = -EMSGSIZE;
1395 						goto error;
1396 					}
1397 					get_page(page);
1398 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1399 					frag = &skb_shinfo(skb)->frags[i];
1400 				}
1401 			} else if (i < MAX_SKB_FRAGS) {
1402 				if (copy > PAGE_SIZE)
1403 					copy = PAGE_SIZE;
1404 				page = alloc_pages(sk->sk_allocation, 0);
1405 				if (page == NULL) {
1406 					err = -ENOMEM;
1407 					goto error;
1408 				}
1409 				sk->sk_sndmsg_page = page;
1410 				sk->sk_sndmsg_off = 0;
1411 
1412 				skb_fill_page_desc(skb, i, page, 0, 0);
1413 				frag = &skb_shinfo(skb)->frags[i];
1414 			} else {
1415 				err = -EMSGSIZE;
1416 				goto error;
1417 			}
1418 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1419 				err = -EFAULT;
1420 				goto error;
1421 			}
1422 			sk->sk_sndmsg_off += copy;
1423 			frag->size += copy;
1424 			skb->len += copy;
1425 			skb->data_len += copy;
1426 			skb->truesize += copy;
1427 			atomic_add(copy, &sk->sk_wmem_alloc);
1428 		}
1429 		offset += copy;
1430 		length -= copy;
1431 	}
1432 	return 0;
1433 error:
1434 	inet->cork.length -= length;
1435 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1436 	return err;
1437 }
1438 
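/*
 * Illustrative sketch (not part of this file): maxfraglen, as computed
 * in ip6_append_data() above, is the largest packet we may build on the
 * write queue such that, once an 8-byte fragment header is added, every
 * fragment still fits the MTU with an 8-aligned payload.
 *
 * Example: mtu = 1500, fragheaderlen = 40 (bare IPv6 header)
 *   (1500 - 40) & ~7 = 1456
 *   1456 + 40 - 8    = 1488 bytes per queued packet
 *
 * Hypothetical sketch_* name; not compiled in.
 */
#if 0
static unsigned int sketch_maxfraglen(unsigned int mtu,
				      unsigned int fragheaderlen)
{
	return ((mtu - fragheaderlen) & ~7u) + fragheaderlen - 8;
}
#endif
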
1439 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1440 {
1441 	if (np->cork.opt) {
1442 		kfree(np->cork.opt->dst0opt);
1443 		kfree(np->cork.opt->dst1opt);
1444 		kfree(np->cork.opt->hopopt);
1445 		kfree(np->cork.opt->srcrt);
1446 		kfree(np->cork.opt);
1447 		np->cork.opt = NULL;
1448 	}
1449 
1450 	if (inet->cork.dst) {
1451 		dst_release(inet->cork.dst);
1452 		inet->cork.dst = NULL;
1453 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1454 	}
1455 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1456 }
1457 
1458 int ip6_push_pending_frames(struct sock *sk)
1459 {
1460 	struct sk_buff *skb, *tmp_skb;
1461 	struct sk_buff **tail_skb;
1462 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1463 	struct inet_sock *inet = inet_sk(sk);
1464 	struct ipv6_pinfo *np = inet6_sk(sk);
1465 	struct net *net = sock_net(sk);
1466 	struct ipv6hdr *hdr;
1467 	struct ipv6_txoptions *opt = np->cork.opt;
1468 	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1469 	struct flowi *fl = &inet->cork.fl;
1470 	unsigned char proto = fl->proto;
1471 	int err = 0;
1472 
1473 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1474 		goto out;
1475 	tail_skb = &(skb_shinfo(skb)->frag_list);
1476 
1477 	/* move skb->data to ip header from ext header */
1478 	if (skb->data < skb_network_header(skb))
1479 		__skb_pull(skb, skb_network_offset(skb));
1480 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1481 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1482 		*tail_skb = tmp_skb;
1483 		tail_skb = &(tmp_skb->next);
1484 		skb->len += tmp_skb->len;
1485 		skb->data_len += tmp_skb->len;
1486 		skb->truesize += tmp_skb->truesize;
1487 		tmp_skb->destructor = NULL;
1488 		tmp_skb->sk = NULL;
1489 	}
1490 
1491 	/* Allow local fragmentation. */
1492 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1493 		skb->local_df = 1;
1494 
1495 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
1496 	__skb_pull(skb, skb_network_header_len(skb));
1497 	if (opt && opt->opt_flen)
1498 		ipv6_push_frag_opts(skb, opt, &proto);
1499 	if (opt && opt->opt_nflen)
1500 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1501 
1502 	skb_push(skb, sizeof(struct ipv6hdr));
1503 	skb_reset_network_header(skb);
1504 	hdr = ipv6_hdr(skb);
1505 
1506 	*(__be32*)hdr = fl->fl6_flowlabel |
1507 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1508 
1509 	hdr->hop_limit = np->cork.hop_limit;
1510 	hdr->nexthdr = proto;
1511 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1512 	ipv6_addr_copy(&hdr->daddr, final_dst);
1513 
1514 	skb->priority = sk->sk_priority;
1515 	skb->mark = sk->sk_mark;
1516 
1517 	skb_dst_set(skb, dst_clone(&rt->u.dst));
1518 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1519 	if (proto == IPPROTO_ICMPV6) {
1520 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1521 
1522 		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1523 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1524 	}
1525 
1526 	err = ip6_local_out(skb);
1527 	if (err) {
1528 		if (err > 0)
1529 			err = np->recverr ? net_xmit_errno(err) : 0;
1530 		if (err)
1531 			goto error;
1532 	}
1533 
1534 out:
1535 	ip6_cork_release(inet, np);
1536 	return err;
1537 error:
1538 	goto out;
1539 }
1540 
1541 void ip6_flush_pending_frames(struct sock *sk)
1542 {
1543 	struct sk_buff *skb;
1544 
1545 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1546 		if (skb_dst(skb))
1547 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1548 				      IPSTATS_MIB_OUTDISCARDS);
1549 		kfree_skb(skb);
1550 	}
1551 
1552 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1553 }
1554