/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
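/*
 * __ip6_local_out() fixes up the payload length and runs the
 * NF_INET_LOCAL_OUT netfilter hook; if the hook does not steal the
 * packet, ip6_local_out() continues with dst_output().  A payload
 * larger than IPV6_MAXPLEN is encoded as payload_len == 0, which is
 * the on-wire convention used by jumbograms (RFC 2675).
 */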
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

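/*
 * Final output step: resolve the neighbour for the dst and hand the
 * packet to it.  Multicast packets destined to a group the local
 * machine listens on (or that are subject to multicast routing) are
 * also looped back through ip6_dev_loopback_xmit(), unless multicast
 * loopback has been disabled on the socket (sk_mc_loop()).
 */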
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	rcu_read_lock();
	neigh = dst_get_neighbour(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

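/*
 * Fragment on output when the packet exceeds the path MTU and is not
 * GSO (the device will segment it itself), or when the route demands
 * fragmentation of every packet (dst_allfrag(), set when the peer
 * advertised an MTU below IPV6_MIN_MTU).
 */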
static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

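/*
 * ip6_output() is the dst output method for locally generated and
 * forwarded packets.  Packets are dropped early when IPv6 has been
 * administratively disabled on the device; otherwise the
 * NF_INET_POST_ROUTING hook runs, unless the packet was already
 * rerouted by netfilter (IP6SKB_REROUTED).
 */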
int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

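/*
 * A typical caller fills a struct flowi6 and attaches a routed dst to
 * the skb before calling ip6_xmit().  Roughly (a sketch, not lifted
 * from a real caller):
 *
 *	struct flowi6 fl6 = {
 *		.flowi6_proto = IPPROTO_TCP,
 *		.daddr = ...,	(destination address)
 *		.saddr = ...,	(source address)
 *	};
 *	skb_dst_set(skb, dst_clone(dst));
 *	err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 */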
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		 * MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance-critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

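/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered for this alert value (IPV6_ROUTER_ALERT).  Each
 * matching socket but the last gets a clone; the last one consumes the
 * original skb, in which case 1 is returned and forwarding stops.
 */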
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

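/*
 * Decide what to do with a packet arriving for an address we proxy
 * (proxy NDP): 1 - deliver locally (unicast neighbour discovery for
 * the proxied address), 0 - keep forwarding, -1 - discard (traffic to
 * a proxied link-local address cannot be forwarded).
 */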
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* A unicast neighbour discovery message destined
			 * to the proxied address is passed to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

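/*
 * The forwarding path proper: validate the packet (hop limit, source
 * address sanity, xfrm policy), emit redirects when the packet leaves
 * through the interface it arrived on, enforce the path MTU, then
 * decrement the hop limit and run the NF_INET_FORWARD hook.
 */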
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct neighbour *n;
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT process RA packets; they are pushed to user
	 *	level AS IS, without any warranty that the application
	 *	will be able to interpret them. The reason is that we
	 *	cannot do anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it. Defragmentation
	 *	would also be a mistake: RA packets must not be
	 *	fragmented, because there is no guarantee that
	 *	different fragments will follow the same path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	Check and decrement the hop limit.
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	n = dst_get_neighbour(dst);
	if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	The incoming and outgoing devices are the same,
		 *	so send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed to the point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

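/*
 * Find the offset of the first header that may be moved into the
 * fragmentable part of the packet.  Hop-by-hop, routing and (before a
 * routing header) destination option headers belong to the
 * unfragmentable part and are skipped; *nexthdr is left pointing at
 * the nexthdr field that a fragment header must be chained into.
 */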
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;

	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {
		switch (**nexthdr) {
		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

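/*
 * Choose the 32-bit fragment identification.  When the route has an
 * inet_peer, use the per-destination counter so ids are less likely
 * to repeat quickly for one peer; otherwise fall back to a global
 * counter that skips the value 0.
 */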
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static atomic_t ipv6_fragmentation_id;
	int old, new;

	if (rt) {
		struct inet_peer *peer;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);
		peer = rt->rt6i_peer;
		if (peer) {
			fhdr->identification = htonl(inet_getid(peer, 0));
			return;
		}
	}
	do {
		old = atomic_read(&ipv6_fragmentation_id);
		new = old + 1;
		if (!new)
			new = 1;
	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
	fhdr->identification = htonl(new);
}

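/*
 * Two strategies: the fast path reuses the skbs already hanging off
 * frag_list (as built by ip6_append_data()) and only prepends headers,
 * provided every fragment has the right geometry; the slow path
 * allocates a fresh skb per fragment and copies the data out of the
 * original packet.
 */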
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down.
			 */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		 * then align the next start on an eight byte boundary
		 */
		if (len < left)
			len &= ~7;

		/*
		 *	Allocate buffer.
		 */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC);
		if (frag == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not-connected case
	 * is not very simple. Take into account that we do not
	 * support routing by source, TOS, and MSG_DONTROUTE.
	 *						--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route, check
	 *    that the cached destination is current. If it is a
	 *    network route, we still may check its validity using
	 *    the saved pointer to the last used address: daddr_cache.
	 *    We do not want to save the whole address now (because
	 *    the main consumer of this service is TCP, which does
	 *    not have this problem), so this last trick works only
	 *    on connected sockets.
	 * 2. oif should also be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

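/*
 * Common tail for the dst lookup helpers below: do the route lookup
 * if the caller did not pass a cached dst, pick a source address when
 * the flow left it unspecified, and (with optimistic DAD) prefer the
 * default router's dst while our own source address is still
 * optimistic and the neighbour is unresolved.
 */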
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we have looked up has a neighbour entry
	 * that is in the INCOMPLETE state and the source address
	 * from the flow is marked as OPTIMISTIC, we release the
	 * found dst entry and replace it with the dst entry of the
	 * nexthop router.
	 */
	rcu_read_lock();
	n = dst_get_neighbour(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead.
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl6->daddr, final_dst);
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl6->daddr, final_dst);
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

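/*
 * UFO path of ip6_append_data(): instead of building MTU-sized
 * fragments in software, queue one large skb and record the fragment
 * size in gso_size (rounded down to a multiple of 8, as required for
 * fragment offsets) so the device can split the datagram itself.
 */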
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

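/*
 * Append data to a corked socket: the first call sets up the cork
 * (duplicated tx options, cached route and flow, fragment size), and
 * subsequent calls keep filling sk_write_queue until
 * ip6_push_pending_frames() builds the final IPv6 header and sends
 * the queue, or ip6_flush_pending_frames() discards it.
 *
 * A rough usage sketch for a datagram protocol (hypothetical caller,
 * error handling elided):
 *
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, len,
 *			      sizeof(struct udphdr), hlimit, tclass,
 *			      opt, &fl6, rt, msg->msg_flags, dontfrag);
 *	if (!err)
 *		err = ip6_push_pending_frames(sk);
 *	else
 *		ip6_flush_pending_frames(sk);
 */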
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above --miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamping is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use the MTU if the total length of the message fits into the MTU.
	 * Otherwise, we need to reserve the fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			/*
			 * The last fragment gets additional space at the tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen + dst_exthdrlen);
			skb_set_network_header(skb, exthdrlen + dst_exthdrlen);
			data += fragheaderlen + dst_exthdrlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;

error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

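/*
 * Undo the corking state set up by ip6_append_data(): free the
 * duplicated extension headers, drop the route reference and clear
 * the cached flow.
 */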
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

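/*
 * Collapse everything queued on sk_write_queue into one skb (the tail
 * skbs become the frag_list of the first), prepend the extension
 * headers and the IPv6 header from the cork, and push the result out
 * through ip6_local_out().
 */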
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to the IP header, from the ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl6->daddr);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

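/*
 * Abort a corked send: count every queued skb as an output discard,
 * free the queue and release the cork.
 */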
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}