xref: /linux/net/ipv6/ip6_output.c (revision 26b0d14106954ae46d2f4f7eec3481828a210f7d)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 	int len;
64 
65 	len = skb->len - sizeof(struct ipv6hdr);
66 	if (len > IPV6_MAXPLEN)
67 		len = 0;
68 	ipv6_hdr(skb)->payload_len = htons(len);
69 
70 	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 		       skb_dst(skb)->dev, dst_output);
72 }
73 
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 	int err;
77 
78 	err = __ip6_local_out(skb);
79 	if (likely(err == 1))
80 		err = dst_output(skb);
81 
82 	return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
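
/*
 * Two notes on the pair above, stated here for illustration: payload
 * lengths above IPV6_MAXPLEN (65535) do not fit the 16-bit
 * payload_len field, so __ip6_local_out() writes 0, the value
 * RFC 2675 prescribes when a Jumbo Payload option carries the real
 * length. And nf_hook() returns 1 when the NF_INET_LOCAL_OUT chain
 * accepted the packet without stealing or queueing it, which is why
 * ip6_local_out() only calls dst_output() itself for err == 1.
 */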
85 
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89 	skb_reset_mac_header(newskb);
90 	__skb_pull(newskb, skb_network_offset(newskb));
91 	newskb->pkt_type = PACKET_LOOPBACK;
92 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 	WARN_ON(!skb_dst(newskb));
94 
95 	netif_rx_ni(newskb);
96 	return 0;
97 }
98 
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101 	struct dst_entry *dst = skb_dst(skb);
102 	struct net_device *dev = dst->dev;
103 	struct neighbour *neigh;
104 
105 	skb->protocol = htons(ETH_P_IPV6);
106 	skb->dev = dev;
107 
108 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110 
111 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 		    ((mroute6_socket(dev_net(dev), skb) &&
113 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 					 &ipv6_hdr(skb)->saddr))) {
116 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117 
118 			/* Do not check for IFF_ALLMULTI; multicast routing
119 			   is not supported in any case.
120 			 */
121 			if (newskb)
122 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 					newskb, NULL, newskb->dev,
124 					ip6_dev_loopback_xmit);
125 
126 			if (ipv6_hdr(skb)->hop_limit == 0) {
127 				IP6_INC_STATS(dev_net(dev), idev,
128 					      IPSTATS_MIB_OUTDISCARDS);
129 				kfree_skb(skb);
130 				return 0;
131 			}
132 		}
133 
134 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135 				skb->len);
136 	}
137 
138 	rcu_read_lock();
139 	neigh = dst_get_neighbour_noref(dst);
140 	if (neigh) {
141 		int res = neigh_output(neigh, skb);
142 
143 		rcu_read_unlock();
144 		return res;
145 	}
146 	rcu_read_unlock();
147 	IP6_INC_STATS_BH(dev_net(dst->dev),
148 			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
149 	kfree_skb(skb);
150 	return -EINVAL;
151 }
152 
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156 	    dst_allfrag(skb_dst(skb)))
157 		return ip6_fragment(skb, ip6_finish_output2);
158 	else
159 		return ip6_finish_output2(skb);
160 }
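
/*
 * Illustrative numbers for the decision above (a sketch, not taken
 * from this file): with ip6_skb_dst_mtu() == 1500, a 1400-byte skb
 * goes straight to ip6_finish_output2(); a 3000-byte non-GSO skb is
 * handed to ip6_fragment(); a 3000-byte GSO skb also skips
 * fragmentation because it will be segmented later. dst_allfrag()
 * forces fragmentation regardless of size; it is set when the path
 * MTU dropped below IPV6_MIN_MTU, so every packet must carry a
 * fragment header.
 */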
161 
162 int ip6_output(struct sk_buff *skb)
163 {
164 	struct net_device *dev = skb_dst(skb)->dev;
165 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166 	if (unlikely(idev->cnf.disable_ipv6)) {
167 		IP6_INC_STATS(dev_net(dev), idev,
168 			      IPSTATS_MIB_OUTDISCARDS);
169 		kfree_skb(skb);
170 		return 0;
171 	}
172 
173 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174 			    ip6_finish_output,
175 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177 
178 /*
179  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
180  */
181 
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183 	     struct ipv6_txoptions *opt, int tclass)
184 {
185 	struct net *net = sock_net(sk);
186 	struct ipv6_pinfo *np = inet6_sk(sk);
187 	struct in6_addr *first_hop = &fl6->daddr;
188 	struct dst_entry *dst = skb_dst(skb);
189 	struct ipv6hdr *hdr;
190 	u8  proto = fl6->flowi6_proto;
191 	int seg_len = skb->len;
192 	int hlimit = -1;
193 	u32 mtu;
194 
195 	if (opt) {
196 		unsigned int head_room;
197 
198 		/* First: exthdrs may take lots of space (~8K for now);
199 		   MAX_HEADER is not enough.
200 		 */
201 		head_room = opt->opt_nflen + opt->opt_flen;
202 		seg_len += head_room;
203 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
204 
205 		if (skb_headroom(skb) < head_room) {
206 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
207 			if (skb2 == NULL) {
208 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209 					      IPSTATS_MIB_OUTDISCARDS);
210 				kfree_skb(skb);
211 				return -ENOBUFS;
212 			}
213 			consume_skb(skb);
214 			skb = skb2;
215 			skb_set_owner_w(skb, sk);
216 		}
217 		if (opt->opt_flen)
218 			ipv6_push_frag_opts(skb, opt, &proto);
219 		if (opt->opt_nflen)
220 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
221 	}
222 
223 	skb_push(skb, sizeof(struct ipv6hdr));
224 	skb_reset_network_header(skb);
225 	hdr = ipv6_hdr(skb);
226 
227 	/*
228 	 *	Fill in the IPv6 header
229 	 */
230 	if (np)
231 		hlimit = np->hop_limit;
232 	if (hlimit < 0)
233 		hlimit = ip6_dst_hoplimit(dst);
234 
235 	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
236 
237 	hdr->payload_len = htons(seg_len);
238 	hdr->nexthdr = proto;
239 	hdr->hop_limit = hlimit;
240 
241 	hdr->saddr = fl6->saddr;
242 	hdr->daddr = *first_hop;
243 
244 	skb->priority = sk->sk_priority;
245 	skb->mark = sk->sk_mark;
246 
247 	mtu = dst_mtu(dst);
248 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250 			      IPSTATS_MIB_OUT, skb->len);
251 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252 			       dst->dev, dst_output);
253 	}
254 
255 	net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
256 	skb->dev = dst->dev;
257 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
258 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
259 	kfree_skb(skb);
260 	return -EMSGSIZE;
261 }
262 
263 EXPORT_SYMBOL(ip6_xmit);
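
/*
 * A worked example of the first header word built in ip6_xmit()
 * above (illustrative values): the top 4 bits carry the version (6),
 * bits 27..20 the traffic class, and the low 20 bits the flow label;
 * fl6->flowlabel is already in network byte order, so it is OR-ed in
 * after the htonl(). For tclass = 0x28 and flow label 0x12345:
 *
 *	0x60000000 | (0x28 << 20) = 0x62800000
 *	wire bytes after htonl() and the label OR: 62 81 23 45
 */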
264 
265 /*
266  *	To avoid extra problems ND packets are sent through this
267  *	routine. It's code duplication, but I really want to avoid
268  *	extra checks since ipv6_build_header is used by TCP (which
269  *	is performance-critical for us).
270  */
271 
272 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
273 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
274 	       int proto, int len)
275 {
276 	struct ipv6_pinfo *np = inet6_sk(sk);
277 	struct ipv6hdr *hdr;
278 
279 	skb->protocol = htons(ETH_P_IPV6);
280 	skb->dev = dev;
281 
282 	skb_reset_network_header(skb);
283 	skb_put(skb, sizeof(struct ipv6hdr));
284 	hdr = ipv6_hdr(skb);
285 
286 	*(__be32*)hdr = htonl(0x60000000);
287 
288 	hdr->payload_len = htons(len);
289 	hdr->nexthdr = proto;
290 	hdr->hop_limit = np->hop_limit;
291 
292 	hdr->saddr = *saddr;
293 	hdr->daddr = *daddr;
294 
295 	return 0;
296 }
297 
298 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
299 {
300 	struct ip6_ra_chain *ra;
301 	struct sock *last = NULL;
302 
303 	read_lock(&ip6_ra_lock);
304 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
305 		struct sock *sk = ra->sk;
306 		if (sk && ra->sel == sel &&
307 		    (!sk->sk_bound_dev_if ||
308 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
309 			if (last) {
310 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
311 				if (skb2)
312 					rawv6_rcv(last, skb2);
313 			}
314 			last = sk;
315 		}
316 	}
317 
318 	if (last) {
319 		rawv6_rcv(last, skb);
320 		read_unlock(&ip6_ra_lock);
321 		return 1;
322 	}
323 	read_unlock(&ip6_ra_lock);
324 	return 0;
325 }
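
/*
 * Delivery pattern sketch for ip6_call_ra_chain() above (hypothetical
 * scenario): with three matching Router Alert sockets A, B and C on
 * the chain, A and B each receive a clone of the skb and the original
 * is delivered to C, the last match, saving one clone. Returning 1
 * tells ip6_forward() the packet was consumed and must not be
 * forwarded further.
 */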
326 
327 static int ip6_forward_proxy_check(struct sk_buff *skb)
328 {
329 	struct ipv6hdr *hdr = ipv6_hdr(skb);
330 	u8 nexthdr = hdr->nexthdr;
331 	__be16 frag_off;
332 	int offset;
333 
334 	if (ipv6_ext_hdr(nexthdr)) {
335 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
336 		if (offset < 0)
337 			return 0;
338 	} else
339 		offset = sizeof(struct ipv6hdr);
340 
341 	if (nexthdr == IPPROTO_ICMPV6) {
342 		struct icmp6hdr *icmp6;
343 
344 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
345 					 offset + 1 - skb->data)))
346 			return 0;
347 
348 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
349 
350 		switch (icmp6->icmp6_type) {
351 		case NDISC_ROUTER_SOLICITATION:
352 		case NDISC_ROUTER_ADVERTISEMENT:
353 		case NDISC_NEIGHBOUR_SOLICITATION:
354 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
355 		case NDISC_REDIRECT:
356 			/* For a reaction involving a unicast neighbour
357 			 * discovery message destined to the proxied address,
358 			 * pass it to the input function.
359 			 */
360 			return 1;
361 		default:
362 			break;
363 		}
364 	}
365 
366 	/*
367 	 * The proxying router can't forward traffic sent to a link-local
368 	 * address, so signal the sender and discard the packet. This
369 	 * behavior is clarified by the MIPv6 specification.
370 	 */
371 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372 		dst_link_failure(skb);
373 		return -1;
374 	}
375 
376 	return 0;
377 }
378 
379 static inline int ip6_forward_finish(struct sk_buff *skb)
380 {
381 	return dst_output(skb);
382 }
383 
384 int ip6_forward(struct sk_buff *skb)
385 {
386 	struct dst_entry *dst = skb_dst(skb);
387 	struct ipv6hdr *hdr = ipv6_hdr(skb);
388 	struct inet6_skb_parm *opt = IP6CB(skb);
389 	struct net *net = dev_net(dst->dev);
390 	u32 mtu;
391 
392 	if (net->ipv6.devconf_all->forwarding == 0)
393 		goto error;
394 
395 	if (skb_warn_if_lro(skb))
396 		goto drop;
397 
398 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
399 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
400 		goto drop;
401 	}
402 
403 	if (skb->pkt_type != PACKET_HOST)
404 		goto drop;
405 
406 	skb_forward_csum(skb);
407 
408 	/*
409 	 *	We DO NOT do any processing on
410 	 *	RA packets, pushing them to user level AS IS
411 	 *	without any WARRANTY that the application will be able
412 	 *	to interpret them. The reason is that we
413 	 *	cannot do anything clever here.
414 	 *
415 	 *	We are not the end node, so if the packet contains
416 	 *	AH/ESP, we cannot do anything.
417 	 *	Defragmentation would also be a mistake; RA packets
418 	 *	cannot be fragmented, because there is no guarantee
419 	 *	that different fragments will go along one path. --ANK
420 	 */
421 	if (opt->ra) {
422 		u8 *ptr = skb_network_header(skb) + opt->ra;
423 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
424 			return 0;
425 	}
426 
427 	/*
428 	 *	check and decrement ttl
429 	 */
430 	if (hdr->hop_limit <= 1) {
431 		/* Force the OUTPUT device to be used for source address selection */
432 		skb->dev = dst->dev;
433 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
434 		IP6_INC_STATS_BH(net,
435 				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
436 
437 		kfree_skb(skb);
438 		return -ETIMEDOUT;
439 	}
440 
441 	/* XXX: idev->cnf.proxy_ndp? */
442 	if (net->ipv6.devconf_all->proxy_ndp &&
443 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
444 		int proxied = ip6_forward_proxy_check(skb);
445 		if (proxied > 0)
446 			return ip6_input(skb);
447 		else if (proxied < 0) {
448 			IP6_INC_STATS(net, ip6_dst_idev(dst),
449 				      IPSTATS_MIB_INDISCARDS);
450 			goto drop;
451 		}
452 	}
453 
454 	if (!xfrm6_route_forward(skb)) {
455 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
456 		goto drop;
457 	}
458 	dst = skb_dst(skb);
459 
460 	/* The IPv6 specs say nothing about it, but it is clear that we cannot
461 	   send redirects to source-routed frames.
462 	   Nor do we send redirects to frames decapsulated from IPsec.
463 	 */
464 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
465 		struct in6_addr *target = NULL;
466 		struct rt6_info *rt;
467 
468 		/*
469 		 *	incoming and outgoing devices are the same
470 		 *	send a redirect.
471 		 */
472 
473 		rt = (struct rt6_info *) dst;
474 		if (rt->rt6i_flags & RTF_GATEWAY)
475 			target = &rt->rt6i_gateway;
476 		else
477 			target = &hdr->daddr;
478 
479 		if (!rt->rt6i_peer)
480 			rt6_bind_peer(rt, 1);
481 
482 		/* Limit redirects both by destination (here)
483 		   and by source (inside ndisc_send_redirect)
484 		 */
485 		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
486 			ndisc_send_redirect(skb, target);
487 	} else {
488 		int addrtype = ipv6_addr_type(&hdr->saddr);
489 
490 		/* This check is security critical. */
491 		if (addrtype == IPV6_ADDR_ANY ||
492 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493 			goto error;
494 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 				    ICMPV6_NOT_NEIGHBOUR, 0);
497 			goto error;
498 		}
499 	}
500 
501 	mtu = dst_mtu(dst);
502 	if (mtu < IPV6_MIN_MTU)
503 		mtu = IPV6_MIN_MTU;
504 
505 	if (skb->len > mtu && !skb_is_gso(skb)) {
506 		/* Again, force the OUTPUT device to be used for source address selection */
507 		skb->dev = dst->dev;
508 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 		IP6_INC_STATS_BH(net,
510 				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511 		IP6_INC_STATS_BH(net,
512 				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
513 		kfree_skb(skb);
514 		return -EMSGSIZE;
515 	}
516 
517 	if (skb_cow(skb, dst->dev->hard_header_len)) {
518 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
519 		goto drop;
520 	}
521 
522 	hdr = ipv6_hdr(skb);
523 
524 	/* Mangling the hop limit is delayed until after the skb COW */
525 
526 	hdr->hop_limit--;
527 
528 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
530 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
531 		       ip6_forward_finish);
532 
533 error:
534 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
535 drop:
536 	kfree_skb(skb);
537 	return -EINVAL;
538 }
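
/*
 * Layout behind the (ptr[2] << 8) + ptr[3] read in ip6_forward()
 * above (per RFC 2711, shown for illustration): opt->ra is the offset
 * of the Router Alert option within the hop-by-hop header, so ptr[0]
 * is the option type (5), ptr[1] the data length (2), and ptr[2..3]
 * the 16-bit alert value in network byte order (0 means an MLD
 * message); that value is what ip6_call_ra_chain() matches against
 * each socket's sel.
 */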
539 
540 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
541 {
542 	to->pkt_type = from->pkt_type;
543 	to->priority = from->priority;
544 	to->protocol = from->protocol;
545 	skb_dst_drop(to);
546 	skb_dst_set(to, dst_clone(skb_dst(from)));
547 	to->dev = from->dev;
548 	to->mark = from->mark;
549 
550 #ifdef CONFIG_NET_SCHED
551 	to->tc_index = from->tc_index;
552 #endif
553 	nf_copy(to, from);
554 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
555     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
556 	to->nf_trace = from->nf_trace;
557 #endif
558 	skb_copy_secmark(to, from);
559 }
560 
561 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
562 {
563 	u16 offset = sizeof(struct ipv6hdr);
564 	struct ipv6_opt_hdr *exthdr =
565 				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
566 	unsigned int packet_len = skb->tail - skb->network_header;
567 	int found_rhdr = 0;
568 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
569 
570 	while (offset + 1 <= packet_len) {
571 
572 		switch (**nexthdr) {
573 
574 		case NEXTHDR_HOP:
575 			break;
576 		case NEXTHDR_ROUTING:
577 			found_rhdr = 1;
578 			break;
579 		case NEXTHDR_DEST:
580 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
581 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
582 				break;
583 #endif
584 			if (found_rhdr)
585 				return offset;
586 			break;
587 		default:
588 			return offset;
589 		}
590 
591 		offset += ipv6_optlen(exthdr);
592 		*nexthdr = &exthdr->nexthdr;
593 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
594 						 offset);
595 	}
596 
597 	return offset;
598 }
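
/*
 * A worked example for ip6_find_1stfragopt() above (hypothetical
 * packet): for IPv6 (40 bytes) + Hop-by-Hop (8 bytes) + TCP, the
 * first pass matches NEXTHDR_HOP and advances offset from 40 to 48;
 * the second pass hits the default case on TCP and returns 48. The
 * fragment header is then inserted at offset 48, and *nexthdr points
 * at the hop-by-hop header's nexthdr byte, which ip6_fragment()
 * overwrites with NEXTHDR_FRAGMENT while the saved value (TCP) goes
 * into fh->nexthdr.
 */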
599 
600 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
601 {
602 	static atomic_t ipv6_fragmentation_id;
603 	int old, new;
604 
605 	if (rt && !(rt->dst.flags & DST_NOPEER)) {
606 		struct inet_peer *peer;
607 
608 		if (!rt->rt6i_peer)
609 			rt6_bind_peer(rt, 1);
610 		peer = rt->rt6i_peer;
611 		if (peer) {
612 			fhdr->identification = htonl(inet_getid(peer, 0));
613 			return;
614 		}
615 	}
616 	do {
617 		old = atomic_read(&ipv6_fragmentation_id);
618 		new = old + 1;
619 		if (!new)
620 			new = 1;
621 	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
622 	fhdr->identification = htonl(new);
623 }
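
/*
 * Note on the cmpxchg loop above (illustration): on wraparound,
 * old == 0xffffffff gives new == 0, which is forced to 1, so the
 * counter path never hands out an identification of 0; the slow path
 * of ip6_fragment() below exploits this by treating frag_id == 0 as
 * "not yet selected".
 */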
624 
625 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
626 {
627 	struct sk_buff *frag;
628 	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
629 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
630 	struct ipv6hdr *tmp_hdr;
631 	struct frag_hdr *fh;
632 	unsigned int mtu, hlen, left, len;
633 	int hroom, troom;
634 	__be32 frag_id = 0;
635 	int ptr, offset = 0, err = 0;
636 	u8 *prevhdr, nexthdr = 0;
637 	struct net *net = dev_net(skb_dst(skb)->dev);
638 
639 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
640 	nexthdr = *prevhdr;
641 
642 	mtu = ip6_skb_dst_mtu(skb);
643 
644 	/* We must not fragment if the socket is set to force MTU discovery
645 	 * or if the skb was not generated by a local socket.
646 	 */
647 	if (unlikely(!skb->local_df && skb->len > mtu)) {
648 		if (skb->sk && dst_allfrag(skb_dst(skb)))
649 			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
650 
651 		skb->dev = skb_dst(skb)->dev;
652 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
653 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
654 			      IPSTATS_MIB_FRAGFAILS);
655 		kfree_skb(skb);
656 		return -EMSGSIZE;
657 	}
658 
659 	if (np && np->frag_size < mtu) {
660 		if (np->frag_size)
661 			mtu = np->frag_size;
662 	}
663 	mtu -= hlen + sizeof(struct frag_hdr);
664 
665 	if (skb_has_frag_list(skb)) {
666 		int first_len = skb_pagelen(skb);
667 		struct sk_buff *frag2;
668 
669 		if (first_len - hlen > mtu ||
670 		    ((first_len - hlen) & 7) ||
671 		    skb_cloned(skb))
672 			goto slow_path;
673 
674 		skb_walk_frags(skb, frag) {
675 			/* Check for correct geometry. */
676 			if (frag->len > mtu ||
677 			    ((frag->len & 7) && frag->next) ||
678 			    skb_headroom(frag) < hlen)
679 				goto slow_path_clean;
680 
681 			/* Partially cloned skb? */
682 			if (skb_shared(frag))
683 				goto slow_path_clean;
684 
685 			BUG_ON(frag->sk);
686 			if (skb->sk) {
687 				frag->sk = skb->sk;
688 				frag->destructor = sock_wfree;
689 			}
690 			skb->truesize -= frag->truesize;
691 		}
692 
693 		err = 0;
694 		offset = 0;
695 		frag = skb_shinfo(skb)->frag_list;
696 		skb_frag_list_init(skb);
697 		/* BUILD HEADER */
698 
699 		*prevhdr = NEXTHDR_FRAGMENT;
700 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
701 		if (!tmp_hdr) {
702 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
703 				      IPSTATS_MIB_FRAGFAILS);
704 			return -ENOMEM;
705 		}
706 
707 		__skb_pull(skb, hlen);
708 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
709 		__skb_push(skb, hlen);
710 		skb_reset_network_header(skb);
711 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
712 
713 		ipv6_select_ident(fh, rt);
714 		fh->nexthdr = nexthdr;
715 		fh->reserved = 0;
716 		fh->frag_off = htons(IP6_MF);
717 		frag_id = fh->identification;
718 
719 		first_len = skb_pagelen(skb);
720 		skb->data_len = first_len - skb_headlen(skb);
721 		skb->len = first_len;
722 		ipv6_hdr(skb)->payload_len = htons(first_len -
723 						   sizeof(struct ipv6hdr));
724 
725 		dst_hold(&rt->dst);
726 
727 		for (;;) {
728 			/* Prepare the header of the next frame
729 			 * before the previous one goes down. */
730 			if (frag) {
731 				frag->ip_summed = CHECKSUM_NONE;
732 				skb_reset_transport_header(frag);
733 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
734 				__skb_push(frag, hlen);
735 				skb_reset_network_header(frag);
736 				memcpy(skb_network_header(frag), tmp_hdr,
737 				       hlen);
738 				offset += skb->len - hlen - sizeof(struct frag_hdr);
739 				fh->nexthdr = nexthdr;
740 				fh->reserved = 0;
741 				fh->frag_off = htons(offset);
742 				if (frag->next != NULL)
743 					fh->frag_off |= htons(IP6_MF);
744 				fh->identification = frag_id;
745 				ipv6_hdr(frag)->payload_len =
746 						htons(frag->len -
747 						      sizeof(struct ipv6hdr));
748 				ip6_copy_metadata(frag, skb);
749 			}
750 
751 			err = output(skb);
752 			if (!err)
753 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
754 					      IPSTATS_MIB_FRAGCREATES);
755 
756 			if (err || !frag)
757 				break;
758 
759 			skb = frag;
760 			frag = skb->next;
761 			skb->next = NULL;
762 		}
763 
764 		kfree(tmp_hdr);
765 
766 		if (err == 0) {
767 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
768 				      IPSTATS_MIB_FRAGOKS);
769 			dst_release(&rt->dst);
770 			return 0;
771 		}
772 
773 		while (frag) {
774 			skb = frag->next;
775 			kfree_skb(frag);
776 			frag = skb;
777 		}
778 
779 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
780 			      IPSTATS_MIB_FRAGFAILS);
781 		dst_release(&rt->dst);
782 		return err;
783 
784 slow_path_clean:
785 		skb_walk_frags(skb, frag2) {
786 			if (frag2 == frag)
787 				break;
788 			frag2->sk = NULL;
789 			frag2->destructor = NULL;
790 			skb->truesize += frag2->truesize;
791 		}
792 	}
793 
794 slow_path:
795 	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
796 	    skb_checksum_help(skb))
797 		goto fail;
798 
799 	left = skb->len - hlen;		/* Space per frame */
800 	ptr = hlen;			/* Where to start from */
801 
802 	/*
803 	 *	Fragment the datagram.
804 	 */
805 
806 	*prevhdr = NEXTHDR_FRAGMENT;
807 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
808 	troom = rt->dst.dev->needed_tailroom;
809 
810 	/*
811 	 *	Keep copying data until we run out.
812 	 */
813 	while (left > 0) {
814 		len = left;
815 		/* IF: it doesn't fit, use 'mtu' - the data space left */
816 		if (len > mtu)
817 			len = mtu;
818 		/* IF: we are not sending up to and including the packet end,
819 		   then align the next start on an eight-byte boundary */
820 		if (len < left)	{
821 			len &= ~7;
822 		}
823 		/*
824 		 *	Allocate buffer.
825 		 */
826 
827 		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
828 				      hroom + troom, GFP_ATOMIC)) == NULL) {
829 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
830 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
831 				      IPSTATS_MIB_FRAGFAILS);
832 			err = -ENOMEM;
833 			goto fail;
834 		}
835 
836 		/*
837 		 *	Set up data on packet
838 		 */
839 
840 		ip6_copy_metadata(frag, skb);
841 		skb_reserve(frag, hroom);
842 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
843 		skb_reset_network_header(frag);
844 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
845 		frag->transport_header = (frag->network_header + hlen +
846 					  sizeof(struct frag_hdr));
847 
848 		/*
849 		 *	Charge the memory for the fragment to any owner
850 		 *	it might possess
851 		 */
852 		if (skb->sk)
853 			skb_set_owner_w(frag, skb->sk);
854 
855 		/*
856 		 *	Copy the packet header into the new buffer.
857 		 */
858 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
859 
860 		/*
861 		 *	Build fragment header.
862 		 */
863 		fh->nexthdr = nexthdr;
864 		fh->reserved = 0;
865 		if (!frag_id) {
866 			ipv6_select_ident(fh, rt);
867 			frag_id = fh->identification;
868 		} else
869 			fh->identification = frag_id;
870 
871 		/*
872 		 *	Copy a block of the IP datagram.
873 		 */
874 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
875 			BUG();
876 		left -= len;
877 
878 		fh->frag_off = htons(offset);
879 		if (left > 0)
880 			fh->frag_off |= htons(IP6_MF);
881 		ipv6_hdr(frag)->payload_len = htons(frag->len -
882 						    sizeof(struct ipv6hdr));
883 
884 		ptr += len;
885 		offset += len;
886 
887 		/*
888 		 *	Put this fragment into the sending queue.
889 		 */
890 		err = output(frag);
891 		if (err)
892 			goto fail;
893 
894 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
895 			      IPSTATS_MIB_FRAGCREATES);
896 	}
897 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
898 		      IPSTATS_MIB_FRAGOKS);
899 	consume_skb(skb);
900 	return err;
901 
902 fail:
903 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
904 		      IPSTATS_MIB_FRAGFAILS);
905 	kfree_skb(skb);
906 	return err;
907 }
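
/*
 * Worked slow-path arithmetic for ip6_fragment() above (illustrative
 * numbers): with a 1500-byte MTU, hlen = 40 (bare IPv6 header) and a
 * 3000-byte payload, the per-fragment data space is
 * 1500 - 40 - 8 = 1452, rounded down to 1448 by len &= ~7 so that
 * non-final fragments stay eight-byte aligned. That yields fragments
 * carrying 1448, 1448 and 104 bytes at offsets 0, 1448 and 2896.
 * Because the offsets are multiples of 8, the low three bits of
 * frag_off stay free for the flags, and htons(offset) | htons(IP6_MF)
 * encodes both.
 */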
908 
909 static inline int ip6_rt_check(const struct rt6key *rt_key,
910 			       const struct in6_addr *fl_addr,
911 			       const struct in6_addr *addr_cache)
912 {
913 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
914 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
915 }
916 
917 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
918 					  struct dst_entry *dst,
919 					  const struct flowi6 *fl6)
920 {
921 	struct ipv6_pinfo *np = inet6_sk(sk);
922 	struct rt6_info *rt = (struct rt6_info *)dst;
923 
924 	if (!dst)
925 		goto out;
926 
927 	/* Yes, checking route validity in the not-connected
928 	 * case is not very simple. Take into account
929 	 * that we do not support routing by source, TOS,
930 	 * or MSG_DONTROUTE		--ANK (980726)
931 	 *
932 	 * 1. ip6_rt_check(): If the route was a host route,
933 	 *    check that the cached destination is current.
934 	 *    If it is a network route, we still may
935 	 *    check its validity using a saved pointer
936 	 *    to the last used address: daddr_cache.
937 	 *    We do not want to save the whole address now
938 	 *    (because the main consumer of this service
939 	 *    is tcp, which does not have this problem),
940 	 *    so the last trick works only on connected
941 	 *    sockets.
942 	 * 2. oif also should be the same.
943 	 */
944 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
945 #ifdef CONFIG_IPV6_SUBTREES
946 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
947 #endif
948 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
949 		dst_release(dst);
950 		dst = NULL;
951 	}
952 
953 out:
954 	return dst;
955 }
956 
957 static int ip6_dst_lookup_tail(struct sock *sk,
958 			       struct dst_entry **dst, struct flowi6 *fl6)
959 {
960 	struct net *net = sock_net(sk);
961 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
962 	struct neighbour *n;
963 #endif
964 	int err;
965 
966 	if (*dst == NULL)
967 		*dst = ip6_route_output(net, sk, fl6);
968 
969 	if ((err = (*dst)->error))
970 		goto out_err_release;
971 
972 	if (ipv6_addr_any(&fl6->saddr)) {
973 		struct rt6_info *rt = (struct rt6_info *) *dst;
974 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
975 					  sk ? inet6_sk(sk)->srcprefs : 0,
976 					  &fl6->saddr);
977 		if (err)
978 			goto out_err_release;
979 	}
980 
981 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
982 	/*
983 	 * Here, if the dst entry we've looked up
984 	 * has a neighbour entry that is in the INCOMPLETE
985 	 * state and the src address from the flow is
986 	 * marked as OPTIMISTIC, we release the found
987 	 * dst entry and replace it with the
988 	 * dst entry of the nexthop router.
989 	 */
990 	rcu_read_lock();
991 	n = dst_get_neighbour_noref(*dst);
992 	if (n && !(n->nud_state & NUD_VALID)) {
993 		struct inet6_ifaddr *ifp;
994 		struct flowi6 fl_gw6;
995 		int redirect;
996 
997 		rcu_read_unlock();
998 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
999 				      (*dst)->dev, 1);
1000 
1001 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1002 		if (ifp)
1003 			in6_ifa_put(ifp);
1004 
1005 		if (redirect) {
1006 			/*
1007 			 * We need to get the dst entry for the
1008 			 * default router instead
1009 			 */
1010 			dst_release(*dst);
1011 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1012 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1013 			*dst = ip6_route_output(net, sk, &fl_gw6);
1014 			if ((err = (*dst)->error))
1015 				goto out_err_release;
1016 		}
1017 	} else {
1018 		rcu_read_unlock();
1019 	}
1020 #endif
1021 
1022 	return 0;
1023 
1024 out_err_release:
1025 	if (err == -ENETUNREACH)
1026 		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1027 	dst_release(*dst);
1028 	*dst = NULL;
1029 	return err;
1030 }
1031 
1032 /**
1033  *	ip6_dst_lookup - perform route lookup on flow
1034  *	@sk: socket which provides route info
1035  *	@dst: pointer to dst_entry * for result
1036  *	@fl6: flow to lookup
1037  *
1038  *	This function performs a route lookup on the given flow.
1039  *
1040  *	It returns zero on success, or a standard errno code on error.
1041  */
1042 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1043 {
1044 	*dst = NULL;
1045 	return ip6_dst_lookup_tail(sk, dst, fl6);
1046 }
1047 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1048 
1049 /**
1050  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1051  *	@sk: socket which provides route info
1052  *	@fl6: flow to lookup
1053  *	@final_dst: final destination address for ipsec lookup
1054  *	@can_sleep: we are in a sleepable context
1055  *
1056  *	This function performs a route lookup on the given flow.
1057  *
1058  *	It returns a valid dst pointer on success, or a pointer encoded
1059  *	error code.
1060  */
1061 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1062 				      const struct in6_addr *final_dst,
1063 				      bool can_sleep)
1064 {
1065 	struct dst_entry *dst = NULL;
1066 	int err;
1067 
1068 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1069 	if (err)
1070 		return ERR_PTR(err);
1071 	if (final_dst)
1072 		fl6->daddr = *final_dst;
1073 	if (can_sleep)
1074 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1075 
1076 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1077 }
1078 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
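
/*
 * A minimal caller sketch for ip6_dst_lookup_flow() (hypothetical
 * code, assuming a populated flowi6 named fl6):
 *
 *	struct dst_entry *dst;
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);	   (pointer-encoded errno)
 *	skb_dst_set(skb, dst);		   (reference is handed over)
 */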
1079 
1080 /**
1081  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1082  *	@sk: socket which provides the dst cache and route info
1083  *	@fl6: flow to lookup
1084  *	@final_dst: final destination address for ipsec lookup
1085  *	@can_sleep: we are in a sleepable context
1086  *
1087  *	This function performs a route lookup on the given flow with the
1088  *	possibility of using the cached route in the socket if it is valid.
1089  *	It will take the socket dst lock when operating on the dst cache.
1090  *	As a result, this function can only be used in process context.
1091  *
1092  *	It returns a valid dst pointer on success, or a pointer encoded
1093  *	error code.
1094  */
1095 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1096 					 const struct in6_addr *final_dst,
1097 					 bool can_sleep)
1098 {
1099 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1100 	int err;
1101 
1102 	dst = ip6_sk_dst_check(sk, dst, fl6);
1103 
1104 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1105 	if (err)
1106 		return ERR_PTR(err);
1107 	if (final_dst)
1108 		fl6->daddr = *final_dst;
1109 	if (can_sleep)
1110 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1111 
1112 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1113 }
1114 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1115 
1116 static inline int ip6_ufo_append_data(struct sock *sk,
1117 			int getfrag(void *from, char *to, int offset, int len,
1118 			int odd, struct sk_buff *skb),
1119 			void *from, int length, int hh_len, int fragheaderlen,
1120 			int transhdrlen, int mtu, unsigned int flags,
1121 			struct rt6_info *rt)
1122 
1123 {
1124 	struct sk_buff *skb;
1125 	int err;
1126 
1127 	/* There is support for UDP large send offload by the network
1128 	 * device, so create one single skb packet containing the complete
1129 	 * udp datagram.
1130 	 */
1131 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1132 		skb = sock_alloc_send_skb(sk,
1133 			hh_len + fragheaderlen + transhdrlen + 20,
1134 			(flags & MSG_DONTWAIT), &err);
1135 		if (skb == NULL)
1136 			return err;
1137 
1138 		/* reserve space for Hardware header */
1139 		skb_reserve(skb, hh_len);
1140 
1141 		/* create space for UDP/IP header */
1142 		skb_put(skb, fragheaderlen + transhdrlen);
1143 
1144 		/* initialize network header pointer */
1145 		skb_reset_network_header(skb);
1146 
1147 		/* initialize protocol header pointer */
1148 		skb->transport_header = skb->network_header + fragheaderlen;
1149 
1150 		skb->ip_summed = CHECKSUM_PARTIAL;
1151 		skb->csum = 0;
1152 	}
1153 
1154 	err = skb_append_datato_frags(sk, skb, getfrag, from,
1155 				      (length - transhdrlen));
1156 	if (!err) {
1157 		struct frag_hdr fhdr;
1158 
1159 		/* Specify the length of each IPv6 datagram fragment.
1160 		 * It has to be a multiple of 8.
1161 		 */
1162 		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1163 					     sizeof(struct frag_hdr)) & ~7;
1164 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1165 		ipv6_select_ident(&fhdr, rt);
1166 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1167 		__skb_queue_tail(&sk->sk_write_queue, skb);
1168 
1169 		return 0;
1170 	}
1171 	/* There is not enough support to do UDP LSO,
1172 	 * so follow the normal path.
1173 	 */
1174 	kfree_skb(skb);
1175 
1176 	return err;
1177 }
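
/*
 * Illustrative gso_size arithmetic for ip6_ufo_append_data() above:
 * with mtu = 1500 and fragheaderlen = 48 (IPv6 header plus routing
 * headers and options, values assumed for the example),
 * (1500 - 48 - 8) & ~7 = 1440, so the UFO path asks the device for
 * 1440-byte pieces, each a multiple of 8 as the fragment header's
 * offset field requires.
 */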
1178 
1179 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1180 					       gfp_t gfp)
1181 {
1182 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1183 }
1184 
1185 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1186 						gfp_t gfp)
1187 {
1188 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1189 }
1190 
1191 static void ip6_append_data_mtu(int *mtu,
1192 				int *maxfraglen,
1193 				unsigned int fragheaderlen,
1194 				struct sk_buff *skb,
1195 				struct rt6_info *rt)
1196 {
1197 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1198 		if (skb == NULL) {
1199 			/* first fragment, reserve header_len */
1200 			*mtu = *mtu - rt->dst.header_len;
1201 
1202 		} else {
1203 			/*
1204 			 * this fragment is not the first; the header
1205 			 * space is regarded as data space.
1206 			 */
1207 			*mtu = dst_mtu(rt->dst.path);
1208 		}
1209 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1210 			      + fragheaderlen - sizeof(struct frag_hdr);
1211 	}
1212 }
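
/*
 * Example of the maxfraglen formula above (illustrative numbers): for
 * *mtu = 1500 and fragheaderlen = 48,
 * ((1500 - 48) & ~7) + 48 - 8 = 1448 + 40 = 1488, so each queued skb
 * carries at most 1488 bytes of headers plus data; after the 8-byte
 * fragment header is inserted the packet is 1496 bytes, within the
 * MTU, and the 1440 bytes of fragmentable data remain eight-byte
 * aligned.
 */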
1213 
1214 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1215 	int offset, int len, int odd, struct sk_buff *skb),
1216 	void *from, int length, int transhdrlen,
1217 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1218 	struct rt6_info *rt, unsigned int flags, int dontfrag)
1219 {
1220 	struct inet_sock *inet = inet_sk(sk);
1221 	struct ipv6_pinfo *np = inet6_sk(sk);
1222 	struct inet_cork *cork;
1223 	struct sk_buff *skb, *skb_prev = NULL;
1224 	unsigned int maxfraglen, fragheaderlen;
1225 	int exthdrlen;
1226 	int dst_exthdrlen;
1227 	int hh_len;
1228 	int mtu;
1229 	int copy;
1230 	int err;
1231 	int offset = 0;
1232 	__u8 tx_flags = 0;
1233 
1234 	if (flags&MSG_PROBE)
1235 		return 0;
1236 	cork = &inet->cork.base;
1237 	if (skb_queue_empty(&sk->sk_write_queue)) {
1238 		/*
1239 		 * setup for corking
1240 		 */
1241 		if (opt) {
1242 			if (WARN_ON(np->cork.opt))
1243 				return -EINVAL;
1244 
1245 			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1246 			if (unlikely(np->cork.opt == NULL))
1247 				return -ENOBUFS;
1248 
1249 			np->cork.opt->tot_len = opt->tot_len;
1250 			np->cork.opt->opt_flen = opt->opt_flen;
1251 			np->cork.opt->opt_nflen = opt->opt_nflen;
1252 
1253 			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1254 							    sk->sk_allocation);
1255 			if (opt->dst0opt && !np->cork.opt->dst0opt)
1256 				return -ENOBUFS;
1257 
1258 			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1259 							    sk->sk_allocation);
1260 			if (opt->dst1opt && !np->cork.opt->dst1opt)
1261 				return -ENOBUFS;
1262 
1263 			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1264 							   sk->sk_allocation);
1265 			if (opt->hopopt && !np->cork.opt->hopopt)
1266 				return -ENOBUFS;
1267 
1268 			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1269 							    sk->sk_allocation);
1270 			if (opt->srcrt && !np->cork.opt->srcrt)
1271 				return -ENOBUFS;
1272 
1273 			/* need source address above. --miyazawa */
1274 		}
1275 		dst_hold(&rt->dst);
1276 		cork->dst = &rt->dst;
1277 		inet->cork.fl.u.ip6 = *fl6;
1278 		np->cork.hop_limit = hlimit;
1279 		np->cork.tclass = tclass;
1280 		if (rt->dst.flags & DST_XFRM_TUNNEL)
1281 			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1282 			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1283 		else
1284 			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1285 			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1286 		if (np->frag_size < mtu) {
1287 			if (np->frag_size)
1288 				mtu = np->frag_size;
1289 		}
1290 		cork->fragsize = mtu;
1291 		if (dst_allfrag(rt->dst.path))
1292 			cork->flags |= IPCORK_ALLFRAG;
1293 		cork->length = 0;
1294 		sk->sk_sndmsg_page = NULL;
1295 		sk->sk_sndmsg_off = 0;
1296 		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1297 		length += exthdrlen;
1298 		transhdrlen += exthdrlen;
1299 		dst_exthdrlen = rt->dst.header_len;
1300 	} else {
1301 		rt = (struct rt6_info *)cork->dst;
1302 		fl6 = &inet->cork.fl.u.ip6;
1303 		opt = np->cork.opt;
1304 		transhdrlen = 0;
1305 		exthdrlen = 0;
1306 		dst_exthdrlen = 0;
1307 		mtu = cork->fragsize;
1308 	}
1309 
1310 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1311 
1312 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1313 			(opt ? opt->opt_nflen : 0);
1314 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1315 
1316 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1317 		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1318 			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1319 			return -EMSGSIZE;
1320 		}
1321 	}
1322 
1323 	/* For UDP, check if TX timestamp is enabled */
1324 	if (sk->sk_type == SOCK_DGRAM) {
1325 		err = sock_tx_timestamp(sk, &tx_flags);
1326 		if (err)
1327 			goto error;
1328 	}
1329 
1330 	/*
1331 	 * Let's try using as much space as possible.
1332 	 * Use MTU if total length of the message fits into the MTU.
1333 	 * Otherwise, we need to reserve fragment header and
1334 	 * fragment alignment (= 8-15 octets, in total).
1335 	 *
1336 	 * Note that we may need to "move" the data from the tail
1337 	 * of the buffer to the new fragment when we split
1338 	 * the message.
1339 	 *
1340 	 * FIXME: It may be fragmented into multiple chunks
1341 	 *        at once if non-fragmentable extension headers
1342 	 *        are too large.
1343 	 * --yoshfuji
1344 	 */
1345 
1346 	cork->length += length;
1347 	if (length > mtu) {
1348 		int proto = sk->sk_protocol;
1349 		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
1350 			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1351 			return -EMSGSIZE;
1352 		}
1353 
1354 		if (proto == IPPROTO_UDP &&
1355 		    (rt->dst.dev->features & NETIF_F_UFO)) {
1356 
1357 			err = ip6_ufo_append_data(sk, getfrag, from, length,
1358 						  hh_len, fragheaderlen,
1359 						  transhdrlen, mtu, flags, rt);
1360 			if (err)
1361 				goto error;
1362 			return 0;
1363 		}
1364 	}
1365 
1366 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1367 		goto alloc_new_skb;
1368 
1369 	while (length > 0) {
1370 		/* Check if the remaining data fits into current packet. */
1371 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1372 		if (copy < length)
1373 			copy = maxfraglen - skb->len;
1374 
1375 		if (copy <= 0) {
1376 			char *data;
1377 			unsigned int datalen;
1378 			unsigned int fraglen;
1379 			unsigned int fraggap;
1380 			unsigned int alloclen;
1381 alloc_new_skb:
1382 			/* There's no room in the current skb */
1383 			if (skb)
1384 				fraggap = skb->len - maxfraglen;
1385 			else
1386 				fraggap = 0;
1387 			/* update mtu and maxfraglen if necessary */
1388 			if (skb == NULL || skb_prev == NULL)
1389 				ip6_append_data_mtu(&mtu, &maxfraglen,
1390 						    fragheaderlen, skb, rt);
1391 
1392 			skb_prev = skb;
1393 
1394 			/*
1395 			 * If remaining data exceeds the mtu,
1396 			 * we know we need more fragment(s).
1397 			 */
1398 			datalen = length + fraggap;
1399 
1400 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1401 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1402 			if ((flags & MSG_MORE) &&
1403 			    !(rt->dst.dev->features&NETIF_F_SG))
1404 				alloclen = mtu;
1405 			else
1406 				alloclen = datalen + fragheaderlen;
1407 
1408 			alloclen += dst_exthdrlen;
1409 
1410 			if (datalen != length + fraggap) {
1411 				/*
1412 				 * this is not the last fragment; the trailer
1413 				 * space is regarded as data space.
1414 				 */
1415 				datalen += rt->dst.trailer_len;
1416 			}
1417 
1418 			alloclen += rt->dst.trailer_len;
1419 			fraglen = datalen + fragheaderlen;
1420 
1421 			/*
1422 			 * We just reserve space for the fragment header.
1423 			 * Note: this may be an overallocation if the message
1424 			 * (without MSG_MORE) fits into the MTU.
1425 			 */
1426 			alloclen += sizeof(struct frag_hdr);
1427 
1428 			if (transhdrlen) {
1429 				skb = sock_alloc_send_skb(sk,
1430 						alloclen + hh_len,
1431 						(flags & MSG_DONTWAIT), &err);
1432 			} else {
1433 				skb = NULL;
1434 				if (atomic_read(&sk->sk_wmem_alloc) <=
1435 				    2 * sk->sk_sndbuf)
1436 					skb = sock_wmalloc(sk,
1437 							   alloclen + hh_len, 1,
1438 							   sk->sk_allocation);
1439 				if (unlikely(skb == NULL))
1440 					err = -ENOBUFS;
1441 				else {
1442 					/* Only the initial fragment
1443 					 * is time stamped.
1444 					 */
1445 					tx_flags = 0;
1446 				}
1447 			}
1448 			if (skb == NULL)
1449 				goto error;
1450 			/*
1451 			 *	Fill in the control structures
1452 			 */
1453 			skb->ip_summed = CHECKSUM_NONE;
1454 			skb->csum = 0;
1455 			/* reserve for fragmentation and ipsec header */
1456 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1457 				    dst_exthdrlen);
1458 
1459 			if (sk->sk_type == SOCK_DGRAM)
1460 				skb_shinfo(skb)->tx_flags = tx_flags;
1461 
1462 			/*
1463 			 *	Find where to start putting bytes
1464 			 */
1465 			data = skb_put(skb, fraglen);
1466 			skb_set_network_header(skb, exthdrlen);
1467 			data += fragheaderlen;
1468 			skb->transport_header = (skb->network_header +
1469 						 fragheaderlen);
1470 			if (fraggap) {
1471 				skb->csum = skb_copy_and_csum_bits(
1472 					skb_prev, maxfraglen,
1473 					data + transhdrlen, fraggap, 0);
1474 				skb_prev->csum = csum_sub(skb_prev->csum,
1475 							  skb->csum);
1476 				data += fraggap;
1477 				pskb_trim_unique(skb_prev, maxfraglen);
1478 			}
1479 			copy = datalen - transhdrlen - fraggap;
1480 
1481 			if (copy < 0) {
1482 				err = -EINVAL;
1483 				kfree_skb(skb);
1484 				goto error;
1485 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1486 				err = -EFAULT;
1487 				kfree_skb(skb);
1488 				goto error;
1489 			}
1490 
1491 			offset += copy;
1492 			length -= datalen - fraggap;
1493 			transhdrlen = 0;
1494 			exthdrlen = 0;
1495 			dst_exthdrlen = 0;
1496 
1497 			/*
1498 			 * Put the packet on the pending queue
1499 			 */
1500 			__skb_queue_tail(&sk->sk_write_queue, skb);
1501 			continue;
1502 		}
1503 
1504 		if (copy > length)
1505 			copy = length;
1506 
1507 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1508 			unsigned int off;
1509 
1510 			off = skb->len;
1511 			if (getfrag(from, skb_put(skb, copy),
1512 						offset, copy, off, skb) < 0) {
1513 				__skb_trim(skb, off);
1514 				err = -EFAULT;
1515 				goto error;
1516 			}
1517 		} else {
1518 			int i = skb_shinfo(skb)->nr_frags;
1519 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1520 			struct page *page = sk->sk_sndmsg_page;
1521 			int off = sk->sk_sndmsg_off;
1522 			unsigned int left;
1523 
1524 			if (page && (left = PAGE_SIZE - off) > 0) {
1525 				if (copy >= left)
1526 					copy = left;
1527 				if (page != skb_frag_page(frag)) {
1528 					if (i == MAX_SKB_FRAGS) {
1529 						err = -EMSGSIZE;
1530 						goto error;
1531 					}
1532 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1533 					skb_frag_ref(skb, i);
1534 					frag = &skb_shinfo(skb)->frags[i];
1535 				}
1536 			} else if (i < MAX_SKB_FRAGS) {
1537 				if (copy > PAGE_SIZE)
1538 					copy = PAGE_SIZE;
1539 				page = alloc_pages(sk->sk_allocation, 0);
1540 				if (page == NULL) {
1541 					err = -ENOMEM;
1542 					goto error;
1543 				}
1544 				sk->sk_sndmsg_page = page;
1545 				sk->sk_sndmsg_off = 0;
1546 
1547 				skb_fill_page_desc(skb, i, page, 0, 0);
1548 				frag = &skb_shinfo(skb)->frags[i];
1549 			} else {
1550 				err = -EMSGSIZE;
1551 				goto error;
1552 			}
1553 			if (getfrag(from,
1554 				    skb_frag_address(frag) + skb_frag_size(frag),
1555 				    offset, copy, skb->len, skb) < 0) {
1556 				err = -EFAULT;
1557 				goto error;
1558 			}
1559 			sk->sk_sndmsg_off += copy;
1560 			skb_frag_size_add(frag, copy);
1561 			skb->len += copy;
1562 			skb->data_len += copy;
1563 			skb->truesize += copy;
1564 			atomic_add(copy, &sk->sk_wmem_alloc);
1565 		}
1566 		offset += copy;
1567 		length -= copy;
1568 	}
1569 	return 0;
1570 error:
1571 	cork->length -= length;
1572 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1573 	return err;
1574 }
1575 EXPORT_SYMBOL_GPL(ip6_append_data);
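
/*
 * A sketch of the typical corked-send sequence built on
 * ip6_append_data() (hypothetical datagram caller, error handling
 * trimmed):
 *
 *	err = ip6_append_data(sk, getfrag, from, length,
 *			      sizeof(struct udphdr), hlimit, tclass,
 *			      opt, &fl6, rt, flags, dontfrag);
 *	if (!err && !(flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);   (build and send)
 *	else if (err)
 *		ip6_flush_pending_frames(sk);        (drop the queue)
 */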
1576 
1577 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1578 {
1579 	if (np->cork.opt) {
1580 		kfree(np->cork.opt->dst0opt);
1581 		kfree(np->cork.opt->dst1opt);
1582 		kfree(np->cork.opt->hopopt);
1583 		kfree(np->cork.opt->srcrt);
1584 		kfree(np->cork.opt);
1585 		np->cork.opt = NULL;
1586 	}
1587 
1588 	if (inet->cork.base.dst) {
1589 		dst_release(inet->cork.base.dst);
1590 		inet->cork.base.dst = NULL;
1591 		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1592 	}
1593 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1594 }
1595 
1596 int ip6_push_pending_frames(struct sock *sk)
1597 {
1598 	struct sk_buff *skb, *tmp_skb;
1599 	struct sk_buff **tail_skb;
1600 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1601 	struct inet_sock *inet = inet_sk(sk);
1602 	struct ipv6_pinfo *np = inet6_sk(sk);
1603 	struct net *net = sock_net(sk);
1604 	struct ipv6hdr *hdr;
1605 	struct ipv6_txoptions *opt = np->cork.opt;
1606 	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1607 	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1608 	unsigned char proto = fl6->flowi6_proto;
1609 	int err = 0;
1610 
1611 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1612 		goto out;
1613 	tail_skb = &(skb_shinfo(skb)->frag_list);
1614 
1615 	/* move skb->data to ip header from ext header */
1616 	if (skb->data < skb_network_header(skb))
1617 		__skb_pull(skb, skb_network_offset(skb));
1618 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1619 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1620 		*tail_skb = tmp_skb;
1621 		tail_skb = &(tmp_skb->next);
1622 		skb->len += tmp_skb->len;
1623 		skb->data_len += tmp_skb->len;
1624 		skb->truesize += tmp_skb->truesize;
1625 		tmp_skb->destructor = NULL;
1626 		tmp_skb->sk = NULL;
1627 	}
1628 
1629 	/* Allow local fragmentation. */
1630 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1631 		skb->local_df = 1;
1632 
1633 	*final_dst = fl6->daddr;
1634 	__skb_pull(skb, skb_network_header_len(skb));
1635 	if (opt && opt->opt_flen)
1636 		ipv6_push_frag_opts(skb, opt, &proto);
1637 	if (opt && opt->opt_nflen)
1638 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1639 
1640 	skb_push(skb, sizeof(struct ipv6hdr));
1641 	skb_reset_network_header(skb);
1642 	hdr = ipv6_hdr(skb);
1643 
1644 	*(__be32*)hdr = fl6->flowlabel |
1645 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1646 
1647 	hdr->hop_limit = np->cork.hop_limit;
1648 	hdr->nexthdr = proto;
1649 	hdr->saddr = fl6->saddr;
1650 	hdr->daddr = *final_dst;
1651 
1652 	skb->priority = sk->sk_priority;
1653 	skb->mark = sk->sk_mark;
1654 
1655 	skb_dst_set(skb, dst_clone(&rt->dst));
1656 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1657 	if (proto == IPPROTO_ICMPV6) {
1658 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1659 
1660 		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1661 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1662 	}
1663 
1664 	err = ip6_local_out(skb);
1665 	if (err) {
1666 		if (err > 0)
1667 			err = net_xmit_errno(err);
1668 		if (err)
1669 			goto error;
1670 	}
1671 
1672 out:
1673 	ip6_cork_release(inet, np);
1674 	return err;
1675 error:
1676 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1677 	goto out;
1678 }
1679 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1680 
1681 void ip6_flush_pending_frames(struct sock *sk)
1682 {
1683 	struct sk_buff *skb;
1684 
1685 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1686 		if (skb_dst(skb))
1687 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1688 				      IPSTATS_MIB_OUTDISCARDS);
1689 		kfree_skb(skb);
1690 	}
1691 
1692 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1693 }
1694 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1695