xref: /linux/net/ipv6/ip6_output.c (revision a508da6cc0093171833efb8376b00473f24221b9)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 	int len;
64 
65 	len = skb->len - sizeof(struct ipv6hdr);
66 	if (len > IPV6_MAXPLEN)
67 		len = 0;
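	/* A payload length above IPV6_MAXPLEN is left as 0: this is the
	 * RFC 2675 jumbogram convention, where the true length travels
	 * in a hop-by-hop jumbo payload option rather than in the fixed
	 * payload_len field.
	 */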
68 	ipv6_hdr(skb)->payload_len = htons(len);
69 
70 	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 		       skb_dst(skb)->dev, dst_output);
72 }
73 
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 	int err;
77 
78 	err = __ip6_local_out(skb);
79 	if (likely(err == 1))
80 		err = dst_output(skb);
81 
82 	return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85 
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89 	skb_reset_mac_header(newskb);
90 	__skb_pull(newskb, skb_network_offset(newskb));
91 	newskb->pkt_type = PACKET_LOOPBACK;
92 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 	WARN_ON(!skb_dst(newskb));
94 
95 	netif_rx_ni(newskb);
96 	return 0;
97 }
98 
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101 	struct dst_entry *dst = skb_dst(skb);
102 	struct net_device *dev = dst->dev;
103 	struct neighbour *neigh;
104 
105 	skb->protocol = htons(ETH_P_IPV6);
106 	skb->dev = dev;
107 
108 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110 
111 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 		    ((mroute6_socket(dev_net(dev), skb) &&
113 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 					 &ipv6_hdr(skb)->saddr))) {
116 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117 
118 			/* Do not check for IFF_ALLMULTI; multicast routing
119 			   is not supported in any case.
120 			 */
121 			if (newskb)
122 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 					newskb, NULL, newskb->dev,
124 					ip6_dev_loopback_xmit);
125 
126 			if (ipv6_hdr(skb)->hop_limit == 0) {
127 				IP6_INC_STATS(dev_net(dev), idev,
128 					      IPSTATS_MIB_OUTDISCARDS);
129 				kfree_skb(skb);
130 				return 0;
131 			}
132 		}
133 
134 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135 				skb->len);
136 	}
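	/* Above: a multicast packet from a local sender with loopback
	 * enabled was cloned back into the stack via ip6_dev_loopback_xmit()
	 * (through the POST_ROUTING hook), while the original keeps going
	 * toward the wire via the neighbour output below.
	 */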
137 
138 	rcu_read_lock();
139 	neigh = dst_get_neighbour_noref(dst);
140 	if (neigh) {
141 		int res = neigh_output(neigh, skb);
142 
143 		rcu_read_unlock();
144 		return res;
145 	}
146 	rcu_read_unlock();
147 	IP6_INC_STATS_BH(dev_net(dst->dev),
148 			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
149 	kfree_skb(skb);
150 	return -EINVAL;
151 }
152 
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156 	    dst_allfrag(skb_dst(skb)))
157 		return ip6_fragment(skb, ip6_finish_output2);
158 	else
159 		return ip6_finish_output2(skb);
160 }
161 
162 int ip6_output(struct sk_buff *skb)
163 {
164 	struct net_device *dev = skb_dst(skb)->dev;
165 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166 	if (unlikely(idev->cnf.disable_ipv6)) {
167 		IP6_INC_STATS(dev_net(dev), idev,
168 			      IPSTATS_MIB_OUTDISCARDS);
169 		kfree_skb(skb);
170 		return 0;
171 	}
172 
173 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174 			    ip6_finish_output,
175 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177 
178 /*
179  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
180  */
181 
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183 	     struct ipv6_txoptions *opt, int tclass)
184 {
185 	struct net *net = sock_net(sk);
186 	struct ipv6_pinfo *np = inet6_sk(sk);
187 	struct in6_addr *first_hop = &fl6->daddr;
188 	struct dst_entry *dst = skb_dst(skb);
189 	struct ipv6hdr *hdr;
190 	u8  proto = fl6->flowi6_proto;
191 	int seg_len = skb->len;
192 	int hlimit = -1;
193 	u32 mtu;
194 
195 	if (opt) {
196 		unsigned int head_room;
197 
198 		/* First: exthdrs may take lots of space (~8K for now);
199 		   MAX_HEADER is not enough.
200 		 */
201 		head_room = opt->opt_nflen + opt->opt_flen;
202 		seg_len += head_room;
203 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
204 
205 		if (skb_headroom(skb) < head_room) {
206 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
207 			if (skb2 == NULL) {
208 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209 					      IPSTATS_MIB_OUTDISCARDS);
210 				kfree_skb(skb);
211 				return -ENOBUFS;
212 			}
213 			consume_skb(skb);
214 			skb = skb2;
215 			skb_set_owner_w(skb, sk);
216 		}
217 		if (opt->opt_flen)
218 			ipv6_push_frag_opts(skb, opt, &proto);
219 		if (opt->opt_nflen)
220 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
221 	}
222 
223 	skb_push(skb, sizeof(struct ipv6hdr));
224 	skb_reset_network_header(skb);
225 	hdr = ipv6_hdr(skb);
226 
227 	/*
228 	 *	Fill in the IPv6 header
229 	 */
230 	if (np)
231 		hlimit = np->hop_limit;
232 	if (hlimit < 0)
233 		hlimit = ip6_dst_hoplimit(dst);
234 
235 	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
236 
237 	hdr->payload_len = htons(seg_len);
238 	hdr->nexthdr = proto;
239 	hdr->hop_limit = hlimit;
240 
241 	hdr->saddr = fl6->saddr;
242 	hdr->daddr = *first_hop;
243 
244 	skb->priority = sk->sk_priority;
245 	skb->mark = sk->sk_mark;
246 
247 	mtu = dst_mtu(dst);
248 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250 			      IPSTATS_MIB_OUT, skb->len);
251 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252 			       dst->dev, dst_output);
253 	}
254 
255 	net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
256 	skb->dev = dst->dev;
257 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
258 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
259 	kfree_skb(skb);
260 	return -EMSGSIZE;
261 }
262 
263 EXPORT_SYMBOL(ip6_xmit);
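
/* Illustrative sketch (not used by this file): the first 32 bits of the
 * IPv6 header pack version (4 bits), traffic class (8 bits) and flow
 * label (20 bits). fl6->flowlabel is already in network byte order and
 * is assumed here to carry only flow-label bits, hence the OR after
 * htonl() in ip6_xmit() above.
 */
static inline __be32 ip6_flow_word_example(int tclass, __be32 flowlabel)
{
	return htonl(0x60000000 | (tclass << 20)) | flowlabel;
}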
264 
265 /*
266  *	To avoid extra problems ND packets are sent through this
267  *	routine. It's code duplication, but I really want to avoid
268  *	extra checks since ipv6_build_header is used by TCP (which
269  *	is performance critical for us).
270  */
271 
272 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
273 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
274 	       int proto, int len)
275 {
276 	struct ipv6_pinfo *np = inet6_sk(sk);
277 	struct ipv6hdr *hdr;
278 
279 	skb->protocol = htons(ETH_P_IPV6);
280 	skb->dev = dev;
281 
282 	skb_reset_network_header(skb);
283 	skb_put(skb, sizeof(struct ipv6hdr));
284 	hdr = ipv6_hdr(skb);
285 
286 	*(__be32*)hdr = htonl(0x60000000);
287 
288 	hdr->payload_len = htons(len);
289 	hdr->nexthdr = proto;
290 	hdr->hop_limit = np->hop_limit;
291 
292 	hdr->saddr = *saddr;
293 	hdr->daddr = *daddr;
294 
295 	return 0;
296 }
297 
298 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
299 {
300 	struct ip6_ra_chain *ra;
301 	struct sock *last = NULL;
302 
303 	read_lock(&ip6_ra_lock);
304 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
305 		struct sock *sk = ra->sk;
306 		if (sk && ra->sel == sel &&
307 		    (!sk->sk_bound_dev_if ||
308 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
309 			if (last) {
310 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
311 				if (skb2)
312 					rawv6_rcv(last, skb2);
313 			}
314 			last = sk;
315 		}
316 	}
317 
318 	if (last) {
319 		rawv6_rcv(last, skb);
320 		read_unlock(&ip6_ra_lock);
321 		return 1;
322 	}
323 	read_unlock(&ip6_ra_lock);
324 	return 0;
325 }
326 
327 static int ip6_forward_proxy_check(struct sk_buff *skb)
328 {
329 	struct ipv6hdr *hdr = ipv6_hdr(skb);
330 	u8 nexthdr = hdr->nexthdr;
331 	__be16 frag_off;
332 	int offset;
333 
334 	if (ipv6_ext_hdr(nexthdr)) {
335 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
336 		if (offset < 0)
337 			return 0;
338 	} else
339 		offset = sizeof(struct ipv6hdr);
340 
341 	if (nexthdr == IPPROTO_ICMPV6) {
342 		struct icmp6hdr *icmp6;
343 
344 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
345 					 offset + 1 - skb->data)))
346 			return 0;
347 
348 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
349 
350 		switch (icmp6->icmp6_type) {
351 		case NDISC_ROUTER_SOLICITATION:
352 		case NDISC_ROUTER_ADVERTISEMENT:
353 		case NDISC_NEIGHBOUR_SOLICITATION:
354 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
355 		case NDISC_REDIRECT:
356 			/* For reactions involving a unicast neighbor discovery
357 			 * message destined to the proxied address, pass it to
358 			 * the input function.
359 			 */
360 			return 1;
361 		default:
362 			break;
363 		}
364 	}
365 
366 	/*
367 	 * The proxying router can't forward traffic sent to a link-local
368 	 * address, so signal the sender and discard the packet. This
369 	 * behavior is clarified by the MIPv6 specification.
370 	 */
371 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372 		dst_link_failure(skb);
373 		return -1;
374 	}
375 
376 	return 0;
377 }
378 
379 static inline int ip6_forward_finish(struct sk_buff *skb)
380 {
381 	return dst_output(skb);
382 }
383 
384 int ip6_forward(struct sk_buff *skb)
385 {
386 	struct dst_entry *dst = skb_dst(skb);
387 	struct ipv6hdr *hdr = ipv6_hdr(skb);
388 	struct inet6_skb_parm *opt = IP6CB(skb);
389 	struct net *net = dev_net(dst->dev);
390 	u32 mtu;
391 
392 	if (net->ipv6.devconf_all->forwarding == 0)
393 		goto error;
394 
395 	if (skb_warn_if_lro(skb))
396 		goto drop;
397 
398 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
399 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
400 		goto drop;
401 	}
402 
403 	if (skb->pkt_type != PACKET_HOST)
404 		goto drop;
405 
406 	skb_forward_csum(skb);
407 
408 	/*
409 	 *	We DO NOT do any processing on
410 	 *	RA packets; we push them to user level AS IS
411 	 *	without any warranty that the application will be able
412 	 *	to interpret them. The reason is that we
413 	 *	cannot do anything clever here.
414 	 *
415 	 *	We are not the end node, so if the packet contains
416 	 *	AH/ESP we cannot do anything.
417 	 *	Defragmentation would also be a mistake; RA packets
418 	 *	must not be fragmented, because there is no guarantee
419 	 *	that different fragments will travel along one path. --ANK
420 	 */
421 	if (opt->ra) {
422 		u8 *ptr = skb_network_header(skb) + opt->ra;
423 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
424 			return 0;
425 	}
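	/* Sketch of the TLV that ptr points at (RFC 2711 router alert;
	 * the struct name is illustrative only, not defined in this file):
	 *
	 *	struct ip6_ra_tlv {
	 *		u8     type;	// option type
	 *		u8     len;	// option data length (2)
	 *		__be16 value;	// selector: (ptr[2] << 8) + ptr[3]
	 *	};
	 */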
426 
427 	/*
428 	 *	check and decrement the hop limit
429 	 */
430 	if (hdr->hop_limit <= 1) {
431 		/* Force OUTPUT device used as source address */
432 		skb->dev = dst->dev;
433 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
434 		IP6_INC_STATS_BH(net,
435 				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
436 
437 		kfree_skb(skb);
438 		return -ETIMEDOUT;
439 	}
440 
441 	/* XXX: idev->cnf.proxy_ndp? */
442 	if (net->ipv6.devconf_all->proxy_ndp &&
443 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
444 		int proxied = ip6_forward_proxy_check(skb);
445 		if (proxied > 0)
446 			return ip6_input(skb);
447 		else if (proxied < 0) {
448 			IP6_INC_STATS(net, ip6_dst_idev(dst),
449 				      IPSTATS_MIB_INDISCARDS);
450 			goto drop;
451 		}
452 	}
453 
454 	if (!xfrm6_route_forward(skb)) {
455 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
456 		goto drop;
457 	}
458 	dst = skb_dst(skb);
459 
460 	/* IPv6 specs say nothing about it, but it is clear that we cannot
461 	   send redirects to source routed frames.
462 	   We don't send redirects to frames decapsulated from IPsec.
463 	 */
464 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
465 		struct in6_addr *target = NULL;
466 		struct rt6_info *rt;
467 
468 		/*
469 		 *	incoming and outgoing devices are the same
470 		 *	send a redirect.
471 		 */
472 
473 		rt = (struct rt6_info *) dst;
474 		if (rt->rt6i_flags & RTF_GATEWAY)
475 			target = &rt->rt6i_gateway;
476 		else
477 			target = &hdr->daddr;
478 
479 		if (!rt->rt6i_peer)
480 			rt6_bind_peer(rt, 1);
481 
482 		/* Limit redirects both by destination (here)
483 		   and by source (inside ndisc_send_redirect)
484 		 */
485 		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
486 			ndisc_send_redirect(skb, target);
487 	} else {
488 		int addrtype = ipv6_addr_type(&hdr->saddr);
489 
490 		/* This check is security critical. */
491 		if (addrtype == IPV6_ADDR_ANY ||
492 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493 			goto error;
494 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 				    ICMPV6_NOT_NEIGHBOUR, 0);
497 			goto error;
498 		}
499 	}
500 
501 	mtu = dst_mtu(dst);
502 	if (mtu < IPV6_MIN_MTU)
503 		mtu = IPV6_MIN_MTU;
504 
505 	if (skb->len > mtu && !skb_is_gso(skb)) {
506 		/* Again, force OUTPUT device used as source address */
507 		skb->dev = dst->dev;
508 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 		IP6_INC_STATS_BH(net,
510 				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511 		IP6_INC_STATS_BH(net,
512 				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
513 		kfree_skb(skb);
514 		return -EMSGSIZE;
515 	}
516 
517 	if (skb_cow(skb, dst->dev->hard_header_len)) {
518 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
519 		goto drop;
520 	}
521 
522 	hdr = ipv6_hdr(skb);
523 
524 	/* Mangling hops number delayed to point after skb COW */
525 
526 	hdr->hop_limit--;
527 
528 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
530 		       ip6_forward_finish);
531 
532 error:
533 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
534 drop:
535 	kfree_skb(skb);
536 	return -EINVAL;
537 }
538 
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
540 {
541 	to->pkt_type = from->pkt_type;
542 	to->priority = from->priority;
543 	to->protocol = from->protocol;
544 	skb_dst_drop(to);
545 	skb_dst_set(to, dst_clone(skb_dst(from)));
546 	to->dev = from->dev;
547 	to->mark = from->mark;
548 
549 #ifdef CONFIG_NET_SCHED
550 	to->tc_index = from->tc_index;
551 #endif
552 	nf_copy(to, from);
553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
554     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
555 	to->nf_trace = from->nf_trace;
556 #endif
557 	skb_copy_secmark(to, from);
558 }
559 
560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
561 {
562 	u16 offset = sizeof(struct ipv6hdr);
563 	struct ipv6_opt_hdr *exthdr =
564 				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
565 	unsigned int packet_len = skb->tail - skb->network_header;
566 	int found_rhdr = 0;
567 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
568 
569 	while (offset + 1 <= packet_len) {
570 
571 		switch (**nexthdr) {
572 
573 		case NEXTHDR_HOP:
574 			break;
575 		case NEXTHDR_ROUTING:
576 			found_rhdr = 1;
577 			break;
578 		case NEXTHDR_DEST:
579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
580 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
581 				break;
582 #endif
583 			if (found_rhdr)
584 				return offset;
585 			break;
586 		default:
587 			return offset;
588 		}
589 
590 		offset += ipv6_optlen(exthdr);
591 		*nexthdr = &exthdr->nexthdr;
592 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
593 						 offset);
594 	}
595 
596 	return offset;
597 }
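
/* Worked example (illustrative): for a packet laid out as
 * IPv6 | Hop-by-Hop | Routing | Destination | TCP, the walk above passes
 * the hop-by-hop header, sets found_rhdr at the routing header, and
 * returns at the destination-options header that follows it, so the
 * fragment header is inserted before that header; everything earlier
 * forms the unfragmentable part (RFC 2460, section 4.5).
 */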
598 
599 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
600 {
601 	static atomic_t ipv6_fragmentation_id;
602 	int old, new;
603 
604 	if (rt && !(rt->dst.flags & DST_NOPEER)) {
605 		struct inet_peer *peer;
606 
607 		if (!rt->rt6i_peer)
608 			rt6_bind_peer(rt, 1);
609 		peer = rt->rt6i_peer;
610 		if (peer) {
611 			fhdr->identification = htonl(inet_getid(peer, 0));
612 			return;
613 		}
614 	}
615 	do {
616 		old = atomic_read(&ipv6_fragmentation_id);
617 		new = old + 1;
618 		if (!new)
619 			new = 1;
620 	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
621 	fhdr->identification = htonl(new);
622 }
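
/* Fragment IDs thus come from the per-destination inet_peer counter when
 * a peer is available, and otherwise from a global lock-free cmpxchg
 * counter. The fallback counter deliberately skips 0, which keeps
 * ip6_fragment()'s "frag_id == 0 means not yet chosen" test safe on
 * that path.
 */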
623 
624 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
625 {
626 	struct sk_buff *frag;
627 	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
628 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
629 	struct ipv6hdr *tmp_hdr;
630 	struct frag_hdr *fh;
631 	unsigned int mtu, hlen, left, len;
632 	int hroom, troom;
633 	__be32 frag_id = 0;
634 	int ptr, offset = 0, err=0;
635 	u8 *prevhdr, nexthdr = 0;
636 	struct net *net = dev_net(skb_dst(skb)->dev);
637 
638 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
639 	nexthdr = *prevhdr;
640 
641 	mtu = ip6_skb_dst_mtu(skb);
642 
643 	/* We must not fragment if the socket is set to force MTU discovery
644 	 * or if the skb is not generated by a local socket.
645 	 */
646 	if (!skb->local_df && skb->len > mtu) {
647 		skb->dev = skb_dst(skb)->dev;
648 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
649 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
650 			      IPSTATS_MIB_FRAGFAILS);
651 		kfree_skb(skb);
652 		return -EMSGSIZE;
653 	}
654 
655 	if (np && np->frag_size < mtu) {
656 		if (np->frag_size)
657 			mtu = np->frag_size;
658 	}
659 	mtu -= hlen + sizeof(struct frag_hdr);
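	/* Worked example (illustrative numbers): with a 1500-byte path MTU
	 * and a 40-byte unfragmentable part, mtu becomes
	 * 1500 - 40 - 8 = 1452 bytes of payload per fragment; the slow
	 * path below then rounds non-final fragments down to 1448 (& ~7).
	 */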
660 
661 	if (skb_has_frag_list(skb)) {
662 		int first_len = skb_pagelen(skb);
663 		struct sk_buff *frag2;
664 
665 		if (first_len - hlen > mtu ||
666 		    ((first_len - hlen) & 7) ||
667 		    skb_cloned(skb))
668 			goto slow_path;
669 
670 		skb_walk_frags(skb, frag) {
671 			/* Correct geometry. */
672 			if (frag->len > mtu ||
673 			    ((frag->len & 7) && frag->next) ||
674 			    skb_headroom(frag) < hlen)
675 				goto slow_path_clean;
676 
677 			/* Partially cloned skb? */
678 			if (skb_shared(frag))
679 				goto slow_path_clean;
680 
681 			BUG_ON(frag->sk);
682 			if (skb->sk) {
683 				frag->sk = skb->sk;
684 				frag->destructor = sock_wfree;
685 			}
686 			skb->truesize -= frag->truesize;
687 		}
688 
689 		err = 0;
690 		offset = 0;
691 		frag = skb_shinfo(skb)->frag_list;
692 		skb_frag_list_init(skb);
693 		/* BUILD HEADER */
694 
695 		*prevhdr = NEXTHDR_FRAGMENT;
696 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
697 		if (!tmp_hdr) {
698 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
699 				      IPSTATS_MIB_FRAGFAILS);
700 			return -ENOMEM;
701 		}
702 
703 		__skb_pull(skb, hlen);
704 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
705 		__skb_push(skb, hlen);
706 		skb_reset_network_header(skb);
707 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
708 
709 		ipv6_select_ident(fh, rt);
710 		fh->nexthdr = nexthdr;
711 		fh->reserved = 0;
712 		fh->frag_off = htons(IP6_MF);
713 		frag_id = fh->identification;
714 
715 		first_len = skb_pagelen(skb);
716 		skb->data_len = first_len - skb_headlen(skb);
717 		skb->len = first_len;
718 		ipv6_hdr(skb)->payload_len = htons(first_len -
719 						   sizeof(struct ipv6hdr));
720 
721 		dst_hold(&rt->dst);
722 
723 		for (;;) {
724 			/* Prepare header of the next frame,
725 			 * before the previous one goes down. */
726 			if (frag) {
727 				frag->ip_summed = CHECKSUM_NONE;
728 				skb_reset_transport_header(frag);
729 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
730 				__skb_push(frag, hlen);
731 				skb_reset_network_header(frag);
732 				memcpy(skb_network_header(frag), tmp_hdr,
733 				       hlen);
734 				offset += skb->len - hlen - sizeof(struct frag_hdr);
735 				fh->nexthdr = nexthdr;
736 				fh->reserved = 0;
737 				fh->frag_off = htons(offset);
738 				if (frag->next != NULL)
739 					fh->frag_off |= htons(IP6_MF);
740 				fh->identification = frag_id;
741 				ipv6_hdr(frag)->payload_len =
742 						htons(frag->len -
743 						      sizeof(struct ipv6hdr));
744 				ip6_copy_metadata(frag, skb);
745 			}
746 
747 			err = output(skb);
748 			if (!err)
749 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
750 					      IPSTATS_MIB_FRAGCREATES);
751 
752 			if (err || !frag)
753 				break;
754 
755 			skb = frag;
756 			frag = skb->next;
757 			skb->next = NULL;
758 		}
759 
760 		kfree(tmp_hdr);
761 
762 		if (err == 0) {
763 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
764 				      IPSTATS_MIB_FRAGOKS);
765 			dst_release(&rt->dst);
766 			return 0;
767 		}
768 
769 		while (frag) {
770 			skb = frag->next;
771 			kfree_skb(frag);
772 			frag = skb;
773 		}
774 
775 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
776 			      IPSTATS_MIB_FRAGFAILS);
777 		dst_release(&rt->dst);
778 		return err;
779 
780 slow_path_clean:
781 		skb_walk_frags(skb, frag2) {
782 			if (frag2 == frag)
783 				break;
784 			frag2->sk = NULL;
785 			frag2->destructor = NULL;
786 			skb->truesize += frag2->truesize;
787 		}
788 	}
789 
790 slow_path:
791 	left = skb->len - hlen;		/* Space per frame */
792 	ptr = hlen;			/* Where to start from */
793 
794 	/*
795 	 *	Fragment the datagram.
796 	 */
797 
798 	*prevhdr = NEXTHDR_FRAGMENT;
799 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
800 	troom = rt->dst.dev->needed_tailroom;
801 
802 	/*
803 	 *	Keep copying data until we run out.
804 	 */
805 	while (left > 0) {
806 		len = left;
807 		/* If it doesn't fit, use 'mtu', the data space left */
808 		if (len > mtu)
809 			len = mtu;
810 		/* If we are not sending up to and including the end of the
811 		   packet, align the next start on an eight-byte boundary. */
812 		if (len < left)	{
813 			len &= ~7;
814 		}
815 		/*
816 		 *	Allocate buffer.
817 		 */
818 
819 		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
820 				      hroom + troom, GFP_ATOMIC)) == NULL) {
821 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
822 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
823 				      IPSTATS_MIB_FRAGFAILS);
824 			err = -ENOMEM;
825 			goto fail;
826 		}
827 
828 		/*
829 		 *	Set up data on packet
830 		 */
831 
832 		ip6_copy_metadata(frag, skb);
833 		skb_reserve(frag, hroom);
834 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
835 		skb_reset_network_header(frag);
836 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
837 		frag->transport_header = (frag->network_header + hlen +
838 					  sizeof(struct frag_hdr));
839 
840 		/*
841 		 *	Charge the memory for the fragment to any owner
842 		 *	it might possess
843 		 */
844 		if (skb->sk)
845 			skb_set_owner_w(frag, skb->sk);
846 
847 		/*
848 		 *	Copy the packet header into the new buffer.
849 		 */
850 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
851 
852 		/*
853 		 *	Build fragment header.
854 		 */
855 		fh->nexthdr = nexthdr;
856 		fh->reserved = 0;
857 		if (!frag_id) {
858 			ipv6_select_ident(fh, rt);
859 			frag_id = fh->identification;
860 		} else
861 			fh->identification = frag_id;
862 
863 		/*
864 		 *	Copy a block of the IP datagram.
865 		 */
866 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
867 			BUG();
868 		left -= len;
869 
870 		fh->frag_off = htons(offset);
871 		if (left > 0)
872 			fh->frag_off |= htons(IP6_MF);
873 		ipv6_hdr(frag)->payload_len = htons(frag->len -
874 						    sizeof(struct ipv6hdr));
875 
876 		ptr += len;
877 		offset += len;
878 
879 		/*
880 		 *	Put this fragment into the sending queue.
881 		 */
882 		err = output(frag);
883 		if (err)
884 			goto fail;
885 
886 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
887 			      IPSTATS_MIB_FRAGCREATES);
888 	}
889 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
890 		      IPSTATS_MIB_FRAGOKS);
891 	consume_skb(skb);
892 	return err;
893 
894 fail:
895 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
896 		      IPSTATS_MIB_FRAGFAILS);
897 	kfree_skb(skb);
898 	return err;
899 }
900 
901 static inline int ip6_rt_check(const struct rt6key *rt_key,
902 			       const struct in6_addr *fl_addr,
903 			       const struct in6_addr *addr_cache)
904 {
905 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
906 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
907 }
908 
909 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
910 					  struct dst_entry *dst,
911 					  const struct flowi6 *fl6)
912 {
913 	struct ipv6_pinfo *np = inet6_sk(sk);
914 	struct rt6_info *rt = (struct rt6_info *)dst;
915 
916 	if (!dst)
917 		goto out;
918 
919 	/* Yes, checking route validity in the not-connected
920 	 * case is not very simple. Take into account
921 	 * that we do not support routing by source, TOS,
922 	 * or MSG_DONTROUTE 		--ANK (980726)
923 	 *
924 	 * 1. ip6_rt_check(): If the route was a host route,
925 	 *    check that the cached destination is current.
926 	 *    If it is a network route, we still may
927 	 *    check its validity using the saved pointer
928 	 *    to the last used address: daddr_cache.
929 	 *    We do not want to save the whole address now
930 	 *    (because the main consumer of this service
931 	 *    is tcp, which does not have this problem),
932 	 *    so the last trick works only on connected
933 	 *    sockets.
934 	 * 2. The oif also should be the same.
935 	 */
936 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
937 #ifdef CONFIG_IPV6_SUBTREES
938 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
939 #endif
940 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
941 		dst_release(dst);
942 		dst = NULL;
943 	}
944 
945 out:
946 	return dst;
947 }
948 
949 static int ip6_dst_lookup_tail(struct sock *sk,
950 			       struct dst_entry **dst, struct flowi6 *fl6)
951 {
952 	struct net *net = sock_net(sk);
953 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
954 	struct neighbour *n;
955 #endif
956 	int err;
957 
958 	if (*dst == NULL)
959 		*dst = ip6_route_output(net, sk, fl6);
960 
961 	if ((err = (*dst)->error))
962 		goto out_err_release;
963 
964 	if (ipv6_addr_any(&fl6->saddr)) {
965 		struct rt6_info *rt = (struct rt6_info *) *dst;
966 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
967 					  sk ? inet6_sk(sk)->srcprefs : 0,
968 					  &fl6->saddr);
969 		if (err)
970 			goto out_err_release;
971 	}
972 
973 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
974 	/*
975 	 * Here if the dst entry we've looked up
976 	 * has a neighbour entry that is in the INCOMPLETE
977 	 * state and the src address from the flow is
978 	 * marked as OPTIMISTIC, we release the found
979 	 * dst entry and replace it with the
980 	 * dst entry of the nexthop router.
981 	 */
982 	rcu_read_lock();
983 	n = dst_get_neighbour_noref(*dst);
984 	if (n && !(n->nud_state & NUD_VALID)) {
985 		struct inet6_ifaddr *ifp;
986 		struct flowi6 fl_gw6;
987 		int redirect;
988 
989 		rcu_read_unlock();
990 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
991 				      (*dst)->dev, 1);
992 
993 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
994 		if (ifp)
995 			in6_ifa_put(ifp);
996 
997 		if (redirect) {
998 			/*
999 			 * We need to get the dst entry for the
1000 			 * default router instead
1001 			 */
1002 			dst_release(*dst);
1003 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1004 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1005 			*dst = ip6_route_output(net, sk, &fl_gw6);
1006 			if ((err = (*dst)->error))
1007 				goto out_err_release;
1008 		}
1009 	} else {
1010 		rcu_read_unlock();
1011 	}
1012 #endif
1013 
1014 	return 0;
1015 
1016 out_err_release:
1017 	if (err == -ENETUNREACH)
1018 		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1019 	dst_release(*dst);
1020 	*dst = NULL;
1021 	return err;
1022 }
1023 
1024 /**
1025  *	ip6_dst_lookup - perform route lookup on flow
1026  *	@sk: socket which provides route info
1027  *	@dst: pointer to dst_entry * for result
1028  *	@fl6: flow to lookup
1029  *
1030  *	This function performs a route lookup on the given flow.
1031  *
1032  *	It returns zero on success, or a standard errno code on error.
1033  */
1034 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1035 {
1036 	*dst = NULL;
1037 	return ip6_dst_lookup_tail(sk, dst, fl6);
1038 }
1039 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1040 
1041 /**
1042  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1043  *	@sk: socket which provides route info
1044  *	@fl6: flow to lookup
1045  *	@final_dst: final destination address for ipsec lookup
1046  *	@can_sleep: we are in a sleepable context
1047  *
1048  *	This function performs a route lookup on the given flow.
1049  *
1050  *	It returns a valid dst pointer on success, or a pointer encoded
1051  *	error code.
1052  */
1053 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1054 				      const struct in6_addr *final_dst,
1055 				      bool can_sleep)
1056 {
1057 	struct dst_entry *dst = NULL;
1058 	int err;
1059 
1060 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1061 	if (err)
1062 		return ERR_PTR(err);
1063 	if (final_dst)
1064 		fl6->daddr = *final_dst;
1065 	if (can_sleep)
1066 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1067 
1068 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1069 }
1070 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
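
/* Minimal caller sketch (assumed context; names illustrative): the
 * return value is either a valid dst or an ERR_PTR-encoded errno, so
 * callers must test with IS_ERR() rather than for NULL.
 *
 *	struct dst_entry *dst;
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */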
1071 
1072 /**
1073  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1074  *	@sk: socket which provides the dst cache and route info
1075  *	@fl6: flow to lookup
1076  *	@final_dst: final destination address for ipsec lookup
1077  *	@can_sleep: we are in a sleepable context
1078  *
1079  *	This function performs a route lookup on the given flow with the
1080  *	possibility of using the cached route in the socket if it is valid.
1081  *	It will take the socket dst lock when operating on the dst cache.
1082  *	As a result, this function can only be used in process context.
1083  *
1084  *	It returns a valid dst pointer on success, or a pointer encoded
1085  *	error code.
1086  */
1087 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1088 					 const struct in6_addr *final_dst,
1089 					 bool can_sleep)
1090 {
1091 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1092 	int err;
1093 
1094 	dst = ip6_sk_dst_check(sk, dst, fl6);
1095 
1096 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1097 	if (err)
1098 		return ERR_PTR(err);
1099 	if (final_dst)
1100 		fl6->daddr = *final_dst;
1101 	if (can_sleep)
1102 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1103 
1104 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1105 }
1106 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1107 
1108 static inline int ip6_ufo_append_data(struct sock *sk,
1109 			int getfrag(void *from, char *to, int offset, int len,
1110 			int odd, struct sk_buff *skb),
1111 			void *from, int length, int hh_len, int fragheaderlen,
1112 			int transhdrlen, int mtu, unsigned int flags,
1113 			struct rt6_info *rt)
1114 
1115 {
1116 	struct sk_buff *skb;
1117 	int err;
1118 
1119 	/* The network device supports UDP large send offload,
1120 	 * so create a single skb containing the complete
1121 	 * UDP datagram.
1122 	 */
1123 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1124 		skb = sock_alloc_send_skb(sk,
1125 			hh_len + fragheaderlen + transhdrlen + 20,
1126 			(flags & MSG_DONTWAIT), &err);
1127 		if (skb == NULL)
1128 			return err;
1129 
1130 		/* reserve space for Hardware header */
1131 		skb_reserve(skb, hh_len);
1132 
1133 		/* create space for UDP/IP header */
1134 		skb_put(skb, fragheaderlen + transhdrlen);
1135 
1136 		/* initialize network header pointer */
1137 		skb_reset_network_header(skb);
1138 
1139 		/* initialize protocol header pointer */
1140 		skb->transport_header = skb->network_header + fragheaderlen;
1141 
1142 		skb->ip_summed = CHECKSUM_PARTIAL;
1143 		skb->csum = 0;
1144 	}
1145 
1146 	err = skb_append_datato_frags(sk, skb, getfrag, from,
1147 				      (length - transhdrlen));
1148 	if (!err) {
1149 		struct frag_hdr fhdr;
1150 
1151 		/* Specify the length of each IPv6 datagram fragment.
1152 		 * It has to be a multiple of 8.
1153 		 */
1154 		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1155 					     sizeof(struct frag_hdr)) & ~7;
1156 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1157 		ipv6_select_ident(&fhdr, rt);
1158 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1159 		__skb_queue_tail(&sk->sk_write_queue, skb);
1160 
1161 		return 0;
1162 	}
1163 	/* There is not enough support to do UDP LSO,
1164 	 * so follow the normal path.
1165 	 */
1166 	kfree_skb(skb);
1167 
1168 	return err;
1169 }
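
/* Worked example (illustrative): with mtu = 1500 and fragheaderlen = 40,
 * gso_size = (1500 - 40 - 8) & ~7 = 1448, so the UFO device emits
 * fragment payloads that are a multiple of 8, as RFC 2460 requires for
 * all but the final fragment.
 */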
1170 
1171 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1172 					       gfp_t gfp)
1173 {
1174 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1175 }
1176 
1177 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1178 						gfp_t gfp)
1179 {
1180 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1181 }
1182 
1183 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1184 	int offset, int len, int odd, struct sk_buff *skb),
1185 	void *from, int length, int transhdrlen,
1186 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1187 	struct rt6_info *rt, unsigned int flags, int dontfrag)
1188 {
1189 	struct inet_sock *inet = inet_sk(sk);
1190 	struct ipv6_pinfo *np = inet6_sk(sk);
1191 	struct inet_cork *cork;
1192 	struct sk_buff *skb;
1193 	unsigned int maxfraglen, fragheaderlen;
1194 	int exthdrlen;
1195 	int dst_exthdrlen;
1196 	int hh_len;
1197 	int mtu;
1198 	int copy;
1199 	int err;
1200 	int offset = 0;
1201 	int csummode = CHECKSUM_NONE;
1202 	__u8 tx_flags = 0;
1203 
1204 	if (flags&MSG_PROBE)
1205 		return 0;
1206 	cork = &inet->cork.base;
1207 	if (skb_queue_empty(&sk->sk_write_queue)) {
1208 		/*
1209 		 * setup for corking
1210 		 */
1211 		if (opt) {
1212 			if (WARN_ON(np->cork.opt))
1213 				return -EINVAL;
1214 
1215 			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1216 			if (unlikely(np->cork.opt == NULL))
1217 				return -ENOBUFS;
1218 
1219 			np->cork.opt->tot_len = opt->tot_len;
1220 			np->cork.opt->opt_flen = opt->opt_flen;
1221 			np->cork.opt->opt_nflen = opt->opt_nflen;
1222 
1223 			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1224 							    sk->sk_allocation);
1225 			if (opt->dst0opt && !np->cork.opt->dst0opt)
1226 				return -ENOBUFS;
1227 
1228 			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1229 							    sk->sk_allocation);
1230 			if (opt->dst1opt && !np->cork.opt->dst1opt)
1231 				return -ENOBUFS;
1232 
1233 			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1234 							   sk->sk_allocation);
1235 			if (opt->hopopt && !np->cork.opt->hopopt)
1236 				return -ENOBUFS;
1237 
1238 			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1239 							    sk->sk_allocation);
1240 			if (opt->srcrt && !np->cork.opt->srcrt)
1241 				return -ENOBUFS;
1242 
1243 			/* need source address above --miyazawa */
1244 		}
1245 		dst_hold(&rt->dst);
1246 		cork->dst = &rt->dst;
1247 		inet->cork.fl.u.ip6 = *fl6;
1248 		np->cork.hop_limit = hlimit;
1249 		np->cork.tclass = tclass;
1250 		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1251 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1252 		if (np->frag_size < mtu) {
1253 			if (np->frag_size)
1254 				mtu = np->frag_size;
1255 		}
1256 		cork->fragsize = mtu;
1257 		if (dst_allfrag(rt->dst.path))
1258 			cork->flags |= IPCORK_ALLFRAG;
1259 		cork->length = 0;
1260 		sk->sk_sndmsg_page = NULL;
1261 		sk->sk_sndmsg_off = 0;
1262 		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1263 		length += exthdrlen;
1264 		transhdrlen += exthdrlen;
1265 		dst_exthdrlen = rt->dst.header_len;
1266 	} else {
1267 		rt = (struct rt6_info *)cork->dst;
1268 		fl6 = &inet->cork.fl.u.ip6;
1269 		opt = np->cork.opt;
1270 		transhdrlen = 0;
1271 		exthdrlen = 0;
1272 		dst_exthdrlen = 0;
1273 		mtu = cork->fragsize;
1274 	}
1275 
1276 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1277 
1278 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1279 			(opt ? opt->opt_nflen : 0);
1280 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
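	/* Worked example (illustrative): mtu = 1500 and fragheaderlen = 40
	 * give maxfraglen = (1460 & ~7) + 40 - 8 = 1488 bytes per queued
	 * fragment (network header included), which keeps payloads 8-byte
	 * aligned once the 8-byte fragment header is added.
	 */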
1281 
1282 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1283 		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1284 			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1285 			return -EMSGSIZE;
1286 		}
1287 	}
1288 
1289 	/* For UDP, check if TX timestamp is enabled */
1290 	if (sk->sk_type == SOCK_DGRAM) {
1291 		err = sock_tx_timestamp(sk, &tx_flags);
1292 		if (err)
1293 			goto error;
1294 	}
1295 
1296 	/*
1297 	 * Let's try using as much space as possible.
1298 	 * Use MTU if total length of the message fits into the MTU.
1299 	 * Otherwise, we need to reserve fragment header and
1300 	 * fragment alignment (= 8-15 octets, in total).
1301 	 *
1302 	 * Note that we may need to "move" the data from the tail
1303 	 * of the buffer to the new fragment when we split
1304 	 * the message.
1305 	 *
1306 	 * FIXME: It may be fragmented into multiple chunks
1307 	 *        at once if non-fragmentable extension headers
1308 	 *        are too large.
1309 	 * --yoshfuji
1310 	 */
1311 
1312 	cork->length += length;
1313 	if (length > mtu) {
1314 		int proto = sk->sk_protocol;
1315 		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
1316 			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1317 			return -EMSGSIZE;
1318 		}
1319 
1320 		if (proto == IPPROTO_UDP &&
1321 		    (rt->dst.dev->features & NETIF_F_UFO)) {
1322 
1323 			err = ip6_ufo_append_data(sk, getfrag, from, length,
1324 						  hh_len, fragheaderlen,
1325 						  transhdrlen, mtu, flags, rt);
1326 			if (err)
1327 				goto error;
1328 			return 0;
1329 		}
1330 	}
1331 
1332 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1333 		goto alloc_new_skb;
1334 
1335 	while (length > 0) {
1336 		/* Check if the remaining data fits into current packet. */
1337 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1338 		if (copy < length)
1339 			copy = maxfraglen - skb->len;
1340 
1341 		if (copy <= 0) {
1342 			char *data;
1343 			unsigned int datalen;
1344 			unsigned int fraglen;
1345 			unsigned int fraggap;
1346 			unsigned int alloclen;
1347 			struct sk_buff *skb_prev;
1348 alloc_new_skb:
1349 			skb_prev = skb;
1350 
1351 			/* There's no room in the current skb */
1352 			if (skb_prev)
1353 				fraggap = skb_prev->len - maxfraglen;
1354 			else
1355 				fraggap = 0;
1356 
1357 			/*
1358 			 * If remaining data exceeds the mtu,
1359 			 * we know we need more fragment(s).
1360 			 */
1361 			datalen = length + fraggap;
1362 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1363 				datalen = maxfraglen - fragheaderlen;
1364 
1365 			fraglen = datalen + fragheaderlen;
1366 			if ((flags & MSG_MORE) &&
1367 			    !(rt->dst.dev->features&NETIF_F_SG))
1368 				alloclen = mtu;
1369 			else
1370 				alloclen = datalen + fragheaderlen;
1371 
1372 			alloclen += dst_exthdrlen;
1373 
1374 			/*
1375 			 * The last fragment gets additional space at tail.
1376 			 * Note: we overallocate on fragments with MSG_MORE
1377 			 * because we have no idea if we're the last one.
1378 			 */
1379 			if (datalen == length + fraggap)
1380 				alloclen += rt->dst.trailer_len;
1381 
1382 			/*
1383 			 * We just reserve space for fragment header.
1384 			 * Note: this may be overallocation if the message
1385 			 * (without MSG_MORE) fits into the MTU.
1386 			 */
1387 			alloclen += sizeof(struct frag_hdr);
1388 
1389 			if (transhdrlen) {
1390 				skb = sock_alloc_send_skb(sk,
1391 						alloclen + hh_len,
1392 						(flags & MSG_DONTWAIT), &err);
1393 			} else {
1394 				skb = NULL;
1395 				if (atomic_read(&sk->sk_wmem_alloc) <=
1396 				    2 * sk->sk_sndbuf)
1397 					skb = sock_wmalloc(sk,
1398 							   alloclen + hh_len, 1,
1399 							   sk->sk_allocation);
1400 				if (unlikely(skb == NULL))
1401 					err = -ENOBUFS;
1402 				else {
1403 					/* Only the initial fragment
1404 					 * is time stamped.
1405 					 */
1406 					tx_flags = 0;
1407 				}
1408 			}
1409 			if (skb == NULL)
1410 				goto error;
1411 			/*
1412 			 *	Fill in the control structures
1413 			 */
1414 			skb->ip_summed = csummode;
1415 			skb->csum = 0;
1416 			/* reserve for fragmentation and ipsec header */
1417 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1418 				    dst_exthdrlen);
1419 
1420 			if (sk->sk_type == SOCK_DGRAM)
1421 				skb_shinfo(skb)->tx_flags = tx_flags;
1422 
1423 			/*
1424 			 *	Find where to start putting bytes
1425 			 */
1426 			data = skb_put(skb, fraglen);
1427 			skb_set_network_header(skb, exthdrlen);
1428 			data += fragheaderlen;
1429 			skb->transport_header = (skb->network_header +
1430 						 fragheaderlen);
1431 			if (fraggap) {
1432 				skb->csum = skb_copy_and_csum_bits(
1433 					skb_prev, maxfraglen,
1434 					data + transhdrlen, fraggap, 0);
1435 				skb_prev->csum = csum_sub(skb_prev->csum,
1436 							  skb->csum);
1437 				data += fraggap;
1438 				pskb_trim_unique(skb_prev, maxfraglen);
1439 			}
1440 			copy = datalen - transhdrlen - fraggap;
1441 
1442 			if (copy < 0) {
1443 				err = -EINVAL;
1444 				kfree_skb(skb);
1445 				goto error;
1446 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1447 				err = -EFAULT;
1448 				kfree_skb(skb);
1449 				goto error;
1450 			}
1451 
1452 			offset += copy;
1453 			length -= datalen - fraggap;
1454 			transhdrlen = 0;
1455 			exthdrlen = 0;
1456 			dst_exthdrlen = 0;
1457 			csummode = CHECKSUM_NONE;
1458 
1459 			/*
1460 			 * Put the packet on the pending queue
1461 			 */
1462 			__skb_queue_tail(&sk->sk_write_queue, skb);
1463 			continue;
1464 		}
1465 
1466 		if (copy > length)
1467 			copy = length;
1468 
1469 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1470 			unsigned int off;
1471 
1472 			off = skb->len;
1473 			if (getfrag(from, skb_put(skb, copy),
1474 						offset, copy, off, skb) < 0) {
1475 				__skb_trim(skb, off);
1476 				err = -EFAULT;
1477 				goto error;
1478 			}
1479 		} else {
1480 			int i = skb_shinfo(skb)->nr_frags;
1481 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1482 			struct page *page = sk->sk_sndmsg_page;
1483 			int off = sk->sk_sndmsg_off;
1484 			unsigned int left;
1485 
1486 			if (page && (left = PAGE_SIZE - off) > 0) {
1487 				if (copy >= left)
1488 					copy = left;
1489 				if (page != skb_frag_page(frag)) {
1490 					if (i == MAX_SKB_FRAGS) {
1491 						err = -EMSGSIZE;
1492 						goto error;
1493 					}
1494 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1495 					skb_frag_ref(skb, i);
1496 					frag = &skb_shinfo(skb)->frags[i];
1497 				}
1498 			} else if (i < MAX_SKB_FRAGS) {
1499 				if (copy > PAGE_SIZE)
1500 					copy = PAGE_SIZE;
1501 				page = alloc_pages(sk->sk_allocation, 0);
1502 				if (page == NULL) {
1503 					err = -ENOMEM;
1504 					goto error;
1505 				}
1506 				sk->sk_sndmsg_page = page;
1507 				sk->sk_sndmsg_off = 0;
1508 
1509 				skb_fill_page_desc(skb, i, page, 0, 0);
1510 				frag = &skb_shinfo(skb)->frags[i];
1511 			} else {
1512 				err = -EMSGSIZE;
1513 				goto error;
1514 			}
1515 			if (getfrag(from,
1516 				    skb_frag_address(frag) + skb_frag_size(frag),
1517 				    offset, copy, skb->len, skb) < 0) {
1518 				err = -EFAULT;
1519 				goto error;
1520 			}
1521 			sk->sk_sndmsg_off += copy;
1522 			skb_frag_size_add(frag, copy);
1523 			skb->len += copy;
1524 			skb->data_len += copy;
1525 			skb->truesize += copy;
1526 			atomic_add(copy, &sk->sk_wmem_alloc);
1527 		}
1528 		offset += copy;
1529 		length -= copy;
1530 	}
1531 	return 0;
1532 error:
1533 	cork->length -= length;
1534 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1535 	return err;
1536 }
1537 EXPORT_SYMBOL_GPL(ip6_append_data);
1538 
1539 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1540 {
1541 	if (np->cork.opt) {
1542 		kfree(np->cork.opt->dst0opt);
1543 		kfree(np->cork.opt->dst1opt);
1544 		kfree(np->cork.opt->hopopt);
1545 		kfree(np->cork.opt->srcrt);
1546 		kfree(np->cork.opt);
1547 		np->cork.opt = NULL;
1548 	}
1549 
1550 	if (inet->cork.base.dst) {
1551 		dst_release(inet->cork.base.dst);
1552 		inet->cork.base.dst = NULL;
1553 		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1554 	}
1555 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1556 }
1557 
1558 int ip6_push_pending_frames(struct sock *sk)
1559 {
1560 	struct sk_buff *skb, *tmp_skb;
1561 	struct sk_buff **tail_skb;
1562 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1563 	struct inet_sock *inet = inet_sk(sk);
1564 	struct ipv6_pinfo *np = inet6_sk(sk);
1565 	struct net *net = sock_net(sk);
1566 	struct ipv6hdr *hdr;
1567 	struct ipv6_txoptions *opt = np->cork.opt;
1568 	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1569 	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1570 	unsigned char proto = fl6->flowi6_proto;
1571 	int err = 0;
1572 
1573 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1574 		goto out;
1575 	tail_skb = &(skb_shinfo(skb)->frag_list);
1576 
1577 	/* move skb->data to ip header from ext header */
1578 	if (skb->data < skb_network_header(skb))
1579 		__skb_pull(skb, skb_network_offset(skb));
1580 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1581 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1582 		*tail_skb = tmp_skb;
1583 		tail_skb = &(tmp_skb->next);
1584 		skb->len += tmp_skb->len;
1585 		skb->data_len += tmp_skb->len;
1586 		skb->truesize += tmp_skb->truesize;
1587 		tmp_skb->destructor = NULL;
1588 		tmp_skb->sk = NULL;
1589 	}
1590 
1591 	/* Allow local fragmentation. */
1592 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1593 		skb->local_df = 1;
1594 
1595 	*final_dst = fl6->daddr;
1596 	__skb_pull(skb, skb_network_header_len(skb));
1597 	if (opt && opt->opt_flen)
1598 		ipv6_push_frag_opts(skb, opt, &proto);
1599 	if (opt && opt->opt_nflen)
1600 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1601 
1602 	skb_push(skb, sizeof(struct ipv6hdr));
1603 	skb_reset_network_header(skb);
1604 	hdr = ipv6_hdr(skb);
1605 
1606 	*(__be32*)hdr = fl6->flowlabel |
1607 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1608 
1609 	hdr->hop_limit = np->cork.hop_limit;
1610 	hdr->nexthdr = proto;
1611 	hdr->saddr = fl6->saddr;
1612 	hdr->daddr = *final_dst;
1613 
1614 	skb->priority = sk->sk_priority;
1615 	skb->mark = sk->sk_mark;
1616 
1617 	skb_dst_set(skb, dst_clone(&rt->dst));
1618 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1619 	if (proto == IPPROTO_ICMPV6) {
1620 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1621 
1622 		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1623 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1624 	}
1625 
1626 	err = ip6_local_out(skb);
1627 	if (err) {
1628 		if (err > 0)
1629 			err = net_xmit_errno(err);
1630 		if (err)
1631 			goto error;
1632 	}
1633 
1634 out:
1635 	ip6_cork_release(inet, np);
1636 	return err;
1637 error:
1638 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1639 	goto out;
1640 }
1641 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
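
/* Typical corked-send pattern (illustrative sketch modeled on datagram
 * callers; names like 'corkreq' are assumptions, error handling trimmed):
 *
 *	err = ip6_append_data(sk, getfrag, from, length, transhdrlen,
 *			      hlimit, tclass, opt, &fl6, rt,
 *			      msg->msg_flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corkreq)
 *		err = ip6_push_pending_frames(sk);
 */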
1642 
1643 void ip6_flush_pending_frames(struct sock *sk)
1644 {
1645 	struct sk_buff *skb;
1646 
1647 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1648 		if (skb_dst(skb))
1649 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1650 				      IPSTATS_MIB_OUTDISCARDS);
1651 		kfree_skb(skb);
1652 	}
1653 
1654 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1655 }
1656 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1657