xref: /linux/net/ipv6/ip6_output.c (revision f49f4ab95c301dbccad0efe85296d908b8ae7ad4)
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
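/*
 * Annotated example (added for exposition): for a 1280-byte packet,
 * __ip6_local_out() below sets payload_len to htons(1280 - 40) = 1240.
 * A payload larger than IPV6_MAXPLEN (65535) is encoded as 0, the value
 * jumbograms use.  nf_hook() returns 1 when the packet may proceed,
 * which is why ip6_local_out() treats a return of 1 as "continue with
 * dst_output()".
 */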
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

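/*
 * Note (added): ip6_finish_output2() is the last step before the
 * neighbour layer.  For multicast destinations that the local node
 * should also receive (sk_mc_loop() enabled and the group is joined on
 * this device, or a multicast router socket wants the packet), a clone
 * is looped back through NF_INET_POST_ROUTING via dev_loopback_xmit();
 * a multicast packet with hop_limit == 0 is counted as OUTDISCARDS and
 * dropped.
 */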
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct rt6_info *rt;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	rt = (struct rt6_info *) dst;
	neigh = rt->n;
	if (neigh)
		return dst_neigh_output(dst, neigh, skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

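/*
 * Example (added): a 3000-byte non-GSO skb on a route with a 1500-byte
 * MTU takes the ip6_fragment() path below; GSO skbs are left intact
 * and segmented later.  dst_allfrag() is set when the path MTU fell
 * below IPV6_MIN_MTU (1280), in which case every packet is sent with a
 * fragment header.
 */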
static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now),
		 * MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
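	/*
	 * Worked example (added): the first 32-bit word packs version,
	 * traffic class and flow label.  Assuming fl6->flowlabel holds
	 * only the 20-bit label, tclass 0x28 and flow label 0x12345
	 * give htonl(0x60000000 | 0x28 << 20) | htonl(0x12345), i.e.
	 * 0x62812345 in network byte order.
	 */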

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	hdr->saddr = *saddr;
	hdr->daddr = *daddr;

	return 0;
}

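/*
 * Note (added): ip6_call_ra_chain() delivers a packet carrying a
 * Router Alert option to every raw socket registered for that RA
 * value (and bound to the right device, if any).  Each listener but
 * the last gets a clone; the last one consumes the original, in which
 * case the function returns 1 and forwarding stops.
 */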
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* A unicast neighbour discovery message destined
			 * to the proxied address is passed to the input
			 * function so it can be reacted to.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

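/*
 * Overview (added): ip6_forward() is the router path.  In order, it
 * checks that forwarding is enabled and the skb is sane (no LRO, xfrm
 * forward policy, PACKET_HOST), delivers Router Alert packets to
 * interested sockets, enforces the hop limit (ICMPv6 Time Exceeded),
 * honours NDP proxying, validates the source address, optionally emits
 * a redirect, enforces the path MTU (ICMPv6 Packet Too Big), and
 * finally decrements hop_limit and runs the NF_INET_FORWARD hook.
 */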
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on RA packets; we push them
	 *	to user level AS IS, without any warranty that the
	 *	application will be able to interpret them. The reason
	 *	is that we cannot do anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it. Defragmentation
	 *	would also be a mistake: RA packets cannot be
	 *	fragmented, because there is no guarantee that
	 *	different fragments will travel along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force the OUTPUT device to be used for the source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	 * cannot send redirects for source routed frames.
	 * We don't send redirects for frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

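	/*
	 * Example (added): a route reporting an MTU of 1200 is clamped
	 * to IPV6_MIN_MTU (1280) below, so a 1400-byte packet triggers
	 * an ICMPv6 Packet Too Big advertising 1280.  IPv6 routers
	 * never fragment on behalf of the source.
	 */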
	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
		/* Again, force the OUTPUT device to be used for the
		 * source address
		 */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {
		switch (**nexthdr) {
		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

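/*
 * Note (added): when the destination has an inet_peer entry, the
 * fragment ID comes from the per-peer sequence via inet_getid(),
 * keeping IDs increasing per destination.  The fallback counter below
 * deliberately skips 0, which lets a frag_id of 0 usually serve as
 * "not yet chosen" in ip6_fragment()'s slow path.
 */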
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static atomic_t ipv6_fragmentation_id;
	int old, new;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer;
		struct net *net;

		net = dev_net(rt->dst.dev);
		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
		if (peer) {
			fhdr->identification = htonl(inet_getid(peer, 0));
			inet_putpeer(peer);
			return;
		}
	}
	do {
		old = atomic_read(&ipv6_fragmentation_id);
		new = old + 1;
		if (!new)
			new = 1;
	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
	fhdr->identification = htonl(new);
}

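/*
 * Worked example (added): with a 1500-byte link MTU and a 40-byte
 * unfragmentable part (hlen), the per-fragment payload budget below
 * is 1500 - 40 - 8 = 1452 bytes, rounded down to a multiple of 8,
 * i.e. 1448, so each full fragment is 1496 bytes on the wire.  The
 * fast path reuses an existing frag_list when every fragment already
 * has the right geometry; otherwise the slow path copies the payload
 * into freshly allocated fragments.
 */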
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (unlikely(!skb->local_df && skb->len > mtu) ||
		     (IP6CB(skb)->frag_max_size &&
		      IP6CB(skb)->frag_max_size > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one has gone down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		 * then align the next start on an eight byte boundary
		 */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

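/*
 * Note (added): ip6_rt_check() returns nonzero when a cached route
 * must NOT be reused for this flow: for a host route (plen == 128)
 * the flow's address must match the route key exactly; otherwise the
 * socket's last-used address cache (daddr_cache/saddr_cache) is
 * consulted, which only works for connected sockets.
 */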
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not connected case
	 * is not very simple. Take into account that we do not
	 * support routing by source, TOS, and MSG_DONTROUTE
	 *						--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may check
	 *    its validity using the saved pointer to the
	 *    last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here, if the dst entry we've looked up has a neighbour
	 * entry that is in the INCOMPLETE state and the src address
	 * from the flow is marked as OPTIMISTIC, we release the
	 * found dst entry and replace it instead with the dst entry
	 * of the nexthop router.
	 */
	rt = (struct rt6_info *) *dst;
	n = rt->n;
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

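/*
 * Usage sketch (added, illustrative only): a caller typically fills a
 * flowi6 and checks for a pointer-encoded error:
 *
 *	struct flowi6 fl6 = { .flowi6_proto = IPPROTO_UDP };
 *	struct dst_entry *dst;
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */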
/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

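/*
 * Worked example (added): with an mtu of 1500 and a plain 40-byte
 * IPv6 header (fragheaderlen == 40), ip6_ufo_append_data() below sets
 * gso_size to (1500 - 40 - 8) & ~7 = 1448, so hardware UFO emits
 * fragments whose payload is a multiple of 8 bytes, as the fragment
 * header requires.
 */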
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the
	 * complete udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (skb == NULL) {
			/* first fragment, reserve header_len */
			*mtu = *mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first, the headers
			 * space is regarded as data space.
			 */
			*mtu = dst_mtu(rt->dst.path);
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

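/*
 * Overview (added): ip6_append_data() implements corking for datagram
 * sockets (UDP, raw, ICMPv6).  The first call on an empty write queue
 * sets up the cork state: the tx options are duplicated, the route is
 * held and the effective mtu is recorded.  Subsequent calls (e.g.
 * under MSG_MORE) reuse that state and only append payload.
 * ip6_push_pending_frames() later merges the queue into one packet
 * and sends it.
 */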
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamping is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

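	/*
	 * Worked example (added): with mtu 1500 and fragheaderlen 40,
	 * maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488, so each
	 * queued skb is filled up to 1488 bytes before a new one is
	 * allocated; the loop below first tops up the tail skb, then
	 * falls into alloc_new_skb when it is full.
	 */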
	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);

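/*
 * Sketch of the corking workflow (added, illustrative only; "corked"
 * stands for whatever per-call corking state the caller tracks):
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *			      hlimit, tclass, opt, &fl6, rt,
 *			      msg->msg_flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);
 *
 * This mirrors how the datagram protocols drive the helpers below.
 */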
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1668