xref: /linux/net/ipv6/ip6_output.c (revision d8ce7263e1bc3b6b2b906fec0c5037bc27d21d6a)
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
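/*
 * Fill in the IPv6 payload length and run the LOCAL_OUT netfilter hook.
 * If the payload exceeds IPV6_MAXPLEN (65535), payload_len is set to 0,
 * the value a Jumbo Payload option would require.
 */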
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

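/*
 * Last step before the neighbour layer: loop back multicast copies when
 * the sender requests it, account multicast output, and transmit via
 * the cached neighbour entry.
 */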
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	rcu_read_lock();
	neigh = dst_get_neighbour_noref(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

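/*
 * dst output method for IPv6: drop everything if IPv6 is disabled on
 * the egress device, otherwise run POST_ROUTING (unless the packet was
 * rerouted) and finish via ip6_finish_output().
 */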
int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

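/*
 * Sketch of a typical call (illustrative, not part of this file): a
 * stream protocol fills in fl6 and then does roughly
 *
 *	err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 *
 * Packets over the MTU that are neither GSO nor allowed to fragment
 * locally are bounced back with ICMPV6_PKT_TOOBIG and -EMSGSIZE.
 */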
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication, but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	hdr->saddr = *saddr;
	hdr->daddr = *daddr;

	return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

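/*
 * Returns 1 if the packet must be delivered locally (unicast neighbour
 * discovery aimed at an address we proxy), -1 if it must be dropped
 * (link-local destination), and 0 to forward it normally.
 */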
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reactions involving unicast neighbor discovery
			 * messages destined to the proxied address, pass
			 * them to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

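/*
 * Forward one IPv6 packet: sanity checks, hop-limit handling, redirect
 * generation, path-MTU enforcement, then the FORWARD netfilter hook.
 */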
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT process RA packets; we push them to user level
	 *	AS IS, without any warranty that the application will be
	 *	able to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		/* ptr points at the Router Alert option; bytes 2-3 hold its value */
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed to the point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

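/*
 * Locate the point where a Fragment header must be inserted: after any
 * extension headers that every node on the path has to process
 * (Hop-by-Hop, Routing, and a Destination header that carries a MIPv6
 * home-address option).
 */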
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {
		switch (**nexthdr) {
		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

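/*
 * Choose the fragment identification: per-destination via the inet_peer
 * if the route has (or can bind) one, otherwise from a global counter
 * that skips zero.
 */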
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static atomic_t ipv6_fragmentation_id;
	int old, new;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);
		peer = rt->rt6i_peer;
		if (peer) {
			fhdr->identification = htonl(inet_getid(peer, 0));
			return;
		}
	}
	do {
		old = atomic_read(&ipv6_fragmentation_id);
		new = old + 1;
		if (!new)
			new = 1;
	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
	fhdr->identification = htonl(new);
}

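/*
 * Split a packet that exceeds the path MTU. The fast path reuses an
 * existing frag_list, turning each member into one fragment; the slow
 * path allocates and copies every fragment. @output is invoked for each
 * fragment produced.
 *
 * Worked example (illustrative): with a 1500-byte MTU and a bare
 * 40-byte IPv6 header, the usable payload is 1500 - 40 - 8 = 1452
 * bytes, rounded down to a multiple of 8, so every non-final fragment
 * carries 1448 bytes.
 */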
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->local_df && skb->len > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the end of the
		   packet, align the next fragment start on an eight-byte
		   boundary */
		if (len < left)
			len &= ~7;
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the non-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * or MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is TCP, which does not have this problem),
	 *    so this last trick works only on connected
	 *    sockets.
	 * 2. oif should also be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

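/*
 * Common tail for the dst lookups below: resolve the route, pick a
 * source address when the flow has none, and (with optimistic DAD) fall
 * back to the default router's dst while our source address is still
 * tentative.
 */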
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour entry that
	 * is in the INCOMPLETE state and the src address from the flow
	 * is marked as OPTIMISTIC, we release the found dst entry and
	 * replace it instead with the dst entry of the nexthop router.
	 */
	rcu_read_lock();
	n = dst_get_neighbour_noref(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead.
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

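/*
 * Sketch of a caller (illustrative, not from this file):
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 *
 * The result has already been through xfrm_lookup(), so any IPsec
 * transformation is reflected in the returned dst.
 */
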
/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

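/*
 * UFO path for ip6_append_data(): queue one large skb and let the
 * device segment it. gso_size is rounded down to a multiple of 8
 * because the fragment offset field counts 8-octet units.
 */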
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the
	 * complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (skb == NULL) {
			/* first fragment, reserve header_len */
			*mtu = *mtu - rt->dst.header_len;

		} else {
			/*
			 * This fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = dst_mtu(rt->dst.path);
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

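/*
 * Append data to the socket's cork queue for later transmission by
 * ip6_push_pending_frames(). A datagram sender (illustrative sketch,
 * not from this file) does roughly:
 *
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen,
 *			      sizeof(struct udphdr), hlimit, tclass,
 *			      opt, &fl6, rt, msg->msg_flags, dontfrag);
 *	if (!err && !corkreq)
 *		err = ip6_push_pending_frames(sk);
 */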
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above --miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamping is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

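/*
 * Merge the queued skbs into one packet (extra skbs chained on
 * frag_list), prepend any options and the IPv6 header, then send it
 * through ip6_local_out().
 */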
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);