/* xref: /linux/net/ipv6/ip6_output.c (revision 2c1ba398ac9da3305815f6ae8e95ae2b9fd3b5ff) */
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

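/*
 * Fill in payload_len and run the LOCAL_OUT netfilter hook. The field
 * is only 16 bits wide, so a length above IPV6_MAXPLEN cannot be
 * represented; it is set to 0 here, matching the RFC 2675 jumbogram
 * convention where the real length lives in a hop-by-hop option.
 */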
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

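/*
 * nf_hook() returns 1 when every LOCAL_OUT hook accepted the packet,
 * which is why the caller below only hands the packet to dst_output()
 * on err == 1. A minimal caller sketch (illustrative only, assuming a
 * route has already been attached and the headers built):
 *
 *	skb_dst_set(skb, dst);
 *	skb_push(skb, sizeof(struct ipv6hdr));
 *	skb_reset_network_header(skb);
 *	... fill in ipv6_hdr(skb), except payload_len ...
 *	err = ip6_local_out(skb);
 */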
int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

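/*
 * Final output step: resolve the neighbour for the dst and hand the
 * packet over. Multicast packets that the sending socket wants looped
 * back (sk_mc_loop), or that a local multicast router must see, are
 * cloned and re-injected through ip6_dev_loopback_xmit() first.
 */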
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	rcu_read_lock();
	neigh = dst_get_neighbour(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

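/*
 * Fragment when the packet exceeds the path MTU and is not GSO (GSO
 * packets are segmented later), or when the route is marked allfrag,
 * i.e. the path MTU dropped below the IPv6 minimum of 1280 and every
 * packet must therefore carry a fragment header (RFC 2460, section 5).
 */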
static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

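	/* First word: version 6 in the top nibble, traffic class in
	 * bits 27-20, the 20-bit flow label in the low bits.
	 */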
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
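
/*
 * Typical use, as a sketch only (not lifted verbatim from any caller):
 * a connection-oriented protocol fills in a flowi6, resolves the route
 * once, then transmits each skb through ip6_xmit():
 *
 *	struct flowi6 fl6 = { .flowi6_proto = IPPROTO_TCP, ... };
 *	struct dst_entry *dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
 *
 *	if (!IS_ERR(dst)) {
 *		skb_dst_set(skb, dst);
 *		err = ip6_xmit(sk, skb, &fl6, inet6_sk(sk)->opt);
 *	}
 */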

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication, but I really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance-critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Pass unicast neighbour discovery messages
			 * destined to the proxied address to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

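/*
 * Forwarding path: check that forwarding is enabled and policy allows
 * it, hand router-alert packets to interested raw sockets, enforce the
 * hop limit, honour proxy NDP, possibly emit a redirect when a packet
 * leaves through the interface it arrived on, enforce the path MTU,
 * then decrement hop_limit and run the NF_INET_FORWARD hook on the way
 * to dst_output().
 */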
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct neighbour *n;
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We do NOT do any processing on RA packets; we push them
	 *	to user level AS IS, without any warranty that the
	 *	application will be able to interpret them. The reason
	 *	is that we cannot do anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything. Defragmentation would
	 *	also be a mistake: RA packets must not be fragmented,
	 *	because there is no guarantee that different fragments
	 *	will follow the same path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects for source-routed frames. We also don't
	   send redirects for frames decapsulated from IPsec.
	 */
	n = dst_get_neighbour(dst);
	if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same;
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

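/*
 * Locate the point where a fragment header must be inserted: after the
 * per-fragment (unfragmentable) headers and before the rest. Hop-by-hop
 * and routing headers always stay in front; a destination options
 * header stays in front while no routing header has been seen yet (one
 * may still follow) or when it carries a home address option, but a
 * destination options header after a routing header begins the
 * fragmentable part. On return, *nexthdr points at the next-header byte
 * that will be overwritten with NEXTHDR_FRAGMENT.
 */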
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

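/*
 * Pick the fragment identification: per-destination via the route's
 * inet_peer when one can be bound, otherwise from a global counter
 * that never yields 0 (ip6_fragment() uses a frag_id of 0 to mean
 * "no id chosen yet").
 */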
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static atomic_t ipv6_fragmentation_id;
	int old, new;

	if (rt) {
		struct inet_peer *peer;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);
		peer = rt->rt6i_peer;
		if (peer) {
			fhdr->identification = htonl(inet_getid(peer, 0));
			return;
		}
	}
	do {
		old = atomic_read(&ipv6_fragmentation_id);
		new = old + 1;
		if (!new)
			new = 1;
	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
	fhdr->identification = htonl(new);
}

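/*
 * Two strategies: the fast path re-uses an existing frag_list (each
 * list member becomes one fragment in place, with headers prepended);
 * the slow path allocates a fresh skb per fragment and copies the data.
 * The geometry must be right for the fast path: every fragment but the
 * last a multiple of 8 bytes, each within the MTU, with headroom for
 * the headers; otherwise we fall through to the slow path.
 */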
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one has gone down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end,
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

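/*
 * Returns true when the cached route can no longer be trusted for
 * fl_addr: it is neither a host route for that address nor confirmed
 * by the cached last-used address (daddr_cache/saddr_cache).
 */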
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the non-connected
	 * case is not very simple. Take into account that we
	 * do not support routing by source, TOS, and
	 * MSG_DONTROUTE.		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is TCP, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour entry
	 * that is in the INCOMPLETE state and the source address
	 * from the flow is marked as OPTIMISTIC, we release the
	 * found dst entry and replace it with the dst entry of
	 * the nexthop router instead.
	 */
	rcu_read_lock();
	n = dst_get_neighbour(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl6->daddr, final_dst);
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl6->daddr, final_dst);
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

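/*
 * UFO path for ip6_append_data(): instead of building one skb per
 * fragment, append everything to a single oversized skb and record the
 * fragment geometry in gso_size so that the device (or the GSO layer)
 * performs the IPv6 fragmentation later.
 */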
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)

{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP large send offload, so build
	 * one single skb containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

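/*
 * Queue data on sk->sk_write_queue, fragment-aware. The first call on
 * an empty queue "corks" the socket: route, flow, options and MTU are
 * captured in the cork so that later calls and the final
 * ip6_push_pending_frames() rebuild consistent headers. Data is packed
 * so that every queued skb except the last can become a fragment whose
 * payload length is a multiple of 8.
 */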
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above. --miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
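	/* Largest packet we may build once the fragment header is in
	 * place: the fragmentable part is rounded down to a multiple of
	 * 8 and room is reserved for the fragment header. E.g. with
	 * mtu = 1500 and fragheaderlen = 40:
	 * ((1500 - 40) & ~7) + 40 - 8 = 1456 + 40 - 8 = 1488.
	 */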

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

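/*
 * The append/push pair is driven by datagram protocols roughly like
 * this (a simplified sketch; real callers also build the flow, options
 * and transport checksum):
 *
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, len, ...);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);
 */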
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl6->daddr);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
1646