xref: /linux/net/ipv6/ip6_output.c (revision d229807f669ba3dea9f64467ee965051c4366aed)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	airthmetics in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 	int len;
64 
65 	len = skb->len - sizeof(struct ipv6hdr);
66 	if (len > IPV6_MAXPLEN)
67 		len = 0;
68 	ipv6_hdr(skb)->payload_len = htons(len);
69 
70 	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 		       skb_dst(skb)->dev, dst_output);
72 }
73 
/*
 * Send a locally generated packet: run __ip6_local_out() and, when it
 * returns 1, hand the packet to dst_output(). Any other value from
 * __ip6_local_out() is returned to the caller unchanged.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int rc = __ip6_local_out(skb);

	if (unlikely(rc != 1))
		return rc;

	return dst_output(skb);
}
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85 
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89 	skb_reset_mac_header(newskb);
90 	__skb_pull(newskb, skb_network_offset(newskb));
91 	newskb->pkt_type = PACKET_LOOPBACK;
92 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 	WARN_ON(!skb_dst(newskb));
94 
95 	netif_rx_ni(newskb);
96 	return 0;
97 }
98 
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101 	struct dst_entry *dst = skb_dst(skb);
102 	struct net_device *dev = dst->dev;
103 	struct neighbour *neigh;
104 
105 	skb->protocol = htons(ETH_P_IPV6);
106 	skb->dev = dev;
107 
108 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110 
111 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 		    ((mroute6_socket(dev_net(dev), skb) &&
113 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 					 &ipv6_hdr(skb)->saddr))) {
116 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117 
118 			/* Do not check for IFF_ALLMULTI; multicast routing
119 			   is not supported in any case.
120 			 */
121 			if (newskb)
122 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 					newskb, NULL, newskb->dev,
124 					ip6_dev_loopback_xmit);
125 
126 			if (ipv6_hdr(skb)->hop_limit == 0) {
127 				IP6_INC_STATS(dev_net(dev), idev,
128 					      IPSTATS_MIB_OUTDISCARDS);
129 				kfree_skb(skb);
130 				return 0;
131 			}
132 		}
133 
134 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135 				skb->len);
136 	}
137 
138 	rcu_read_lock();
139 	neigh = dst_get_neighbour(dst);
140 	if (neigh) {
141 		int res = neigh_output(neigh, skb);
142 
143 		rcu_read_unlock();
144 		return res;
145 	}
146 	rcu_read_unlock();
147 	IP6_INC_STATS_BH(dev_net(dst->dev),
148 			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
149 	kfree_skb(skb);
150 	return -EINVAL;
151 }
152 
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156 	    dst_allfrag(skb_dst(skb)))
157 		return ip6_fragment(skb, ip6_finish_output2);
158 	else
159 		return ip6_finish_output2(skb);
160 }
161 
162 int ip6_output(struct sk_buff *skb)
163 {
164 	struct net_device *dev = skb_dst(skb)->dev;
165 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166 	if (unlikely(idev->cnf.disable_ipv6)) {
167 		IP6_INC_STATS(dev_net(dev), idev,
168 			      IPSTATS_MIB_OUTDISCARDS);
169 		kfree_skb(skb);
170 		return 0;
171 	}
172 
173 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174 			    ip6_finish_output,
175 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177 
/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 *
 *	@sk:	sending socket; supplies traffic class, hop limit,
 *		priority and mark
 *	@skb:	packet whose data starts at the transport header and
 *		which already has a route attached
 *	@fl6:	flow providing addresses, flow label and protocol
 *	@opt:	optional extension headers to insert, may be NULL
 *
 *	Pushes the extension headers from @opt (if any) and the IPv6
 *	header in front of the existing data, then passes the packet to
 *	the NF_INET_LOCAL_OUT hook and dst_output().
 *
 *	Returns the NF_HOOK() result on transmission, -ENOBUFS when the
 *	headroom for the extension headers cannot be allocated, or
 *	-EMSGSIZE when the packet exceeds the route MTU and may not be
 *	sent anyway.  @skb is consumed on every path.
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		/* The extension headers count towards payload_len. */
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			/* Not enough room in front: move to a larger buffer. */
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			/* Charge the replacement buffer to the socket. */
			skb_set_owner_w(skb, sk);
		}
		/*
		 * Headers are pushed back to front: the fragmentable part
		 * first, then the non-fragmentable part, which may also
		 * replace first_hop.
		 */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	/* A negative hop limit means "not set": use the route's value. */
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* First 32 bits: version 6, traffic class, flow label. */
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	/* Send if it fits, or if local_df or GSO permit an oversized skb. */
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	/* Too big: report Packet Too Big for this packet and drop it. */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
266 
267 EXPORT_SYMBOL(ip6_xmit);
268 
/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is for us performance critical)
 */
275 
276 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
277 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
278 	       int proto, int len)
279 {
280 	struct ipv6_pinfo *np = inet6_sk(sk);
281 	struct ipv6hdr *hdr;
282 
283 	skb->protocol = htons(ETH_P_IPV6);
284 	skb->dev = dev;
285 
286 	skb_reset_network_header(skb);
287 	skb_put(skb, sizeof(struct ipv6hdr));
288 	hdr = ipv6_hdr(skb);
289 
290 	*(__be32*)hdr = htonl(0x60000000);
291 
292 	hdr->payload_len = htons(len);
293 	hdr->nexthdr = proto;
294 	hdr->hop_limit = np->hop_limit;
295 
296 	ipv6_addr_copy(&hdr->saddr, saddr);
297 	ipv6_addr_copy(&hdr->daddr, daddr);
298 
299 	return 0;
300 }
301 
302 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
303 {
304 	struct ip6_ra_chain *ra;
305 	struct sock *last = NULL;
306 
307 	read_lock(&ip6_ra_lock);
308 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
309 		struct sock *sk = ra->sk;
310 		if (sk && ra->sel == sel &&
311 		    (!sk->sk_bound_dev_if ||
312 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
313 			if (last) {
314 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
315 				if (skb2)
316 					rawv6_rcv(last, skb2);
317 			}
318 			last = sk;
319 		}
320 	}
321 
322 	if (last) {
323 		rawv6_rcv(last, skb);
324 		read_unlock(&ip6_ra_lock);
325 		return 1;
326 	}
327 	read_unlock(&ip6_ra_lock);
328 	return 0;
329 }
330 
331 static int ip6_forward_proxy_check(struct sk_buff *skb)
332 {
333 	struct ipv6hdr *hdr = ipv6_hdr(skb);
334 	u8 nexthdr = hdr->nexthdr;
335 	int offset;
336 
337 	if (ipv6_ext_hdr(nexthdr)) {
338 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
339 		if (offset < 0)
340 			return 0;
341 	} else
342 		offset = sizeof(struct ipv6hdr);
343 
344 	if (nexthdr == IPPROTO_ICMPV6) {
345 		struct icmp6hdr *icmp6;
346 
347 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
348 					 offset + 1 - skb->data)))
349 			return 0;
350 
351 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
352 
353 		switch (icmp6->icmp6_type) {
354 		case NDISC_ROUTER_SOLICITATION:
355 		case NDISC_ROUTER_ADVERTISEMENT:
356 		case NDISC_NEIGHBOUR_SOLICITATION:
357 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
358 		case NDISC_REDIRECT:
359 			/* For reaction involving unicast neighbor discovery
360 			 * message destined to the proxied address, pass it to
361 			 * input function.
362 			 */
363 			return 1;
364 		default:
365 			break;
366 		}
367 	}
368 
369 	/*
370 	 * The proxying router can't forward traffic sent to a link-local
371 	 * address, so signal the sender and discard the packet. This
372 	 * behavior is clarified by the MIPv6 specification.
373 	 */
374 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
375 		dst_link_failure(skb);
376 		return -1;
377 	}
378 
379 	return 0;
380 }
381 
/* Continuation run after the NF_INET_FORWARD hook: transmit via the route. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	int rc = dst_output(skb);

	return rc;
}
386 
387 int ip6_forward(struct sk_buff *skb)
388 {
389 	struct dst_entry *dst = skb_dst(skb);
390 	struct ipv6hdr *hdr = ipv6_hdr(skb);
391 	struct inet6_skb_parm *opt = IP6CB(skb);
392 	struct net *net = dev_net(dst->dev);
393 	struct neighbour *n;
394 	u32 mtu;
395 
396 	if (net->ipv6.devconf_all->forwarding == 0)
397 		goto error;
398 
399 	if (skb_warn_if_lro(skb))
400 		goto drop;
401 
402 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
403 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
404 		goto drop;
405 	}
406 
407 	if (skb->pkt_type != PACKET_HOST)
408 		goto drop;
409 
410 	skb_forward_csum(skb);
411 
412 	/*
413 	 *	We DO NOT make any processing on
414 	 *	RA packets, pushing them to user level AS IS
415 	 *	without ane WARRANTY that application will be able
416 	 *	to interpret them. The reason is that we
417 	 *	cannot make anything clever here.
418 	 *
419 	 *	We are not end-node, so that if packet contains
420 	 *	AH/ESP, we cannot make anything.
421 	 *	Defragmentation also would be mistake, RA packets
422 	 *	cannot be fragmented, because there is no warranty
423 	 *	that different fragments will go along one path. --ANK
424 	 */
425 	if (opt->ra) {
426 		u8 *ptr = skb_network_header(skb) + opt->ra;
427 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
428 			return 0;
429 	}
430 
431 	/*
432 	 *	check and decrement ttl
433 	 */
434 	if (hdr->hop_limit <= 1) {
435 		/* Force OUTPUT device used as source address */
436 		skb->dev = dst->dev;
437 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
438 		IP6_INC_STATS_BH(net,
439 				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
440 
441 		kfree_skb(skb);
442 		return -ETIMEDOUT;
443 	}
444 
445 	/* XXX: idev->cnf.proxy_ndp? */
446 	if (net->ipv6.devconf_all->proxy_ndp &&
447 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
448 		int proxied = ip6_forward_proxy_check(skb);
449 		if (proxied > 0)
450 			return ip6_input(skb);
451 		else if (proxied < 0) {
452 			IP6_INC_STATS(net, ip6_dst_idev(dst),
453 				      IPSTATS_MIB_INDISCARDS);
454 			goto drop;
455 		}
456 	}
457 
458 	if (!xfrm6_route_forward(skb)) {
459 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
460 		goto drop;
461 	}
462 	dst = skb_dst(skb);
463 
464 	/* IPv6 specs say nothing about it, but it is clear that we cannot
465 	   send redirects to source routed frames.
466 	   We don't send redirects to frames decapsulated from IPsec.
467 	 */
468 	n = dst_get_neighbour(dst);
469 	if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
470 		struct in6_addr *target = NULL;
471 		struct rt6_info *rt;
472 
473 		/*
474 		 *	incoming and outgoing devices are the same
475 		 *	send a redirect.
476 		 */
477 
478 		rt = (struct rt6_info *) dst;
479 		if ((rt->rt6i_flags & RTF_GATEWAY))
480 			target = (struct in6_addr*)&n->primary_key;
481 		else
482 			target = &hdr->daddr;
483 
484 		if (!rt->rt6i_peer)
485 			rt6_bind_peer(rt, 1);
486 
487 		/* Limit redirects both by destination (here)
488 		   and by source (inside ndisc_send_redirect)
489 		 */
490 		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
491 			ndisc_send_redirect(skb, n, target);
492 	} else {
493 		int addrtype = ipv6_addr_type(&hdr->saddr);
494 
495 		/* This check is security critical. */
496 		if (addrtype == IPV6_ADDR_ANY ||
497 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
498 			goto error;
499 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
500 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
501 				    ICMPV6_NOT_NEIGHBOUR, 0);
502 			goto error;
503 		}
504 	}
505 
506 	mtu = dst_mtu(dst);
507 	if (mtu < IPV6_MIN_MTU)
508 		mtu = IPV6_MIN_MTU;
509 
510 	if (skb->len > mtu && !skb_is_gso(skb)) {
511 		/* Again, force OUTPUT device used as source address */
512 		skb->dev = dst->dev;
513 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
514 		IP6_INC_STATS_BH(net,
515 				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
516 		IP6_INC_STATS_BH(net,
517 				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
518 		kfree_skb(skb);
519 		return -EMSGSIZE;
520 	}
521 
522 	if (skb_cow(skb, dst->dev->hard_header_len)) {
523 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
524 		goto drop;
525 	}
526 
527 	hdr = ipv6_hdr(skb);
528 
529 	/* Mangling hops number delayed to point after skb COW */
530 
531 	hdr->hop_limit--;
532 
533 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
534 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
535 		       ip6_forward_finish);
536 
537 error:
538 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
539 drop:
540 	kfree_skb(skb);
541 	return -EINVAL;
542 }
543 
544 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
545 {
546 	to->pkt_type = from->pkt_type;
547 	to->priority = from->priority;
548 	to->protocol = from->protocol;
549 	skb_dst_drop(to);
550 	skb_dst_set(to, dst_clone(skb_dst(from)));
551 	to->dev = from->dev;
552 	to->mark = from->mark;
553 
554 #ifdef CONFIG_NET_SCHED
555 	to->tc_index = from->tc_index;
556 #endif
557 	nf_copy(to, from);
558 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
559     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
560 	to->nf_trace = from->nf_trace;
561 #endif
562 	skb_copy_secmark(to, from);
563 }
564 
565 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
566 {
567 	u16 offset = sizeof(struct ipv6hdr);
568 	struct ipv6_opt_hdr *exthdr =
569 				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
570 	unsigned int packet_len = skb->tail - skb->network_header;
571 	int found_rhdr = 0;
572 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
573 
574 	while (offset + 1 <= packet_len) {
575 
576 		switch (**nexthdr) {
577 
578 		case NEXTHDR_HOP:
579 			break;
580 		case NEXTHDR_ROUTING:
581 			found_rhdr = 1;
582 			break;
583 		case NEXTHDR_DEST:
584 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
585 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
586 				break;
587 #endif
588 			if (found_rhdr)
589 				return offset;
590 			break;
591 		default :
592 			return offset;
593 		}
594 
595 		offset += ipv6_optlen(exthdr);
596 		*nexthdr = &exthdr->nexthdr;
597 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
598 						 offset);
599 	}
600 
601 	return offset;
602 }
603 
604 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
605 {
606 	static atomic_t ipv6_fragmentation_id;
607 	int old, new;
608 
609 	if (rt) {
610 		struct inet_peer *peer;
611 
612 		if (!rt->rt6i_peer)
613 			rt6_bind_peer(rt, 1);
614 		peer = rt->rt6i_peer;
615 		if (peer) {
616 			fhdr->identification = htonl(inet_getid(peer, 0));
617 			return;
618 		}
619 	}
620 	do {
621 		old = atomic_read(&ipv6_fragmentation_id);
622 		new = old + 1;
623 		if (!new)
624 			new = 1;
625 	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
626 	fhdr->identification = htonl(new);
627 }
628 
629 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
630 {
631 	struct sk_buff *frag;
632 	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
633 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
634 	struct ipv6hdr *tmp_hdr;
635 	struct frag_hdr *fh;
636 	unsigned int mtu, hlen, left, len;
637 	__be32 frag_id = 0;
638 	int ptr, offset = 0, err=0;
639 	u8 *prevhdr, nexthdr = 0;
640 	struct net *net = dev_net(skb_dst(skb)->dev);
641 
642 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
643 	nexthdr = *prevhdr;
644 
645 	mtu = ip6_skb_dst_mtu(skb);
646 
647 	/* We must not fragment if the socket is set to force MTU discovery
648 	 * or if the skb it not generated by a local socket.
649 	 */
650 	if (!skb->local_df && skb->len > mtu) {
651 		skb->dev = skb_dst(skb)->dev;
652 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
653 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
654 			      IPSTATS_MIB_FRAGFAILS);
655 		kfree_skb(skb);
656 		return -EMSGSIZE;
657 	}
658 
659 	if (np && np->frag_size < mtu) {
660 		if (np->frag_size)
661 			mtu = np->frag_size;
662 	}
663 	mtu -= hlen + sizeof(struct frag_hdr);
664 
665 	if (skb_has_frag_list(skb)) {
666 		int first_len = skb_pagelen(skb);
667 		struct sk_buff *frag2;
668 
669 		if (first_len - hlen > mtu ||
670 		    ((first_len - hlen) & 7) ||
671 		    skb_cloned(skb))
672 			goto slow_path;
673 
674 		skb_walk_frags(skb, frag) {
675 			/* Correct geometry. */
676 			if (frag->len > mtu ||
677 			    ((frag->len & 7) && frag->next) ||
678 			    skb_headroom(frag) < hlen)
679 				goto slow_path_clean;
680 
681 			/* Partially cloned skb? */
682 			if (skb_shared(frag))
683 				goto slow_path_clean;
684 
685 			BUG_ON(frag->sk);
686 			if (skb->sk) {
687 				frag->sk = skb->sk;
688 				frag->destructor = sock_wfree;
689 			}
690 			skb->truesize -= frag->truesize;
691 		}
692 
693 		err = 0;
694 		offset = 0;
695 		frag = skb_shinfo(skb)->frag_list;
696 		skb_frag_list_init(skb);
697 		/* BUILD HEADER */
698 
699 		*prevhdr = NEXTHDR_FRAGMENT;
700 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
701 		if (!tmp_hdr) {
702 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
703 				      IPSTATS_MIB_FRAGFAILS);
704 			return -ENOMEM;
705 		}
706 
707 		__skb_pull(skb, hlen);
708 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
709 		__skb_push(skb, hlen);
710 		skb_reset_network_header(skb);
711 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
712 
713 		ipv6_select_ident(fh, rt);
714 		fh->nexthdr = nexthdr;
715 		fh->reserved = 0;
716 		fh->frag_off = htons(IP6_MF);
717 		frag_id = fh->identification;
718 
719 		first_len = skb_pagelen(skb);
720 		skb->data_len = first_len - skb_headlen(skb);
721 		skb->len = first_len;
722 		ipv6_hdr(skb)->payload_len = htons(first_len -
723 						   sizeof(struct ipv6hdr));
724 
725 		dst_hold(&rt->dst);
726 
727 		for (;;) {
728 			/* Prepare header of the next frame,
729 			 * before previous one went down. */
730 			if (frag) {
731 				frag->ip_summed = CHECKSUM_NONE;
732 				skb_reset_transport_header(frag);
733 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
734 				__skb_push(frag, hlen);
735 				skb_reset_network_header(frag);
736 				memcpy(skb_network_header(frag), tmp_hdr,
737 				       hlen);
738 				offset += skb->len - hlen - sizeof(struct frag_hdr);
739 				fh->nexthdr = nexthdr;
740 				fh->reserved = 0;
741 				fh->frag_off = htons(offset);
742 				if (frag->next != NULL)
743 					fh->frag_off |= htons(IP6_MF);
744 				fh->identification = frag_id;
745 				ipv6_hdr(frag)->payload_len =
746 						htons(frag->len -
747 						      sizeof(struct ipv6hdr));
748 				ip6_copy_metadata(frag, skb);
749 			}
750 
751 			err = output(skb);
752 			if(!err)
753 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
754 					      IPSTATS_MIB_FRAGCREATES);
755 
756 			if (err || !frag)
757 				break;
758 
759 			skb = frag;
760 			frag = skb->next;
761 			skb->next = NULL;
762 		}
763 
764 		kfree(tmp_hdr);
765 
766 		if (err == 0) {
767 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
768 				      IPSTATS_MIB_FRAGOKS);
769 			dst_release(&rt->dst);
770 			return 0;
771 		}
772 
773 		while (frag) {
774 			skb = frag->next;
775 			kfree_skb(frag);
776 			frag = skb;
777 		}
778 
779 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
780 			      IPSTATS_MIB_FRAGFAILS);
781 		dst_release(&rt->dst);
782 		return err;
783 
784 slow_path_clean:
785 		skb_walk_frags(skb, frag2) {
786 			if (frag2 == frag)
787 				break;
788 			frag2->sk = NULL;
789 			frag2->destructor = NULL;
790 			skb->truesize += frag2->truesize;
791 		}
792 	}
793 
794 slow_path:
795 	left = skb->len - hlen;		/* Space per frame */
796 	ptr = hlen;			/* Where to start from */
797 
798 	/*
799 	 *	Fragment the datagram.
800 	 */
801 
802 	*prevhdr = NEXTHDR_FRAGMENT;
803 
804 	/*
805 	 *	Keep copying data until we run out.
806 	 */
807 	while(left > 0)	{
808 		len = left;
809 		/* IF: it doesn't fit, use 'mtu' - the data space left */
810 		if (len > mtu)
811 			len = mtu;
812 		/* IF: we are not sending up to and including the packet end
813 		   then align the next start on an eight byte boundary */
814 		if (len < left)	{
815 			len &= ~7;
816 		}
817 		/*
818 		 *	Allocate buffer.
819 		 */
820 
821 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
822 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
823 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
824 				      IPSTATS_MIB_FRAGFAILS);
825 			err = -ENOMEM;
826 			goto fail;
827 		}
828 
829 		/*
830 		 *	Set up data on packet
831 		 */
832 
833 		ip6_copy_metadata(frag, skb);
834 		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
835 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
836 		skb_reset_network_header(frag);
837 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
838 		frag->transport_header = (frag->network_header + hlen +
839 					  sizeof(struct frag_hdr));
840 
841 		/*
842 		 *	Charge the memory for the fragment to any owner
843 		 *	it might possess
844 		 */
845 		if (skb->sk)
846 			skb_set_owner_w(frag, skb->sk);
847 
848 		/*
849 		 *	Copy the packet header into the new buffer.
850 		 */
851 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
852 
853 		/*
854 		 *	Build fragment header.
855 		 */
856 		fh->nexthdr = nexthdr;
857 		fh->reserved = 0;
858 		if (!frag_id) {
859 			ipv6_select_ident(fh, rt);
860 			frag_id = fh->identification;
861 		} else
862 			fh->identification = frag_id;
863 
864 		/*
865 		 *	Copy a block of the IP datagram.
866 		 */
867 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
868 			BUG();
869 		left -= len;
870 
871 		fh->frag_off = htons(offset);
872 		if (left > 0)
873 			fh->frag_off |= htons(IP6_MF);
874 		ipv6_hdr(frag)->payload_len = htons(frag->len -
875 						    sizeof(struct ipv6hdr));
876 
877 		ptr += len;
878 		offset += len;
879 
880 		/*
881 		 *	Put this fragment into the sending queue.
882 		 */
883 		err = output(frag);
884 		if (err)
885 			goto fail;
886 
887 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888 			      IPSTATS_MIB_FRAGCREATES);
889 	}
890 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
891 		      IPSTATS_MIB_FRAGOKS);
892 	kfree_skb(skb);
893 	return err;
894 
895 fail:
896 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
897 		      IPSTATS_MIB_FRAGFAILS);
898 	kfree_skb(skb);
899 	return err;
900 }
901 
902 static inline int ip6_rt_check(const struct rt6key *rt_key,
903 			       const struct in6_addr *fl_addr,
904 			       const struct in6_addr *addr_cache)
905 {
906 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
907 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
908 }
909 
910 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
911 					  struct dst_entry *dst,
912 					  const struct flowi6 *fl6)
913 {
914 	struct ipv6_pinfo *np = inet6_sk(sk);
915 	struct rt6_info *rt = (struct rt6_info *)dst;
916 
917 	if (!dst)
918 		goto out;
919 
920 	/* Yes, checking route validity in not connected
921 	 * case is not very simple. Take into account,
922 	 * that we do not support routing by source, TOS,
923 	 * and MSG_DONTROUTE 		--ANK (980726)
924 	 *
925 	 * 1. ip6_rt_check(): If route was host route,
926 	 *    check that cached destination is current.
927 	 *    If it is network route, we still may
928 	 *    check its validity using saved pointer
929 	 *    to the last used address: daddr_cache.
930 	 *    We do not want to save whole address now,
931 	 *    (because main consumer of this service
932 	 *    is tcp, which has not this problem),
933 	 *    so that the last trick works only on connected
934 	 *    sockets.
935 	 * 2. oif also should be the same.
936 	 */
937 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
938 #ifdef CONFIG_IPV6_SUBTREES
939 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
940 #endif
941 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
942 		dst_release(dst);
943 		dst = NULL;
944 	}
945 
946 out:
947 	return dst;
948 }
949 
950 static int ip6_dst_lookup_tail(struct sock *sk,
951 			       struct dst_entry **dst, struct flowi6 *fl6)
952 {
953 	struct net *net = sock_net(sk);
954 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
955 	struct neighbour *n;
956 #endif
957 	int err;
958 
959 	if (*dst == NULL)
960 		*dst = ip6_route_output(net, sk, fl6);
961 
962 	if ((err = (*dst)->error))
963 		goto out_err_release;
964 
965 	if (ipv6_addr_any(&fl6->saddr)) {
966 		struct rt6_info *rt = (struct rt6_info *) *dst;
967 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
968 					  sk ? inet6_sk(sk)->srcprefs : 0,
969 					  &fl6->saddr);
970 		if (err)
971 			goto out_err_release;
972 	}
973 
974 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
975 	/*
976 	 * Here if the dst entry we've looked up
977 	 * has a neighbour entry that is in the INCOMPLETE
978 	 * state and the src address from the flow is
979 	 * marked as OPTIMISTIC, we release the found
980 	 * dst entry and replace it instead with the
981 	 * dst entry of the nexthop router
982 	 */
983 	rcu_read_lock();
984 	n = dst_get_neighbour(*dst);
985 	if (n && !(n->nud_state & NUD_VALID)) {
986 		struct inet6_ifaddr *ifp;
987 		struct flowi6 fl_gw6;
988 		int redirect;
989 
990 		rcu_read_unlock();
991 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
992 				      (*dst)->dev, 1);
993 
994 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
995 		if (ifp)
996 			in6_ifa_put(ifp);
997 
998 		if (redirect) {
999 			/*
1000 			 * We need to get the dst entry for the
1001 			 * default router instead
1002 			 */
1003 			dst_release(*dst);
1004 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1005 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1006 			*dst = ip6_route_output(net, sk, &fl_gw6);
1007 			if ((err = (*dst)->error))
1008 				goto out_err_release;
1009 		}
1010 	} else {
1011 		rcu_read_unlock();
1012 	}
1013 #endif
1014 
1015 	return 0;
1016 
1017 out_err_release:
1018 	if (err == -ENETUNREACH)
1019 		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1020 	dst_release(*dst);
1021 	*dst = NULL;
1022 	return err;
1023 }
1024 
1025 /**
1026  *	ip6_dst_lookup - perform route lookup on flow
1027  *	@sk: socket which provides route info
1028  *	@dst: pointer to dst_entry * for result
1029  *	@fl6: flow to lookup
1030  *
1031  *	This function performs a route lookup on the given flow.
1032  *
1033  *	It returns zero on success, or a standard errno code on error.
1034  */
1035 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1036 {
1037 	*dst = NULL;
1038 	return ip6_dst_lookup_tail(sk, dst, fl6);
1039 }
1040 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1041 
1042 /**
1043  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1044  *	@sk: socket which provides route info
1045  *	@fl6: flow to lookup
1046  *	@final_dst: final destination address for ipsec lookup
1047  *	@can_sleep: we are in a sleepable context
1048  *
1049  *	This function performs a route lookup on the given flow.
1050  *
1051  *	It returns a valid dst pointer on success, or a pointer encoded
1052  *	error code.
1053  */
1054 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1055 				      const struct in6_addr *final_dst,
1056 				      bool can_sleep)
1057 {
1058 	struct dst_entry *dst = NULL;
1059 	int err;
1060 
1061 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1062 	if (err)
1063 		return ERR_PTR(err);
1064 	if (final_dst)
1065 		ipv6_addr_copy(&fl6->daddr, final_dst);
1066 	if (can_sleep)
1067 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1068 
1069 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1070 }
1071 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1072 
1073 /**
1074  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1075  *	@sk: socket which provides the dst cache and route info
1076  *	@fl6: flow to lookup
1077  *	@final_dst: final destination address for ipsec lookup
1078  *	@can_sleep: we are in a sleepable context
1079  *
1080  *	This function performs a route lookup on the given flow with the
1081  *	possibility of using the cached route in the socket if it is valid.
1082  *	It will take the socket dst lock when operating on the dst cache.
1083  *	As a result, this function can only be used in process context.
1084  *
1085  *	It returns a valid dst pointer on success, or a pointer encoded
1086  *	error code.
1087  */
1088 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1089 					 const struct in6_addr *final_dst,
1090 					 bool can_sleep)
1091 {
1092 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1093 	int err;
1094 
1095 	dst = ip6_sk_dst_check(sk, dst, fl6);
1096 
1097 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1098 	if (err)
1099 		return ERR_PTR(err);
1100 	if (final_dst)
1101 		ipv6_addr_copy(&fl6->daddr, final_dst);
1102 	if (can_sleep)
1103 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1104 
1105 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1106 }
1107 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1108 
1109 static inline int ip6_ufo_append_data(struct sock *sk,
1110 			int getfrag(void *from, char *to, int offset, int len,
1111 			int odd, struct sk_buff *skb),
1112 			void *from, int length, int hh_len, int fragheaderlen,
1113 			int transhdrlen, int mtu,unsigned int flags,
1114 			struct rt6_info *rt)
1115 
1116 {
1117 	struct sk_buff *skb;
1118 	int err;
1119 
1120 	/* There is support for UDP large send offload by network
1121 	 * device, so create one single skb packet containing complete
1122 	 * udp datagram
1123 	 */
1124 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1125 		skb = sock_alloc_send_skb(sk,
1126 			hh_len + fragheaderlen + transhdrlen + 20,
1127 			(flags & MSG_DONTWAIT), &err);
1128 		if (skb == NULL)
1129 			return -ENOMEM;
1130 
1131 		/* reserve space for Hardware header */
1132 		skb_reserve(skb, hh_len);
1133 
1134 		/* create space for UDP/IP header */
1135 		skb_put(skb,fragheaderlen + transhdrlen);
1136 
1137 		/* initialize network header pointer */
1138 		skb_reset_network_header(skb);
1139 
1140 		/* initialize protocol header pointer */
1141 		skb->transport_header = skb->network_header + fragheaderlen;
1142 
1143 		skb->ip_summed = CHECKSUM_PARTIAL;
1144 		skb->csum = 0;
1145 	}
1146 
1147 	err = skb_append_datato_frags(sk,skb, getfrag, from,
1148 				      (length - transhdrlen));
1149 	if (!err) {
1150 		struct frag_hdr fhdr;
1151 
1152 		/* Specify the length of each IPv6 datagram fragment.
1153 		 * It has to be a multiple of 8.
1154 		 */
1155 		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1156 					     sizeof(struct frag_hdr)) & ~7;
1157 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1158 		ipv6_select_ident(&fhdr, rt);
1159 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1160 		__skb_queue_tail(&sk->sk_write_queue, skb);
1161 
1162 		return 0;
1163 	}
1164 	/* There is not enough support do UPD LSO,
1165 	 * so follow normal path
1166 	 */
1167 	kfree_skb(skb);
1168 
1169 	return err;
1170 }
1171 
1172 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1173 					       gfp_t gfp)
1174 {
1175 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1176 }
1177 
1178 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1179 						gfp_t gfp)
1180 {
1181 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1182 }
1183 
1184 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1185 	int offset, int len, int odd, struct sk_buff *skb),
1186 	void *from, int length, int transhdrlen,
1187 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1188 	struct rt6_info *rt, unsigned int flags, int dontfrag)
1189 {
1190 	struct inet_sock *inet = inet_sk(sk);
1191 	struct ipv6_pinfo *np = inet6_sk(sk);
1192 	struct inet_cork *cork;
1193 	struct sk_buff *skb;
1194 	unsigned int maxfraglen, fragheaderlen;
1195 	int exthdrlen;
1196 	int dst_exthdrlen;
1197 	int hh_len;
1198 	int mtu;
1199 	int copy;
1200 	int err;
1201 	int offset = 0;
1202 	int csummode = CHECKSUM_NONE;
1203 	__u8 tx_flags = 0;
1204 
1205 	if (flags&MSG_PROBE)
1206 		return 0;
1207 	cork = &inet->cork.base;
1208 	if (skb_queue_empty(&sk->sk_write_queue)) {
1209 		/*
1210 		 * setup for corking
1211 		 */
1212 		if (opt) {
1213 			if (WARN_ON(np->cork.opt))
1214 				return -EINVAL;
1215 
1216 			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1217 			if (unlikely(np->cork.opt == NULL))
1218 				return -ENOBUFS;
1219 
1220 			np->cork.opt->tot_len = opt->tot_len;
1221 			np->cork.opt->opt_flen = opt->opt_flen;
1222 			np->cork.opt->opt_nflen = opt->opt_nflen;
1223 
1224 			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1225 							    sk->sk_allocation);
1226 			if (opt->dst0opt && !np->cork.opt->dst0opt)
1227 				return -ENOBUFS;
1228 
1229 			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1230 							    sk->sk_allocation);
1231 			if (opt->dst1opt && !np->cork.opt->dst1opt)
1232 				return -ENOBUFS;
1233 
1234 			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1235 							   sk->sk_allocation);
1236 			if (opt->hopopt && !np->cork.opt->hopopt)
1237 				return -ENOBUFS;
1238 
1239 			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1240 							    sk->sk_allocation);
1241 			if (opt->srcrt && !np->cork.opt->srcrt)
1242 				return -ENOBUFS;
1243 
1244 			/* need source address above miyazawa*/
1245 		}
1246 		dst_hold(&rt->dst);
1247 		cork->dst = &rt->dst;
1248 		inet->cork.fl.u.ip6 = *fl6;
1249 		np->cork.hop_limit = hlimit;
1250 		np->cork.tclass = tclass;
1251 		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1252 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1253 		if (np->frag_size < mtu) {
1254 			if (np->frag_size)
1255 				mtu = np->frag_size;
1256 		}
1257 		cork->fragsize = mtu;
1258 		if (dst_allfrag(rt->dst.path))
1259 			cork->flags |= IPCORK_ALLFRAG;
1260 		cork->length = 0;
1261 		sk->sk_sndmsg_page = NULL;
1262 		sk->sk_sndmsg_off = 0;
1263 		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1264 		length += exthdrlen;
1265 		transhdrlen += exthdrlen;
1266 		dst_exthdrlen = rt->dst.header_len;
1267 	} else {
1268 		rt = (struct rt6_info *)cork->dst;
1269 		fl6 = &inet->cork.fl.u.ip6;
1270 		opt = np->cork.opt;
1271 		transhdrlen = 0;
1272 		exthdrlen = 0;
1273 		dst_exthdrlen = 0;
1274 		mtu = cork->fragsize;
1275 	}
1276 
1277 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1278 
1279 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1280 			(opt ? opt->opt_nflen : 0);
1281 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1282 
1283 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1284 		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1285 			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1286 			return -EMSGSIZE;
1287 		}
1288 	}
1289 
1290 	/* For UDP, check if TX timestamp is enabled */
1291 	if (sk->sk_type == SOCK_DGRAM) {
1292 		err = sock_tx_timestamp(sk, &tx_flags);
1293 		if (err)
1294 			goto error;
1295 	}
1296 
1297 	/*
1298 	 * Let's try using as much space as possible.
1299 	 * Use MTU if total length of the message fits into the MTU.
1300 	 * Otherwise, we need to reserve fragment header and
1301 	 * fragment alignment (= 8-15 octects, in total).
1302 	 *
1303 	 * Note that we may need to "move" the data from the tail of
1304 	 * of the buffer to the new fragment when we split
1305 	 * the message.
1306 	 *
1307 	 * FIXME: It may be fragmented into multiple chunks
1308 	 *        at once if non-fragmentable extension headers
1309 	 *        are too large.
1310 	 * --yoshfuji
1311 	 */
1312 
1313 	cork->length += length;
1314 	if (length > mtu) {
1315 		int proto = sk->sk_protocol;
1316 		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1317 			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1318 			return -EMSGSIZE;
1319 		}
1320 
1321 		if (proto == IPPROTO_UDP &&
1322 		    (rt->dst.dev->features & NETIF_F_UFO)) {
1323 
1324 			err = ip6_ufo_append_data(sk, getfrag, from, length,
1325 						  hh_len, fragheaderlen,
1326 						  transhdrlen, mtu, flags, rt);
1327 			if (err)
1328 				goto error;
1329 			return 0;
1330 		}
1331 	}
1332 
1333 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1334 		goto alloc_new_skb;
1335 
1336 	while (length > 0) {
1337 		/* Check if the remaining data fits into current packet. */
1338 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1339 		if (copy < length)
1340 			copy = maxfraglen - skb->len;
1341 
1342 		if (copy <= 0) {
1343 			char *data;
1344 			unsigned int datalen;
1345 			unsigned int fraglen;
1346 			unsigned int fraggap;
1347 			unsigned int alloclen;
1348 			struct sk_buff *skb_prev;
1349 alloc_new_skb:
1350 			skb_prev = skb;
1351 
1352 			/* There's no room in the current skb */
1353 			if (skb_prev)
1354 				fraggap = skb_prev->len - maxfraglen;
1355 			else
1356 				fraggap = 0;
1357 
1358 			/*
1359 			 * If remaining data exceeds the mtu,
1360 			 * we know we need more fragment(s).
1361 			 */
1362 			datalen = length + fraggap;
1363 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1364 				datalen = maxfraglen - fragheaderlen;
1365 
1366 			fraglen = datalen + fragheaderlen;
1367 			if ((flags & MSG_MORE) &&
1368 			    !(rt->dst.dev->features&NETIF_F_SG))
1369 				alloclen = mtu;
1370 			else
1371 				alloclen = datalen + fragheaderlen;
1372 
1373 			alloclen += dst_exthdrlen;
1374 
1375 			/*
1376 			 * The last fragment gets additional space at tail.
1377 			 * Note: we overallocate on fragments with MSG_MODE
1378 			 * because we have no idea if we're the last one.
1379 			 */
1380 			if (datalen == length + fraggap)
1381 				alloclen += rt->dst.trailer_len;
1382 
1383 			/*
1384 			 * We just reserve space for fragment header.
1385 			 * Note: this may be overallocation if the message
1386 			 * (without MSG_MORE) fits into the MTU.
1387 			 */
1388 			alloclen += sizeof(struct frag_hdr);
1389 
1390 			if (transhdrlen) {
1391 				skb = sock_alloc_send_skb(sk,
1392 						alloclen + hh_len,
1393 						(flags & MSG_DONTWAIT), &err);
1394 			} else {
1395 				skb = NULL;
1396 				if (atomic_read(&sk->sk_wmem_alloc) <=
1397 				    2 * sk->sk_sndbuf)
1398 					skb = sock_wmalloc(sk,
1399 							   alloclen + hh_len, 1,
1400 							   sk->sk_allocation);
1401 				if (unlikely(skb == NULL))
1402 					err = -ENOBUFS;
1403 				else {
1404 					/* Only the initial fragment
1405 					 * is time stamped.
1406 					 */
1407 					tx_flags = 0;
1408 				}
1409 			}
1410 			if (skb == NULL)
1411 				goto error;
1412 			/*
1413 			 *	Fill in the control structures
1414 			 */
1415 			skb->ip_summed = csummode;
1416 			skb->csum = 0;
1417 			/* reserve for fragmentation */
1418 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1419 
1420 			if (sk->sk_type == SOCK_DGRAM)
1421 				skb_shinfo(skb)->tx_flags = tx_flags;
1422 
1423 			/*
1424 			 *	Find where to start putting bytes
1425 			 */
1426 			data = skb_put(skb, fraglen + dst_exthdrlen);
1427 			skb_set_network_header(skb, exthdrlen + dst_exthdrlen);
1428 			data += fragheaderlen + dst_exthdrlen;
1429 			skb->transport_header = (skb->network_header +
1430 						 fragheaderlen);
1431 			if (fraggap) {
1432 				skb->csum = skb_copy_and_csum_bits(
1433 					skb_prev, maxfraglen,
1434 					data + transhdrlen, fraggap, 0);
1435 				skb_prev->csum = csum_sub(skb_prev->csum,
1436 							  skb->csum);
1437 				data += fraggap;
1438 				pskb_trim_unique(skb_prev, maxfraglen);
1439 			}
1440 			copy = datalen - transhdrlen - fraggap;
1441 
1442 			if (copy < 0) {
1443 				err = -EINVAL;
1444 				kfree_skb(skb);
1445 				goto error;
1446 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1447 				err = -EFAULT;
1448 				kfree_skb(skb);
1449 				goto error;
1450 			}
1451 
1452 			offset += copy;
1453 			length -= datalen - fraggap;
1454 			transhdrlen = 0;
1455 			exthdrlen = 0;
1456 			dst_exthdrlen = 0;
1457 			csummode = CHECKSUM_NONE;
1458 
1459 			/*
1460 			 * Put the packet on the pending queue
1461 			 */
1462 			__skb_queue_tail(&sk->sk_write_queue, skb);
1463 			continue;
1464 		}
1465 
1466 		if (copy > length)
1467 			copy = length;
1468 
1469 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1470 			unsigned int off;
1471 
1472 			off = skb->len;
1473 			if (getfrag(from, skb_put(skb, copy),
1474 						offset, copy, off, skb) < 0) {
1475 				__skb_trim(skb, off);
1476 				err = -EFAULT;
1477 				goto error;
1478 			}
1479 		} else {
1480 			int i = skb_shinfo(skb)->nr_frags;
1481 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1482 			struct page *page = sk->sk_sndmsg_page;
1483 			int off = sk->sk_sndmsg_off;
1484 			unsigned int left;
1485 
1486 			if (page && (left = PAGE_SIZE - off) > 0) {
1487 				if (copy >= left)
1488 					copy = left;
1489 				if (page != skb_frag_page(frag)) {
1490 					if (i == MAX_SKB_FRAGS) {
1491 						err = -EMSGSIZE;
1492 						goto error;
1493 					}
1494 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1495 					skb_frag_ref(skb, i);
1496 					frag = &skb_shinfo(skb)->frags[i];
1497 				}
1498 			} else if(i < MAX_SKB_FRAGS) {
1499 				if (copy > PAGE_SIZE)
1500 					copy = PAGE_SIZE;
1501 				page = alloc_pages(sk->sk_allocation, 0);
1502 				if (page == NULL) {
1503 					err = -ENOMEM;
1504 					goto error;
1505 				}
1506 				sk->sk_sndmsg_page = page;
1507 				sk->sk_sndmsg_off = 0;
1508 
1509 				skb_fill_page_desc(skb, i, page, 0, 0);
1510 				frag = &skb_shinfo(skb)->frags[i];
1511 			} else {
1512 				err = -EMSGSIZE;
1513 				goto error;
1514 			}
1515 			if (getfrag(from,
1516 				    skb_frag_address(frag) + skb_frag_size(frag),
1517 				    offset, copy, skb->len, skb) < 0) {
1518 				err = -EFAULT;
1519 				goto error;
1520 			}
1521 			sk->sk_sndmsg_off += copy;
1522 			skb_frag_size_add(frag, copy);
1523 			skb->len += copy;
1524 			skb->data_len += copy;
1525 			skb->truesize += copy;
1526 			atomic_add(copy, &sk->sk_wmem_alloc);
1527 		}
1528 		offset += copy;
1529 		length -= copy;
1530 	}
1531 	return 0;
1532 error:
1533 	cork->length -= length;
1534 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1535 	return err;
1536 }
1537 
1538 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1539 {
1540 	if (np->cork.opt) {
1541 		kfree(np->cork.opt->dst0opt);
1542 		kfree(np->cork.opt->dst1opt);
1543 		kfree(np->cork.opt->hopopt);
1544 		kfree(np->cork.opt->srcrt);
1545 		kfree(np->cork.opt);
1546 		np->cork.opt = NULL;
1547 	}
1548 
1549 	if (inet->cork.base.dst) {
1550 		dst_release(inet->cork.base.dst);
1551 		inet->cork.base.dst = NULL;
1552 		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1553 	}
1554 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1555 }
1556 
1557 int ip6_push_pending_frames(struct sock *sk)
1558 {
1559 	struct sk_buff *skb, *tmp_skb;
1560 	struct sk_buff **tail_skb;
1561 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1562 	struct inet_sock *inet = inet_sk(sk);
1563 	struct ipv6_pinfo *np = inet6_sk(sk);
1564 	struct net *net = sock_net(sk);
1565 	struct ipv6hdr *hdr;
1566 	struct ipv6_txoptions *opt = np->cork.opt;
1567 	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1568 	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1569 	unsigned char proto = fl6->flowi6_proto;
1570 	int err = 0;
1571 
1572 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1573 		goto out;
1574 	tail_skb = &(skb_shinfo(skb)->frag_list);
1575 
1576 	/* move skb->data to ip header from ext header */
1577 	if (skb->data < skb_network_header(skb))
1578 		__skb_pull(skb, skb_network_offset(skb));
1579 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1580 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1581 		*tail_skb = tmp_skb;
1582 		tail_skb = &(tmp_skb->next);
1583 		skb->len += tmp_skb->len;
1584 		skb->data_len += tmp_skb->len;
1585 		skb->truesize += tmp_skb->truesize;
1586 		tmp_skb->destructor = NULL;
1587 		tmp_skb->sk = NULL;
1588 	}
1589 
1590 	/* Allow local fragmentation. */
1591 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1592 		skb->local_df = 1;
1593 
1594 	ipv6_addr_copy(final_dst, &fl6->daddr);
1595 	__skb_pull(skb, skb_network_header_len(skb));
1596 	if (opt && opt->opt_flen)
1597 		ipv6_push_frag_opts(skb, opt, &proto);
1598 	if (opt && opt->opt_nflen)
1599 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1600 
1601 	skb_push(skb, sizeof(struct ipv6hdr));
1602 	skb_reset_network_header(skb);
1603 	hdr = ipv6_hdr(skb);
1604 
1605 	*(__be32*)hdr = fl6->flowlabel |
1606 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1607 
1608 	hdr->hop_limit = np->cork.hop_limit;
1609 	hdr->nexthdr = proto;
1610 	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1611 	ipv6_addr_copy(&hdr->daddr, final_dst);
1612 
1613 	skb->priority = sk->sk_priority;
1614 	skb->mark = sk->sk_mark;
1615 
1616 	skb_dst_set(skb, dst_clone(&rt->dst));
1617 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1618 	if (proto == IPPROTO_ICMPV6) {
1619 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1620 
1621 		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1622 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1623 	}
1624 
1625 	err = ip6_local_out(skb);
1626 	if (err) {
1627 		if (err > 0)
1628 			err = net_xmit_errno(err);
1629 		if (err)
1630 			goto error;
1631 	}
1632 
1633 out:
1634 	ip6_cork_release(inet, np);
1635 	return err;
1636 error:
1637 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1638 	goto out;
1639 }
1640 
1641 void ip6_flush_pending_frames(struct sock *sk)
1642 {
1643 	struct sk_buff *skb;
1644 
1645 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1646 		if (skb_dst(skb))
1647 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1648 				      IPSTATS_MIB_OUTDISCARDS);
1649 		kfree_skb(skb);
1650 	}
1651 
1652 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1653 }
1654