/* linux/net/ipv6/ip6_output.c (revision 13abf8130139c2ccd4962a7e5a8902be5e6cb5a7) */
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}
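
/*
 * Illustrative note: the counter above is global, so fragment IDs are
 * shared by all flows.  Successive calls hand out 1, 2, 3, ... (stored
 * in network byte order) and 0 is skipped on wrap-around, which lets
 * the slow path of ip6_fragment() treat a frag_id of 0 as "no
 * identification chosen yet".
 */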

static inline int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}


static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
				&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip6_dev_loopback_xmit);

			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

int ip6_output(struct sk_buff *skb)
{
	if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}
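
/*
 * Worked example of the decision above: with dst_mtu(skb->dst) == 1500,
 * a 1600 byte packet goes through ip6_fragment() while a 1280 byte one
 * is handed straight to ip6_output2().  dst_allfrag() forces the
 * fragment header regardless of length, e.g. after a Packet Too Big
 * message advertised an MTU below the IPv6 minimum of 1280.
 */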

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit;
	u32 mtu;

	if (opt) {
		int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			kfree_skb(skb);
			skb = skb2;
			if (skb == NULL) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				return -ENOBUFS;
			}
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	hdr = skb->nh.ipv6h = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr));

	/*
	 *	Fill in the IPv6 header
	 */

	*(u32 *)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
	if (hlimit < 0)
		hlimit = ipv6_get_hoplimit(dst->dev);

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok) {
		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
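
/*
 * A minimal caller sketch, for illustration only (tcp_ipv6.c is the
 * real user).  The transport protocol resolves a route, attaches it to
 * the skb and lets ip6_xmit() prepend the IPv6 header.  The flowi
 * field names follow this file; the surrounding socket state (np) is
 * assumed:
 *
 *	struct flowi fl = { .proto = IPPROTO_TCP };
 *	struct dst_entry *dst;
 *
 *	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
 *	ipv6_addr_copy(&fl.fl6_src, &np->saddr);
 *	if (ip6_dst_lookup(sk, &dst, &fl) == 0) {
 *		skb->dst = dst_clone(dst);
 *		ip6_xmit(sk, skb, &fl, np->opt, 0);	(ipfragok == 0)
 *	}
 */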

/*
 *	To avoid extra problems, ND packets are sent through this
 *	routine.  It is code duplication, but I really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       struct in6_addr *saddr, struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
	skb->nh.ipv6h = hdr;

	*(u32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We do NOT do any processing on RA packets, pushing them
	 *	to user level AS IS without any WARRANTY that the
	 *	application will be able to interpret them.  The reason
	 *	is that we cannot do anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	must not be fragmented, because there is no guarantee
	 *	that different fragments will travel along one path. --ANK
	 */
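	/*
	 * Layout of the Router Alert hop-by-hop option (RFC 2711) that
	 * the check below relies on: opt->ra is the offset of the
	 * option within the packet, so ptr[0] is the option type (5),
	 * ptr[1] the option data length (2), and ptr[2..3] the 16-bit
	 * value that ip6_call_ra_chain() matches against ra->sel.
	 */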
	if (opt->ra) {
		u8 *ptr = skb->nh.raw + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source-routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same;
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
						|IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = skb->nh.ipv6h;

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#endif
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(skb->nh.ipv6h + 1);
	unsigned int packet_len = skb->tail - skb->nh.raw;
	int found_rhdr = 0;
	*nexthdr = &skb->nh.ipv6h->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
		case NEXTHDR_ROUTING:
		case NEXTHDR_DEST:
			if (**nexthdr == NEXTHDR_ROUTING)
				found_rhdr = 1;
			if (**nexthdr == NEXTHDR_DEST && found_rhdr)
				return offset;
			offset += ipv6_optlen(exthdr);
			*nexthdr = &exthdr->nexthdr;
			exthdr = (struct ipv6_opt_hdr *)(skb->nh.raw + offset);
			break;
		default:
			return offset;
		}
	}

	return offset;
}
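
/*
 * Worked example: for a packet laid out as
 *
 *	IPv6 | Hop-by-Hop | Routing | Dest opts | TCP
 *
 * the loop above walks past Hop-by-Hop and Routing and returns the
 * offset of the Dest opts header (found_rhdr is set), so the Fragment
 * header ends up in front of the destination options that follow the
 * routing header.  This matches RFC 2460: only headers that need not
 * be examined by intermediate nodes belong to the fragmentable part.
 */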

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	u32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		*prevhdr = NEXTHDR_FRAGMENT;
		memcpy(tmp_hdr, skb->nh.raw, hlen);
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		skb->nh.raw = __skb_push(skb, hlen);
		memcpy(skb->nh.raw, tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				frag->h.raw = frag->data;
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				frag->nh.raw = __skb_push(frag, hlen);
				memcpy(frag->nh.raw, tmp_hdr, hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_RESERVED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		frag->nh.raw = frag->data;
		fh = (struct frag_hdr *)(frag->data + hlen);
		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		memcpy(frag->nh.raw, skb->data, hlen);

		/*
		 *	Build fragment header: pick a fresh identification
		 *	for the first fragment and reuse it for the rest.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

		err = output(frag);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}
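
/*
 * Worked example for the slow path above, assuming a 1500 byte path
 * MTU and a 40 byte unfragmentable part (hlen): the per-fragment data
 * space is 1500 - 40 - 8 = 1452 bytes, rounded down to 1448 so every
 * fragment but the last carries a multiple of 8 bytes (the 13-bit
 * fragment offset field counts 8-byte units).  A 4000 byte payload
 * therefore leaves as fragments carrying 1448, 1448 and 1104 bytes.
 */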

int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		*dst = sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info *)*dst;

			/* Yes, checking route validity in the not connected
			   case is not very simple.  Take into account that
			   we do not support routing by source, TOS, and
			   MSG_DONTROUTE		--ANK (980726)

			   1. If route was host route, check that
			      cached destination is current.
			      If it is network route, we still may
			      check its validity using saved pointer
			      to the last used address: daddr_cache.
			      We do not want to save whole address now
			      (because the main consumer of this service
			       is tcp, which does not have this problem),
			      so the last trick works only on connected
			      sockets.
			   2. oif also should be the same.
			 */

			if (((rt->rt6i_dst.plen != 128 ||
			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				dst_release(*dst);
				*dst = NULL;
			}
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

		if (err)
			goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}
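
/*
 * Typical use, sketched for illustration (datagram send paths in the
 * style of udp_v6_sendmsg are the real consumers): fill in a flowi,
 * let ip6_dst_lookup() find or revalidate a route and pick a source
 * address, then queue data against the returned route:
 *
 *	struct flowi fl = { .proto = IPPROTO_UDP, .oif = oif };
 *	struct dst_entry *dst;
 *	int err;
 *
 *	ipv6_addr_copy(&fl.fl6_dst, daddr);
 *	err = ip6_dst_lookup(sk, &dst, &fl);
 *	if (err)
 *		return err;	(route or source address lookup failed)
 *	... ip6_append_data(sk, ..., &fl, (struct rt6_info *)dst, flags) ...
 */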

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa*/
		}
		dst_hold(&rt->u.dst);
		np->cork.rt = rt;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = np->cork.rt;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */
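
	/*
	 * Worked example: an mtu of 1500 with no destination options
	 * gives fragheaderlen = 40, so
	 * maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488.  Once the
	 * queued data outgrows a single packet, each fragment-sized skb
	 * is capped at 1488 bytes, leaving room for the 8-byte fragment
	 * header that ip6_fragment() inserts later.
	 */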

	inet->cork.length += length;

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu &&
			!(inet->cork.flags & IPCORK_ALLFRAG) ?
			mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu &&
				       !(inet->cork.flags & IPCORK_ALLFRAG) ?
				       mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    page_address(frag->page) + frag->page_offset + frag->size,
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}
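
/*
 * A minimal getfrag callback, sketched under the assumption that
 * "from" points at a flat kernel buffer (real callers pass helpers
 * such as ip_generic_getfrag(), which also fold the copied bytes into
 * the checksum when the skb needs one):
 *
 *	static int example_getfrag(void *from, char *to, int offset,
 *				   int len, int odd, struct sk_buff *skb)
 *	{
 *		memcpy(to, (char *)from + offset, len);
 *		return 0;
 *	}
 *
 * ip6_append_data() invokes it with increasing "offset" until "length"
 * bytes have been pulled into the socket's write queue.
 */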

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = np->cork.rt;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb->h.raw - skb->nh.raw);
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb->nh.ipv6h = hdr = (struct ipv6hdr *) skb_push(skb, sizeof(struct ipv6hdr));

	*(u32 *)hdr = fl->fl6_flowlabel | htonl(0x60000000);

	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
	else
		hdr->payload_len = 0;
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev,
		      dst_output);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
	return err;
error:
	goto out;
}
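
/*
 * ip6_append_data() and ip6_push_pending_frames() together implement
 * the corking workflow for datagram sockets (see the ip6_append_data
 * credit in the header comment).  Sketched usage, with error handling
 * and the transport header elided; udp_v6_sendmsg-style code is
 * assumed:
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, len,
 *			      sizeof(struct udphdr), hlimit, opt,
 *			      fl, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 *
 * With MSG_MORE set, the data stays queued on sk_write_queue until a
 * later send without the flag pushes everything out as one datagram
 * (fragmented by ip6_fragment() if it exceeds the path MTU).
 */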

void ip6_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	inet->cork.flags &= ~IPCORK_OPT;

	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}
1149