/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO (network byte order)
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

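/*
 * Select the 32-bit Identification value for a fragmented packet
 * (RFC 2460, section 4.5).  A single global counter protected by a
 * spinlock is used; zero is deliberately skipped, so ip6_fragment()
 * can treat a zero frag_id as "not assigned yet".  The skb argument
 * is currently unused.
 */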
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

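/*
 * Final output step: emit the frame through the cached hardware header
 * (hh_cache) when the route has one, otherwise through the neighbour's
 * output function.  With neither available there is nowhere to send the
 * packet, so it is counted as OUTNOROUTES and dropped.
 */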
static inline int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}


static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
				&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip6_dev_loopback_xmit);

			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

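/*
 * Entry point reached via dst_output().  Fragment if the packet is
 * larger than the path MTU, or if the route has the "allfrag" feature
 * set (a Packet Too Big below the 1280-byte IPv6 minimum MTU was seen,
 * so every packet on this path must carry a fragment header);
 * otherwise send the packet as is.
 */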
int ip6_output(struct sk_buff *skb)
{
	if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

#ifdef CONFIG_NETFILTER
int ip6_route_me_harder(struct sk_buff *skb)
{
	struct ipv6hdr *iph = skb->nh.ipv6h;
	struct dst_entry *dst;
	struct flowi fl = {
		.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
		.nl_u =
		{ .ip6_u =
		  { .daddr = iph->daddr,
		    .saddr = iph->saddr, } },
		.proto = iph->nexthdr,
	};

	dst = ip6_route_output(skb->sk, &fl);

	if (dst->error) {
		IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
		LIMIT_NETDEBUG(
			printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
		dst_release(dst);
		return -EINVAL;
	}

	/* Drop old route. */
	dst_release(skb->dst);

	skb->dst = dst;
	return 0;
}
#endif

static inline int ip6_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
	if (skb->nfcache & NFC_ALTERED) {
		if (ip6_route_me_harder(skb) != 0) {
			kfree_skb(skb);
			return -EINVAL;
		}
	}
#endif /* CONFIG_NETFILTER */
	return dst_output(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit;
	u32 mtu;

	if (opt) {
		int head_room;

		/* First: exthdrs may take lots of space (~8KB for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			kfree_skb(skb);
			skb = skb2;
			if (skb == NULL) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				return -ENOBUFS;
			}
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

	/*
	 *	Fill in the IPv6 header
	 */

	*(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
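	/* Hop limit fallback order: the per-socket value if set, then the
	 * route's RTAX_HOPLIMIT metric, then the interface default
	 * (typically 64).
	 */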
	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
	if (hlimit < 0)
		hlimit = ipv6_get_hoplimit(dst->dev);

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok) {
		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
			       ip6_maybe_reroute);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine.  It duplicates code, but we really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       struct in6_addr *saddr, struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
	skb->nh.ipv6h = hdr;

	*(u32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

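/*
 * Deliver a forwarded packet carrying a hop-by-hop Router Alert option
 * to every raw socket that registered interest (via IPV6_ROUTER_ALERT)
 * in the given alert value `sel'.  Returns 1 if at least one socket
 * consumed the packet, 0 if the caller still owns it.
 */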
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We do NOT do any processing on Router Alert packets;
	 *	they are pushed to user level AS IS, with no warranty
	 *	that an application will be able to interpret them.
	 *	The reason is that we cannot do anything clever here:
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.  Defragmenting
	 *	would also be a mistake: RA packets must not be
	 *	fragmented, because there is no warranty that different
	 *	fragments will travel along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb->nh.raw + opt->ra;
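		/* ptr points at the Router Alert TLV: option type and
		 * length in bytes 0-1, then the 16-bit alert value in
		 * network byte order in bytes 2-3.
		 */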
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force the output device, so the ICMP error carries
		   its source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects for source-routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr) &
		   (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK |
		    IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force the output device, so the ICMP error
		   carries its source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = skb->nh.ipv6h;

	/* Decrementing the hop limit is delayed until after the skb COW,
	   so we never modify a shared packet. */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

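/*
 * Copy per-packet metadata (device, dst reference, priority, netfilter
 * and traffic-control state) from the original packet to a fragment, so
 * that every fragment is treated exactly like the original downstream.
 */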
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	to->security = from->security;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#ifdef CONFIG_NETFILTER_DEBUG
	to->nf_debug = from->nf_debug;
#endif
#endif
}

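/*
 * Find where the fragment header belongs.  Per RFC 2460, the
 * "unfragmentable part" consists of the hop-by-hop header, any routing
 * header, and a destination options header that precedes a routing
 * header; the fragment header goes right after them.  Returns that
 * offset and points *nexthdr at the next-header byte that will be
 * patched with NEXTHDR_FRAGMENT.
 */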
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
	unsigned int packet_len = skb->tail - skb->nh.raw;
	int found_rhdr = 0;
	*nexthdr = &skb->nh.ipv6h->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
		case NEXTHDR_ROUTING:
		case NEXTHDR_DEST:
			if (**nexthdr == NEXTHDR_ROUTING)
				found_rhdr = 1;
			if (**nexthdr == NEXTHDR_DEST && found_rhdr)
				return offset;
			offset += ipv6_optlen(exthdr);
			*nexthdr = &exthdr->nexthdr;
			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
			break;
		default:
			return offset;
		}
	}

	return offset;
}

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb->dst;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	u32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
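	/* 'mtu' is the per-fragment payload budget.  For example, with a
	 * 1500-byte link MTU and a bare 40-byte IPv6 header (hlen == 40),
	 * mtu = 1500 - 40 - 8 = 1452; all fragments but the last must
	 * carry a multiple of 8 bytes, i.e. at most 1448 bytes each.
	 */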

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		*prevhdr = NEXTHDR_FRAGMENT;
		memcpy(tmp_hdr, skb->nh.raw, hlen);
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		skb->nh.raw = __skb_push(skb, hlen);
		memcpy(skb->nh.raw, tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes out. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				frag->h.raw = frag->data;
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				frag->nh.raw = __skb_push(frag, hlen);
				memcpy(frag->nh.raw, tmp_hdr, hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight-byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_RESERVED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		frag->nh.raw = frag->data;
		fh = (struct frag_hdr*)(frag->data + hlen);
		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		memcpy(frag->nh.raw, skb->data, hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			/* First fragment: pick a fresh Identification and
			 * reuse it for all subsequent fragments. */
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

		err = output(frag);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}

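/*
 * Resolve (or validate a cached) route for the flow and, when the flow
 * has no source address yet, pick one with ipv6_get_saddr().  On error
 * the dst reference is dropped and *dst is reset to NULL.
 */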
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		*dst = sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info*)*dst;

				/* Yes, checking route validity in the
				   not-connected case is not very simple.
				   Take into account that we do not support
				   routing by source, TOS, and MSG_DONTROUTE.
				   		--ANK (980726)

				   1. If the route was a host route, check that
				      the cached destination is current.
				      If it is a network route, we still may
				      check its validity using the saved
				      pointer to the last used address:
				      daddr_cache.  We do not want to save the
				      whole address now (because the main
				      consumer of this service is TCP, which
				      does not have this problem), so the last
				      trick works only on connected sockets.
				   2. oif also should be the same.
				 */

			if (((rt->rt6i_dst.plen != 128 ||
			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				dst_release(*dst);
				*dst = NULL;
			}
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

		if (err) {
#if IP6_DEBUG >= 2
			printk(KERN_DEBUG "ip6_dst_lookup: "
			       "no available source address\n");
#endif
			goto out_err_release;
		}
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above. --miyazawa */
		}
		dst_hold(&rt->u.dst);
		np->cork.rt = rt;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = np->cork.rt;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
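	/* Worked example: with mtu 1500 and no non-fragmentable extension
	 * headers, fragheaderlen = 40 and
	 * maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488,
	 * i.e. each non-final fragment holds 1448 payload bytes, 8-aligned.
	 */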

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

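/*
 * Splice all skbs queued by ip6_append_data() into one packet (the
 * tail skbs become the head skb's frag_list), push the extension
 * headers and the IPv6 header, then hand the result to dst_output()
 * through the NF_IP6_LOCAL_OUT netfilter hook.
 */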
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = np->cork.rt;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb->h.raw - skb->nh.raw);
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));

	*(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);

	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
	else
		hdr->payload_len = 0;
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev,
		      dst_output);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
	return err;
error:
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	inet->cork.flags &= ~IPCORK_OPT;

	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}
1200