xref: /linux/net/ipv6/ip6_output.c (revision 60b2737de1b1ddfdb90f3ba622634eb49d6f3603)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *	Based on linux/net/ipv4/ip_output.c
11  *
12  *	This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *	Changes:
18  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
19  *				extension headers are implemented.
20  *				route changes now work.
21  *				ip6_forward does not confuse sniffers.
22  *				etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *	Imran Patel	: 	frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *			:       add ip6_append_data and related functions
28  *				for datagram xmit
29  */
30 
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42 
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
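/*
 * Pick the Identification value for a fragment header.  A single global
 * counter is shared by all flows and protected by a spinlock; it is never
 * allowed to reach zero, so that ip6_fragment()'s "frag_id == 0" check can
 * safely mean "no ID assigned yet" for the packet currently being split.
 */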
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 {
63 	static u32 ipv6_fragmentation_id = 1;
64 	static DEFINE_SPINLOCK(ip6_id_lock);
65 
66 	spin_lock_bh(&ip6_id_lock);
67 	fhdr->identification = htonl(ipv6_fragmentation_id);
68 	if (++ipv6_fragmentation_id == 0)
69 		ipv6_fragmentation_id = 1;
70 	spin_unlock_bh(&ip6_id_lock);
71 }
72 
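/*
 * Final transmission step: if the destination has a cached hardware header
 * (hh_cache), copy it in front of the packet and use the cached output
 * method; otherwise fall back to the neighbour's output routine, which may
 * still need to resolve the link-layer address.  With neither available the
 * packet cannot be sent and is counted as OUTNOROUTES.
 */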
73 static inline int ip6_output_finish(struct sk_buff *skb)
74 {
75 
76 	struct dst_entry *dst = skb->dst;
77 	struct hh_cache *hh = dst->hh;
78 
79 	if (hh) {
80 		int hh_alen;
81 
82 		read_lock_bh(&hh->hh_lock);
83 		hh_alen = HH_DATA_ALIGN(hh->hh_len);
84 		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85 		read_unlock_bh(&hh->hh_lock);
86 		skb_push(skb, hh->hh_len);
87 		return hh->hh_output(skb);
88 	} else if (dst->neighbour)
89 		return dst->neighbour->output(skb);
90 
91 	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92 	kfree_skb(skb);
93 	return -EINVAL;
94 
95 }
96 
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100 	newskb->mac.raw = newskb->data;
101 	__skb_pull(newskb, newskb->nh.raw - newskb->data);
102 	newskb->pkt_type = PACKET_LOOPBACK;
103 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
104 	BUG_TRAP(newskb->dst);
105 
106 	netif_rx(newskb);
107 	return 0;
108 }
109 
110 
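/*
 * Device-level output.  For multicast destinations that the local machine
 * has itself joined, a clone of the packet is looped back through netif_rx()
 * (unless the sending socket cleared mc_loop via IPV6_MULTICAST_LOOP), and a
 * hop limit of zero means the packet is not supposed to leave the host at
 * all, so the original is discarded.  Everything else goes through the
 * NF_IP6_POST_ROUTING hook to ip6_output_finish().
 */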
111 static int ip6_output2(struct sk_buff *skb)
112 {
113 	struct dst_entry *dst = skb->dst;
114 	struct net_device *dev = dst->dev;
115 
116 	skb->protocol = htons(ETH_P_IPV6);
117 	skb->dev = dev;
118 
119 	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120 		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121 
122 		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123 		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124 				&skb->nh.ipv6h->saddr)) {
125 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126 
127 			/* Do not check for IFF_ALLMULTI; multicast routing
128 			   is not supported in any case.
129 			 */
130 			if (newskb)
131 				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132 					newskb->dev,
133 					ip6_dev_loopback_xmit);
134 
135 			if (skb->nh.ipv6h->hop_limit == 0) {
136 				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137 				kfree_skb(skb);
138 				return 0;
139 			}
140 		}
141 
142 		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143 	}
144 
145 	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev, ip6_output_finish);
146 }
147 
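/*
 * Entry point used as dst->output.  Packets larger than the path MTU, or
 * routed over a dst marked "allfrag" (typically set when path MTU discovery
 * reported an MTU below the IPv6 minimum of 1280, so every packet must carry
 * a fragment header), are fragmented first.
 */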
148 int ip6_output(struct sk_buff *skb)
149 {
150 	if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
151 		return ip6_fragment(skb, ip6_output2);
152 	else
153 		return ip6_output2(skb);
154 }
155 
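/*
 * ip6_route_me_harder() redoes the routing decision after netfilter has
 * possibly rewritten the addresses; ip6_maybe_reroute() below calls it only
 * when the NFC_ALTERED flag indicates the packet was mangled, so the common
 * case stays a plain dst_output().
 */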
156 #ifdef CONFIG_NETFILTER
157 int ip6_route_me_harder(struct sk_buff *skb)
158 {
159 	struct ipv6hdr *iph = skb->nh.ipv6h;
160 	struct dst_entry *dst;
161 	struct flowi fl = {
162 		.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
163 		.nl_u =
164 		{ .ip6_u =
165 		  { .daddr = iph->daddr,
166 		    .saddr = iph->saddr, } },
167 		.proto = iph->nexthdr,
168 	};
169 
170 	dst = ip6_route_output(skb->sk, &fl);
171 
172 	if (dst->error) {
173 		IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
174 		LIMIT_NETDEBUG(
175 			printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
176 		dst_release(dst);
177 		return -EINVAL;
178 	}
179 
180 	/* Drop old route. */
181 	dst_release(skb->dst);
182 
183 	skb->dst = dst;
184 	return 0;
185 }
186 #endif
187 
188 static inline int ip6_maybe_reroute(struct sk_buff *skb)
189 {
190 #ifdef CONFIG_NETFILTER
191 	if (skb->nfcache & NFC_ALTERED) {
192 		if (ip6_route_me_harder(skb) != 0) {
193 			kfree_skb(skb);
194 			return -EINVAL;
195 		}
196 	}
197 #endif /* CONFIG_NETFILTER */
198 	return dst_output(skb);
199 }
200 
201 /*
202  *	xmit an sk_buff (used by TCP)
203  */
204 
205 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
206 	     struct ipv6_txoptions *opt, int ipfragok)
207 {
208 	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
209 	struct in6_addr *first_hop = &fl->fl6_dst;
210 	struct dst_entry *dst = skb->dst;
211 	struct ipv6hdr *hdr;
212 	u8  proto = fl->proto;
213 	int seg_len = skb->len;
214 	int hlimit;
215 	u32 mtu;
216 
217 	if (opt) {
218 		int head_room;
219 
220 		/* First: exthdrs may take lots of space (~8K for now);
221 		   MAX_HEADER is not enough.
222 		 */
223 		head_room = opt->opt_nflen + opt->opt_flen;
224 		seg_len += head_room;
225 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
226 
227 		if (skb_headroom(skb) < head_room) {
228 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
229 			kfree_skb(skb);
230 			skb = skb2;
231 			if (skb == NULL) {
232 				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
233 				return -ENOBUFS;
234 			}
235 			if (sk)
236 				skb_set_owner_w(skb, sk);
237 		}
238 		if (opt->opt_flen)
239 			ipv6_push_frag_opts(skb, opt, &proto);
240 		if (opt->opt_nflen)
241 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
242 	}
243 
244 	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
245 
246 	/*
247 	 *	Fill in the IPv6 header
248 	 */
249 
250 	*(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
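	/*
	 * Hop limit preference order: the per-socket value, then the
	 * route's RTAX_HOPLIMIT metric, then the device/default value;
	 * a negative result at each step means "not set here".
	 */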
251 	hlimit = -1;
252 	if (np)
253 		hlimit = np->hop_limit;
254 	if (hlimit < 0)
255 		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
256 	if (hlimit < 0)
257 		hlimit = ipv6_get_hoplimit(dst->dev);
258 
259 	hdr->payload_len = htons(seg_len);
260 	hdr->nexthdr = proto;
261 	hdr->hop_limit = hlimit;
262 
263 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
264 	ipv6_addr_copy(&hdr->daddr, first_hop);
265 
266 	mtu = dst_mtu(dst);
267 	if ((skb->len <= mtu) || ipfragok) {
268 		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
269 		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
270 	}
271 
272 	if (net_ratelimit())
273 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
274 	skb->dev = dst->dev;
275 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
276 	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
277 	kfree_skb(skb);
278 	return -EMSGSIZE;
279 }
280 
281 /*
282  *	To avoid extra problems, ND packets are sent through this
283  *	routine. It is code duplication, but I really want to avoid
284  *	extra checks, since ipv6_build_header is used by TCP (which
285  *	is performance-critical for us).
286  */
287 
288 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
289 	       struct in6_addr *saddr, struct in6_addr *daddr,
290 	       int proto, int len)
291 {
292 	struct ipv6_pinfo *np = inet6_sk(sk);
293 	struct ipv6hdr *hdr;
294 	int totlen;
295 
296 	skb->protocol = htons(ETH_P_IPV6);
297 	skb->dev = dev;
298 
299 	totlen = len + sizeof(struct ipv6hdr);
300 
301 	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
302 	skb->nh.ipv6h = hdr;
303 
304 	*(u32*)hdr = htonl(0x60000000);
305 
306 	hdr->payload_len = htons(len);
307 	hdr->nexthdr = proto;
308 	hdr->hop_limit = np->hop_limit;
309 
310 	ipv6_addr_copy(&hdr->saddr, saddr);
311 	ipv6_addr_copy(&hdr->daddr, daddr);
312 
313 	return 0;
314 }
315 
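/*
 * Deliver a copy of a Router Alert packet to every raw socket that
 * registered the matching alert value (via the IPV6_ROUTER_ALERT socket
 * option, which links the socket into ip6_ra_chain).  The last matching
 * socket receives the original skb, earlier ones get clones.  A return
 * value of 1 means the packet was consumed here and must not be forwarded.
 */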
316 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
317 {
318 	struct ip6_ra_chain *ra;
319 	struct sock *last = NULL;
320 
321 	read_lock(&ip6_ra_lock);
322 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
323 		struct sock *sk = ra->sk;
324 		if (sk && ra->sel == sel) {
325 			if (last) {
326 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
327 				if (skb2)
328 					rawv6_rcv(last, skb2);
329 			}
330 			last = sk;
331 		}
332 	}
333 
334 	if (last) {
335 		rawv6_rcv(last, skb);
336 		read_unlock(&ip6_ra_lock);
337 		return 1;
338 	}
339 	read_unlock(&ip6_ra_lock);
340 	return 0;
341 }
342 
343 static inline int ip6_forward_finish(struct sk_buff *skb)
344 {
345 	return dst_output(skb);
346 }
347 
348 int ip6_forward(struct sk_buff *skb)
349 {
350 	struct dst_entry *dst = skb->dst;
351 	struct ipv6hdr *hdr = skb->nh.ipv6h;
352 	struct inet6_skb_parm *opt = IP6CB(skb);
353 
354 	if (ipv6_devconf.forwarding == 0)
355 		goto error;
356 
357 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
358 		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
359 		goto drop;
360 	}
361 
362 	skb->ip_summed = CHECKSUM_NONE;
363 
364 	/*
365 	 *	We DO NOT do any processing on RA packets,
366 	 *	pushing them to user level AS IS, without any
367 	 *	warranty that the application will be able to
368 	 *	interpret them. The reason is that we cannot do
369 	 *	anything clever here.
370 	 *
371 	 *	We are not an end node, so if the packet contains
372 	 *	AH/ESP we cannot do anything with it.
373 	 *	Defragmentation would also be a mistake: RA packets
374 	 *	cannot be fragmented, because there is no guarantee
375 	 *	that different fragments will follow the same path. --ANK
376 	 */
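	/*
	 * opt->ra is the offset of the Router Alert option within the
	 * packet headers; per RFC 2711 the option is laid out as type,
	 * length and a 16-bit value, so ptr[2]/ptr[3] below reconstruct
	 * that value for ip6_call_ra_chain().
	 */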
377 	if (opt->ra) {
378 		u8 *ptr = skb->nh.raw + opt->ra;
379 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
380 			return 0;
381 	}
382 
383 	/*
384 	 *	check and decrement ttl
385 	 */
386 	if (hdr->hop_limit <= 1) {
387 		/* Force OUTPUT device used as source address */
388 		skb->dev = dst->dev;
389 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
390 			    0, skb->dev);
391 
392 		kfree_skb(skb);
393 		return -ETIMEDOUT;
394 	}
395 
396 	if (!xfrm6_route_forward(skb)) {
397 		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
398 		goto drop;
399 	}
400 	dst = skb->dst;
401 
402 	/* IPv6 specs say nothing about it, but it is clear that we cannot
403 	   send redirects to source routed frames.
404 	 */
405 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
406 		struct in6_addr *target = NULL;
407 		struct rt6_info *rt;
408 		struct neighbour *n = dst->neighbour;
409 
410 		/*
411 		 *	incoming and outgoing devices are the same
412 		 *	send a redirect.
413 		 */
414 
415 		rt = (struct rt6_info *) dst;
416 		if ((rt->rt6i_flags & RTF_GATEWAY))
417 			target = (struct in6_addr*)&n->primary_key;
418 		else
419 			target = &hdr->daddr;
420 
421 		/* Limit redirects both by destination (here)
422 		   and by source (inside ndisc_send_redirect)
423 		 */
424 		if (xrlim_allow(dst, 1*HZ))
425 			ndisc_send_redirect(skb, n, target);
426 	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
427 						|IPV6_ADDR_LINKLOCAL)) {
428 		/* This check is security critical. */
429 		goto error;
430 	}
431 
432 	if (skb->len > dst_mtu(dst)) {
433 		/* Again, force OUTPUT device used as source address */
434 		skb->dev = dst->dev;
435 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
436 		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
437 		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
438 		kfree_skb(skb);
439 		return -EMSGSIZE;
440 	}
441 
442 	if (skb_cow(skb, dst->dev->hard_header_len)) {
443 		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
444 		goto drop;
445 	}
446 
447 	hdr = skb->nh.ipv6h;
448 
449 	/* Mangling hops number delayed to point after skb COW */
450 
451 	hdr->hop_limit--;
452 
453 	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
454 	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
455 
456 error:
457 	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
458 drop:
459 	kfree_skb(skb);
460 	return -EINVAL;
461 }
462 
463 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
464 {
465 	to->pkt_type = from->pkt_type;
466 	to->priority = from->priority;
467 	to->protocol = from->protocol;
468 	to->security = from->security;
469 	dst_release(to->dst);
470 	to->dst = dst_clone(from->dst);
471 	to->dev = from->dev;
472 
473 #ifdef CONFIG_NET_SCHED
474 	to->tc_index = from->tc_index;
475 #endif
476 #ifdef CONFIG_NETFILTER
477 	to->nfmark = from->nfmark;
478 	/* Connection association is same as pre-frag packet */
479 	to->nfct = from->nfct;
480 	nf_conntrack_get(to->nfct);
481 	to->nfctinfo = from->nfctinfo;
482 #ifdef CONFIG_BRIDGE_NETFILTER
483 	nf_bridge_put(to->nf_bridge);
484 	to->nf_bridge = from->nf_bridge;
485 	nf_bridge_get(to->nf_bridge);
486 #endif
487 #endif
488 }
489 
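/*
 * Find the offset (from the start of the IPv6 header) at which a Fragment
 * header has to be inserted: the unfragmentable part, i.e. Hop-by-Hop,
 * Routing and any Destination Options header seen before the Routing
 * header, is skipped, while a Destination Options header that follows a
 * Routing header (options for the final destination) stays in the
 * fragmentable part, matching the header ordering of RFC 2460.  On return,
 * *nexthdr points at the Next Header byte the caller has to patch to
 * NEXTHDR_FRAGMENT.
 */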
490 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
491 {
492 	u16 offset = sizeof(struct ipv6hdr);
493 	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
494 	unsigned int packet_len = skb->tail - skb->nh.raw;
495 	int found_rhdr = 0;
496 	*nexthdr = &skb->nh.ipv6h->nexthdr;
497 
498 	while (offset + 1 <= packet_len) {
499 
500 		switch (**nexthdr) {
501 
502 		case NEXTHDR_HOP:
503 		case NEXTHDR_ROUTING:
504 		case NEXTHDR_DEST:
505 			if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
506 			if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
507 			offset += ipv6_optlen(exthdr);
508 			*nexthdr = &exthdr->nexthdr;
509 			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
510 			break;
511 		default:
512 			return offset;
513 		}
514 	}
515 
516 	return offset;
517 }
518 
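/*
 * Two strategies: if the packet already comes as a chain of suitably sized
 * buffers in frag_list (each fragment no larger than the MTU, all but the
 * last a multiple of 8 bytes, enough headroom, nothing shared), the
 * fragments are sent in place after prepending a copy of the unfragmentable
 * headers and a fragment header to each; otherwise the slow path copies the
 * payload into freshly allocated skbs.
 *
 * Illustrative arithmetic (assuming a plain 40-byte IPv6 header and a
 * 1500-byte MTU): mtu below becomes 1500 - 40 - 8 = 1452, and the slow path
 * rounds that down to a multiple of 8, so every non-final fragment carries
 * 1448 bytes of payload.
 */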
519 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
520 {
521 	struct net_device *dev;
522 	struct sk_buff *frag;
523 	struct rt6_info *rt = (struct rt6_info*)skb->dst;
524 	struct ipv6hdr *tmp_hdr;
525 	struct frag_hdr *fh;
526 	unsigned int mtu, hlen, left, len;
527 	u32 frag_id = 0;
528 	int ptr, offset = 0, err=0;
529 	u8 *prevhdr, nexthdr = 0;
530 
531 	dev = rt->u.dst.dev;
532 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
533 	nexthdr = *prevhdr;
534 
535 	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
536 
537 	if (skb_shinfo(skb)->frag_list) {
538 		int first_len = skb_pagelen(skb);
539 
540 		if (first_len - hlen > mtu ||
541 		    ((first_len - hlen) & 7) ||
542 		    skb_cloned(skb))
543 			goto slow_path;
544 
545 		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
546 			/* Correct geometry. */
547 			if (frag->len > mtu ||
548 			    ((frag->len & 7) && frag->next) ||
549 			    skb_headroom(frag) < hlen)
550 			    goto slow_path;
551 
552 			/* Partially cloned skb? */
553 			if (skb_shared(frag))
554 				goto slow_path;
555 
556 			BUG_ON(frag->sk);
557 			if (skb->sk) {
558 				sock_hold(skb->sk);
559 				frag->sk = skb->sk;
560 				frag->destructor = sock_wfree;
561 				skb->truesize -= frag->truesize;
562 			}
563 		}
564 
565 		err = 0;
566 		offset = 0;
567 		frag = skb_shinfo(skb)->frag_list;
568 		skb_shinfo(skb)->frag_list = NULL;
569 		/* BUILD HEADER */
570 
571 		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
572 		if (!tmp_hdr) {
573 			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
574 			return -ENOMEM;
575 		}
576 
577 		*prevhdr = NEXTHDR_FRAGMENT;
578 		memcpy(tmp_hdr, skb->nh.raw, hlen);
579 		__skb_pull(skb, hlen);
580 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
581 		skb->nh.raw = __skb_push(skb, hlen);
582 		memcpy(skb->nh.raw, tmp_hdr, hlen);
583 
584 		ipv6_select_ident(skb, fh);
585 		fh->nexthdr = nexthdr;
586 		fh->reserved = 0;
587 		fh->frag_off = htons(IP6_MF);
588 		frag_id = fh->identification;
589 
590 		first_len = skb_pagelen(skb);
591 		skb->data_len = first_len - skb_headlen(skb);
592 		skb->len = first_len;
593 		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
594 
595 
596 		for (;;) {
597 			/* Prepare the header of the next fragment,
598 			 * before the previous one goes out. */
599 			if (frag) {
600 				frag->ip_summed = CHECKSUM_NONE;
601 				frag->h.raw = frag->data;
602 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
603 				frag->nh.raw = __skb_push(frag, hlen);
604 				memcpy(frag->nh.raw, tmp_hdr, hlen);
605 				offset += skb->len - hlen - sizeof(struct frag_hdr);
606 				fh->nexthdr = nexthdr;
607 				fh->reserved = 0;
608 				fh->frag_off = htons(offset);
609 				if (frag->next != NULL)
610 					fh->frag_off |= htons(IP6_MF);
611 				fh->identification = frag_id;
612 				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
613 				ip6_copy_metadata(frag, skb);
614 			}
615 
616 			err = output(skb);
617 			if (err || !frag)
618 				break;
619 
620 			skb = frag;
621 			frag = skb->next;
622 			skb->next = NULL;
623 		}
624 
625 		if (tmp_hdr)
626 			kfree(tmp_hdr);
627 
628 		if (err == 0) {
629 			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
630 			return 0;
631 		}
632 
633 		while (frag) {
634 			skb = frag->next;
635 			kfree_skb(frag);
636 			frag = skb;
637 		}
638 
639 		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
640 		return err;
641 	}
642 
643 slow_path:
644 	left = skb->len - hlen;		/* Space per frame */
645 	ptr = hlen;			/* Where to start from */
646 
647 	/*
648 	 *	Fragment the datagram.
649 	 */
650 
651 	*prevhdr = NEXTHDR_FRAGMENT;
652 
653 	/*
654 	 *	Keep copying data until we run out.
655 	 */
656 	while(left > 0)	{
657 		len = left;
658 		/* IF: it doesn't fit, use 'mtu' - the data space left */
659 		if (len > mtu)
660 			len = mtu;
661 		/* IF: we are not sending up to and including the end of the
662 		   packet, then align the next start on an eight-byte boundary */
663 		if (len < left)	{
664 			len &= ~7;
665 		}
666 		/*
667 		 *	Allocate buffer.
668 		 */
669 
670 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
671 			NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
672 			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
673 			err = -ENOMEM;
674 			goto fail;
675 		}
676 
677 		/*
678 		 *	Set up data on packet
679 		 */
680 
681 		ip6_copy_metadata(frag, skb);
682 		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
683 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
684 		frag->nh.raw = frag->data;
685 		fh = (struct frag_hdr*)(frag->data + hlen);
686 		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
687 
688 		/*
689 		 *	Charge the memory for the fragment to any owner
690 		 *	it might possess
691 		 */
692 		if (skb->sk)
693 			skb_set_owner_w(frag, skb->sk);
694 
695 		/*
696 		 *	Copy the packet header into the new buffer.
697 		 */
698 		memcpy(frag->nh.raw, skb->data, hlen);
699 
700 		/*
701 		 *	Build fragment header.
702 		 */
703 		fh->nexthdr = nexthdr;
704 		fh->reserved = 0;
705 		if (!frag_id) {
706 			ipv6_select_ident(skb, fh);
707 			frag_id = fh->identification;
708 		} else
709 			fh->identification = frag_id;
710 
711 		/*
712 		 *	Copy a block of the IP datagram.
713 		 */
714 		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
715 			BUG();
716 		left -= len;
717 
718 		fh->frag_off = htons(offset);
719 		if (left > 0)
720 			fh->frag_off |= htons(IP6_MF);
721 		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
722 
723 		ptr += len;
724 		offset += len;
725 
726 		/*
727 		 *	Put this fragment into the sending queue.
728 		 */
729 
730 		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
731 
732 		err = output(frag);
733 		if (err)
734 			goto fail;
735 	}
736 	kfree_skb(skb);
737 	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
738 	return err;
739 
740 fail:
741 	kfree_skb(skb);
742 	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
743 	return err;
744 }
745 
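/*
 * Resolve the destination for a flow: a route cached on the socket is
 * reused only if it is still valid for this flow (same destination for host
 * routes or for the remembered daddr_cache, and same outgoing interface);
 * otherwise a fresh ip6_route_output() lookup is done.  If the flow has no
 * source address yet, one is selected to match the chosen route.
 */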
746 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
747 {
748 	int err = 0;
749 
750 	*dst = NULL;
751 	if (sk) {
752 		struct ipv6_pinfo *np = inet6_sk(sk);
753 
754 		*dst = sk_dst_check(sk, np->dst_cookie);
755 		if (*dst) {
756 			struct rt6_info *rt = (struct rt6_info*)*dst;
757 
758 				/* Yes, checking route validity in the
759 				   unconnected case is not very simple. Take
760 				   into account that we do not support routing
761 				   by source, TOS, or MSG_DONTROUTE --ANK (980726)
762 
763 				   1. If the route was a host route, check that
764 				      the cached destination is current.
765 				      If it is a network route, we may still
766 				      check its validity using the saved pointer
767 				      to the last used address: daddr_cache.
768 				      We do not want to save the whole address now
769 				      (because the main consumer of this service
770 				       is TCP, which does not have this problem),
771 				      so this trick works only on connected
772 				      sockets.
773 				   2. oif should also be the same.
774 				 */
775 
776 			if (((rt->rt6i_dst.plen != 128 ||
777 			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
778 			     && (np->daddr_cache == NULL ||
779 				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
780 			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
781 				dst_release(*dst);
782 				*dst = NULL;
783 			}
784 		}
785 	}
786 
787 	if (*dst == NULL)
788 		*dst = ip6_route_output(sk, fl);
789 
790 	if ((err = (*dst)->error))
791 		goto out_err_release;
792 
793 	if (ipv6_addr_any(&fl->fl6_src)) {
794 		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
795 
796 		if (err) {
797 #if IP6_DEBUG >= 2
798 			printk(KERN_DEBUG "ip6_dst_lookup: "
799 			       "no available source address\n");
800 #endif
801 			goto out_err_release;
802 		}
803 	}
804 
805 	return 0;
806 
807 out_err_release:
808 	dst_release(*dst);
809 	*dst = NULL;
810 	return err;
811 }
812 
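/*
 * Corked transmission: the first call on an empty write queue copies the
 * options, pins the route and records hop limit and fragment size in the
 * cork state; later calls (while the socket stays corked or MSG_MORE is
 * set) reuse that state and only append data.  Payload is accumulated on
 * sk_write_queue in MTU-sized pieces, each with room reserved for a
 * fragment header, and is finally turned into one packet (or a fragment
 * chain) by ip6_push_pending_frames().
 */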
813 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
814 		    void *from, int length, int transhdrlen,
815 		    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
816 		    unsigned int flags)
817 {
818 	struct inet_sock *inet = inet_sk(sk);
819 	struct ipv6_pinfo *np = inet6_sk(sk);
820 	struct sk_buff *skb;
821 	unsigned int maxfraglen, fragheaderlen;
822 	int exthdrlen;
823 	int hh_len;
824 	int mtu;
825 	int copy;
826 	int err;
827 	int offset = 0;
828 	int csummode = CHECKSUM_NONE;
829 
830 	if (flags&MSG_PROBE)
831 		return 0;
832 	if (skb_queue_empty(&sk->sk_write_queue)) {
833 		/*
834 		 * setup for corking
835 		 */
836 		if (opt) {
837 			if (np->cork.opt == NULL) {
838 				np->cork.opt = kmalloc(opt->tot_len,
839 						       sk->sk_allocation);
840 				if (unlikely(np->cork.opt == NULL))
841 					return -ENOBUFS;
842 			} else if (np->cork.opt->tot_len < opt->tot_len) {
843 				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
844 				return -EINVAL;
845 			}
846 			memcpy(np->cork.opt, opt, opt->tot_len);
847 			inet->cork.flags |= IPCORK_OPT;
848 			/* need source address above miyazawa*/
849 		}
850 		dst_hold(&rt->u.dst);
851 		np->cork.rt = rt;
852 		inet->cork.fl = *fl;
853 		np->cork.hop_limit = hlimit;
854 		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
855 		if (dst_allfrag(rt->u.dst.path))
856 			inet->cork.flags |= IPCORK_ALLFRAG;
857 		inet->cork.length = 0;
858 		sk->sk_sndmsg_page = NULL;
859 		sk->sk_sndmsg_off = 0;
860 		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
861 		length += exthdrlen;
862 		transhdrlen += exthdrlen;
863 	} else {
864 		rt = np->cork.rt;
865 		fl = &inet->cork.fl;
866 		if (inet->cork.flags & IPCORK_OPT)
867 			opt = np->cork.opt;
868 		transhdrlen = 0;
869 		exthdrlen = 0;
870 		mtu = inet->cork.fragsize;
871 	}
872 
873 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
874 
875 	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
876 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
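	/*
	 * Example of the sizing above (illustrative numbers: mtu 1500 and
	 * no destination options, so fragheaderlen = 40): maxfraglen =
	 * ((1500 - 40) & ~7) + 40 - 8 = 1488, i.e. each queued buffer holds
	 * a 40-byte IPv6 header plus up to 1448 bytes of data, a multiple
	 * of 8, and still fits the MTU (1496 bytes) once ip6_fragment()
	 * inserts the 8-byte fragment header.
	 */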
877 
878 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
879 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
880 			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
881 			return -EMSGSIZE;
882 		}
883 	}
884 
885 	/*
886 	 * Let's try using as much space as possible.
887 	 * Use MTU if total length of the message fits into the MTU.
888 	 * Otherwise, we need to reserve fragment header and
889 	 * fragment alignment (= 8-15 octets, in total).
890 	 *
891 	 * Note that we may need to "move" the data from the tail
892 	 * of the buffer to the new fragment when we split
893 	 * the message.
894 	 *
895 	 * FIXME: It may be fragmented into multiple chunks
896 	 *        at once if non-fragmentable extension headers
897 	 *        are too large.
898 	 * --yoshfuji
899 	 */
900 
901 	inet->cork.length += length;
902 
903 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
904 		goto alloc_new_skb;
905 
906 	while (length > 0) {
907 		/* Check if the remaining data fits into current packet. */
908 		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
909 		if (copy < length)
910 			copy = maxfraglen - skb->len;
911 
912 		if (copy <= 0) {
913 			char *data;
914 			unsigned int datalen;
915 			unsigned int fraglen;
916 			unsigned int fraggap;
917 			unsigned int alloclen;
918 			struct sk_buff *skb_prev;
919 alloc_new_skb:
920 			skb_prev = skb;
921 
922 			/* There's no room in the current skb */
923 			if (skb_prev)
924 				fraggap = skb_prev->len - maxfraglen;
925 			else
926 				fraggap = 0;
927 
928 			/*
929 			 * If remaining data exceeds the mtu,
930 			 * we know we need more fragment(s).
931 			 */
932 			datalen = length + fraggap;
933 			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
934 				datalen = maxfraglen - fragheaderlen;
935 
936 			fraglen = datalen + fragheaderlen;
937 			if ((flags & MSG_MORE) &&
938 			    !(rt->u.dst.dev->features&NETIF_F_SG))
939 				alloclen = mtu;
940 			else
941 				alloclen = datalen + fragheaderlen;
942 
943 			/*
944 			 * The last fragment gets additional space at tail.
945 			 * Note: we overallocate on fragments with MSG_MORE
946 			 * because we have no idea if we're the last one.
947 			 */
948 			if (datalen == length + fraggap)
949 				alloclen += rt->u.dst.trailer_len;
950 
951 			/*
952 			 * We just reserve space for fragment header.
953 			 * Note: this may be overallocation if the message
954 			 * (without MSG_MORE) fits into the MTU.
955 			 */
956 			alloclen += sizeof(struct frag_hdr);
957 
958 			if (transhdrlen) {
959 				skb = sock_alloc_send_skb(sk,
960 						alloclen + hh_len,
961 						(flags & MSG_DONTWAIT), &err);
962 			} else {
963 				skb = NULL;
964 				if (atomic_read(&sk->sk_wmem_alloc) <=
965 				    2 * sk->sk_sndbuf)
966 					skb = sock_wmalloc(sk,
967 							   alloclen + hh_len, 1,
968 							   sk->sk_allocation);
969 				if (unlikely(skb == NULL))
970 					err = -ENOBUFS;
971 			}
972 			if (skb == NULL)
973 				goto error;
974 			/*
975 			 *	Fill in the control structures
976 			 */
977 			skb->ip_summed = csummode;
978 			skb->csum = 0;
979 			/* reserve for fragmentation */
980 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
981 
982 			/*
983 			 *	Find where to start putting bytes
984 			 */
985 			data = skb_put(skb, fraglen);
986 			skb->nh.raw = data + exthdrlen;
987 			data += fragheaderlen;
988 			skb->h.raw = data + exthdrlen;
989 
990 			if (fraggap) {
991 				skb->csum = skb_copy_and_csum_bits(
992 					skb_prev, maxfraglen,
993 					data + transhdrlen, fraggap, 0);
994 				skb_prev->csum = csum_sub(skb_prev->csum,
995 							  skb->csum);
996 				data += fraggap;
997 				skb_trim(skb_prev, maxfraglen);
998 			}
999 			copy = datalen - transhdrlen - fraggap;
1000 			if (copy < 0) {
1001 				err = -EINVAL;
1002 				kfree_skb(skb);
1003 				goto error;
1004 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1005 				err = -EFAULT;
1006 				kfree_skb(skb);
1007 				goto error;
1008 			}
1009 
1010 			offset += copy;
1011 			length -= datalen - fraggap;
1012 			transhdrlen = 0;
1013 			exthdrlen = 0;
1014 			csummode = CHECKSUM_NONE;
1015 
1016 			/*
1017 			 * Put the packet on the pending queue
1018 			 */
1019 			__skb_queue_tail(&sk->sk_write_queue, skb);
1020 			continue;
1021 		}
1022 
1023 		if (copy > length)
1024 			copy = length;
1025 
1026 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1027 			unsigned int off;
1028 
1029 			off = skb->len;
1030 			if (getfrag(from, skb_put(skb, copy),
1031 						offset, copy, off, skb) < 0) {
1032 				__skb_trim(skb, off);
1033 				err = -EFAULT;
1034 				goto error;
1035 			}
1036 		} else {
1037 			int i = skb_shinfo(skb)->nr_frags;
1038 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1039 			struct page *page = sk->sk_sndmsg_page;
1040 			int off = sk->sk_sndmsg_off;
1041 			unsigned int left;
1042 
1043 			if (page && (left = PAGE_SIZE - off) > 0) {
1044 				if (copy >= left)
1045 					copy = left;
1046 				if (page != frag->page) {
1047 					if (i == MAX_SKB_FRAGS) {
1048 						err = -EMSGSIZE;
1049 						goto error;
1050 					}
1051 					get_page(page);
1052 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1053 					frag = &skb_shinfo(skb)->frags[i];
1054 				}
1055 			} else if (i < MAX_SKB_FRAGS) {
1056 				if (copy > PAGE_SIZE)
1057 					copy = PAGE_SIZE;
1058 				page = alloc_pages(sk->sk_allocation, 0);
1059 				if (page == NULL) {
1060 					err = -ENOMEM;
1061 					goto error;
1062 				}
1063 				sk->sk_sndmsg_page = page;
1064 				sk->sk_sndmsg_off = 0;
1065 
1066 				skb_fill_page_desc(skb, i, page, 0, 0);
1067 				frag = &skb_shinfo(skb)->frags[i];
1068 				skb->truesize += PAGE_SIZE;
1069 				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1070 			} else {
1071 				err = -EMSGSIZE;
1072 				goto error;
1073 			}
1074 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1075 				err = -EFAULT;
1076 				goto error;
1077 			}
1078 			sk->sk_sndmsg_off += copy;
1079 			frag->size += copy;
1080 			skb->len += copy;
1081 			skb->data_len += copy;
1082 		}
1083 		offset += copy;
1084 		length -= copy;
1085 	}
1086 	return 0;
1087 error:
1088 	inet->cork.length -= length;
1089 	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1090 	return err;
1091 }
1092 
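/*
 * Turn the corked write queue into a single packet: the first skb becomes
 * the head and all later ones are chained onto its frag_list, then the
 * extension headers and the IPv6 header are pushed in front and the result
 * is handed to dst_output() through the NF_IP6_LOCAL_OUT hook.  payload_len
 * is left at zero when the total length does not fit in the 16-bit Payload
 * Length field.  The cork state is released on both the success and the
 * error path.
 */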
1093 int ip6_push_pending_frames(struct sock *sk)
1094 {
1095 	struct sk_buff *skb, *tmp_skb;
1096 	struct sk_buff **tail_skb;
1097 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1098 	struct inet_sock *inet = inet_sk(sk);
1099 	struct ipv6_pinfo *np = inet6_sk(sk);
1100 	struct ipv6hdr *hdr;
1101 	struct ipv6_txoptions *opt = np->cork.opt;
1102 	struct rt6_info *rt = np->cork.rt;
1103 	struct flowi *fl = &inet->cork.fl;
1104 	unsigned char proto = fl->proto;
1105 	int err = 0;
1106 
1107 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1108 		goto out;
1109 	tail_skb = &(skb_shinfo(skb)->frag_list);
1110 
1111 	/* move skb->data to ip header from ext header */
1112 	if (skb->data < skb->nh.raw)
1113 		__skb_pull(skb, skb->nh.raw - skb->data);
1114 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1115 		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1116 		*tail_skb = tmp_skb;
1117 		tail_skb = &(tmp_skb->next);
1118 		skb->len += tmp_skb->len;
1119 		skb->data_len += tmp_skb->len;
1120 		skb->truesize += tmp_skb->truesize;
1121 		__sock_put(tmp_skb->sk);
1122 		tmp_skb->destructor = NULL;
1123 		tmp_skb->sk = NULL;
1124 	}
1125 
1126 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
1127 	__skb_pull(skb, skb->h.raw - skb->nh.raw);
1128 	if (opt && opt->opt_flen)
1129 		ipv6_push_frag_opts(skb, opt, &proto);
1130 	if (opt && opt->opt_nflen)
1131 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1132 
1133 	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1134 
1135 	*(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
1136 
1137 	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1138 		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1139 	else
1140 		hdr->payload_len = 0;
1141 	hdr->hop_limit = np->cork.hop_limit;
1142 	hdr->nexthdr = proto;
1143 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1144 	ipv6_addr_copy(&hdr->daddr, final_dst);
1145 
1146 	skb->dst = dst_clone(&rt->u.dst);
1147 	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1148 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1149 	if (err) {
1150 		if (err > 0)
1151 			err = np->recverr ? net_xmit_errno(err) : 0;
1152 		if (err)
1153 			goto error;
1154 	}
1155 
1156 out:
1157 	inet->cork.flags &= ~IPCORK_OPT;
1158 	if (np->cork.opt) {
1159 		kfree(np->cork.opt);
1160 		np->cork.opt = NULL;
1161 	}
1162 	if (np->cork.rt) {
1163 		dst_release(&np->cork.rt->u.dst);
1164 		np->cork.rt = NULL;
1165 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1166 	}
1167 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1168 	return err;
1169 error:
1170 	goto out;
1171 }
1172 
1173 void ip6_flush_pending_frames(struct sock *sk)
1174 {
1175 	struct inet_sock *inet = inet_sk(sk);
1176 	struct ipv6_pinfo *np = inet6_sk(sk);
1177 	struct sk_buff *skb;
1178 
1179 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1180 		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1181 		kfree_skb(skb);
1182 	}
1183 
1184 	inet->cork.flags &= ~IPCORK_OPT;
1185 
1186 	if (np->cork.opt) {
1187 		kfree(np->cork.opt);
1188 		np->cork.opt = NULL;
1189 	}
1190 	if (np->cork.rt) {
1191 		dst_release(&np->cork.rt->u.dst);
1192 		np->cork.rt = NULL;
1193 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1194 	}
1195 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1196 }
1197