xref: /linux/net/ipv6/ip6_output.c (revision 54a8a2220c936a47840c9a3d74910c5a56fae2ed)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *	Based on linux/net/ipv4/ip_output.c
11  *
12  *	This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *	Changes:
18  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
19  *				extension headers are implemented.
20  *				route changes now work.
21  *				ip6_forward does not confuse sniffers.
22  *				etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *	Imran Patel	: 	frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *			:       add ip6_append_data and related functions
28  *				for datagram xmit
29  */
30 
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42 
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
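/*
 *	Assign the next IPv6 fragmentation ID from a single global counter
 *	protected by a spinlock; the counter wraps around but never hands
 *	out zero.
 */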
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 {
63 	static u32 ipv6_fragmentation_id = 1;
64 	static DEFINE_SPINLOCK(ip6_id_lock);
65 
66 	spin_lock_bh(&ip6_id_lock);
67 	fhdr->identification = htonl(ipv6_fragmentation_id);
68 	if (++ipv6_fragmentation_id == 0)
69 		ipv6_fragmentation_id = 1;
70 	spin_unlock_bh(&ip6_id_lock);
71 }
72 
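/*
 *	Final transmission step: prepend the cached hardware header if the
 *	dst has one, otherwise hand the packet to the neighbour output
 *	function; with neither available the packet is counted and dropped.
 */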
73 static inline int ip6_output_finish(struct sk_buff *skb)
74 {
75 
76 	struct dst_entry *dst = skb->dst;
77 	struct hh_cache *hh = dst->hh;
78 
79 	if (hh) {
80 		int hh_alen;
81 
82 		read_lock_bh(&hh->hh_lock);
83 		hh_alen = HH_DATA_ALIGN(hh->hh_len);
84 		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85 		read_unlock_bh(&hh->hh_lock);
86 		skb_push(skb, hh->hh_len);
87 		return hh->hh_output(skb);
88 	} else if (dst->neighbour)
89 		return dst->neighbour->output(skb);
90 
91 	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92 	kfree_skb(skb);
93 	return -EINVAL;
94 
95 }
96 
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100 	newskb->mac.raw = newskb->data;
101 	__skb_pull(newskb, newskb->nh.raw - newskb->data);
102 	newskb->pkt_type = PACKET_LOOPBACK;
103 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
104 	BUG_TRAP(newskb->dst);
105 
106 	netif_rx(newskb);
107 	return 0;
108 }
109 
110 
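/*
 *	Send one packet on the device attached to its dst.  Multicast
 *	packets are looped back to local listeners when mc_loop allows it,
 *	then everything passes through the NF_IP6_POST_ROUTING hook into
 *	ip6_output_finish().
 */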
111 static int ip6_output2(struct sk_buff *skb)
112 {
113 	struct dst_entry *dst = skb->dst;
114 	struct net_device *dev = dst->dev;
115 
116 	skb->protocol = htons(ETH_P_IPV6);
117 	skb->dev = dev;
118 
119 	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120 		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121 
122 		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123 		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124 				&skb->nh.ipv6h->saddr)) {
125 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126 
127 			/* Do not check for IFF_ALLMULTI; multicast routing
128 			   is not supported in any case.
129 			 */
130 			if (newskb)
131 				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132 					newskb->dev,
133 					ip6_dev_loopback_xmit);
134 
135 			if (skb->nh.ipv6h->hop_limit == 0) {
136 				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137 				kfree_skb(skb);
138 				return 0;
139 			}
140 		}
141 
142 		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143 	}
144 
145 	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev, ip6_output_finish);
146 }
147 
148 int ip6_output(struct sk_buff *skb)
149 {
150 	if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
151 		return ip6_fragment(skb, ip6_output2);
152 	else
153 		return ip6_output2(skb);
154 }
155 
156 /*
157  *	xmit an sk_buff (used by TCP)
158  */
159 
160 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
161 	     struct ipv6_txoptions *opt, int ipfragok)
162 {
163 	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
164 	struct in6_addr *first_hop = &fl->fl6_dst;
165 	struct dst_entry *dst = skb->dst;
166 	struct ipv6hdr *hdr;
167 	u8  proto = fl->proto;
168 	int seg_len = skb->len;
169 	int hlimit, tclass;
170 	u32 mtu;
171 
172 	if (opt) {
173 		int head_room;
174 
175 		/* First: exthdrs may take lots of space (~8K for now);
176 		   MAX_HEADER is not enough.
177 		 */
178 		head_room = opt->opt_nflen + opt->opt_flen;
179 		seg_len += head_room;
180 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
181 
182 		if (skb_headroom(skb) < head_room) {
183 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
184 			kfree_skb(skb);
185 			skb = skb2;
186 			if (skb == NULL) {
187 				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
188 				return -ENOBUFS;
189 			}
190 			if (sk)
191 				skb_set_owner_w(skb, sk);
192 		}
193 		if (opt->opt_flen)
194 			ipv6_push_frag_opts(skb, opt, &proto);
195 		if (opt->opt_nflen)
196 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
197 	}
198 
199 	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
200 
201 	/*
202 	 *	Fill in the IPv6 header
203 	 */
204 
205 	hlimit = -1;
206 	if (np)
207 		hlimit = np->hop_limit;
208 	if (hlimit < 0)
209 		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
210 	if (hlimit < 0)
211 		hlimit = ipv6_get_hoplimit(dst->dev);
212 
213 	tclass = -1;
214 	if (np)
215 		tclass = np->tclass;
216 	if (tclass < 0)
217 		tclass = 0;
218 
219 	*(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
220 
221 	hdr->payload_len = htons(seg_len);
222 	hdr->nexthdr = proto;
223 	hdr->hop_limit = hlimit;
224 
225 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
226 	ipv6_addr_copy(&hdr->daddr, first_hop);
227 
228 	mtu = dst_mtu(dst);
229 	if ((skb->len <= mtu) || ipfragok) {
230 		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
231 		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
232 				dst_output);
233 	}
234 
235 	if (net_ratelimit())
236 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
237 	skb->dev = dst->dev;
238 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
239 	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
240 	kfree_skb(skb);
241 	return -EMSGSIZE;
242 }
243 
244 /*
245  *	To avoid extra problems ND packets are sent through this
246  *	routine. It's code duplication, but I really want to avoid
247  *	extra checks since ipv6_build_header is used by TCP (which
248  *	is performance critical for us).
249  */
250 
251 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
252 	       struct in6_addr *saddr, struct in6_addr *daddr,
253 	       int proto, int len)
254 {
255 	struct ipv6_pinfo *np = inet6_sk(sk);
256 	struct ipv6hdr *hdr;
257 	int totlen;
258 
259 	skb->protocol = htons(ETH_P_IPV6);
260 	skb->dev = dev;
261 
262 	totlen = len + sizeof(struct ipv6hdr);
263 
264 	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
265 	skb->nh.ipv6h = hdr;
266 
267 	*(u32*)hdr = htonl(0x60000000);
268 
269 	hdr->payload_len = htons(len);
270 	hdr->nexthdr = proto;
271 	hdr->hop_limit = np->hop_limit;
272 
273 	ipv6_addr_copy(&hdr->saddr, saddr);
274 	ipv6_addr_copy(&hdr->daddr, daddr);
275 
276 	return 0;
277 }
278 
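/*
 *	Deliver a packet carrying a Router Alert option to every raw socket
 *	registered for that alert value; returns 1 if at least one listener
 *	consumed the packet.
 */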
279 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
280 {
281 	struct ip6_ra_chain *ra;
282 	struct sock *last = NULL;
283 
284 	read_lock(&ip6_ra_lock);
285 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
286 		struct sock *sk = ra->sk;
287 		if (sk && ra->sel == sel &&
288 		    (!sk->sk_bound_dev_if ||
289 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
290 			if (last) {
291 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
292 				if (skb2)
293 					rawv6_rcv(last, skb2);
294 			}
295 			last = sk;
296 		}
297 	}
298 
299 	if (last) {
300 		rawv6_rcv(last, skb);
301 		read_unlock(&ip6_ra_lock);
302 		return 1;
303 	}
304 	read_unlock(&ip6_ra_lock);
305 	return 0;
306 }
307 
308 static inline int ip6_forward_finish(struct sk_buff *skb)
309 {
310 	return dst_output(skb);
311 }
312 
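/*
 *	Forward a packet that is not addressed to this host: run the policy
 *	checks, hand Router Alert packets to interested raw sockets, check
 *	and decrement the hop limit, emit redirects or ICMP errors where
 *	needed, and finally pass the packet to the NF_IP6_FORWARD hook.
 */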
313 int ip6_forward(struct sk_buff *skb)
314 {
315 	struct dst_entry *dst = skb->dst;
316 	struct ipv6hdr *hdr = skb->nh.ipv6h;
317 	struct inet6_skb_parm *opt = IP6CB(skb);
318 
319 	if (ipv6_devconf.forwarding == 0)
320 		goto error;
321 
322 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
323 		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
324 		goto drop;
325 	}
326 
327 	skb->ip_summed = CHECKSUM_NONE;
328 
329 	/*
330 	 *	We DO NOT do any processing on
331 	 *	RA packets, pushing them to user level AS IS
332 	 *	without any WARRANTY that the application will be able
333 	 *	to interpret them. The reason is that we
334 	 *	cannot do anything clever here.
335 	 *
336 	 *	We are not the end node, so if the packet contains
337 	 *	AH/ESP we cannot do anything.
338 	 *	Defragmentation would also be a mistake; RA packets
339 	 *	cannot be fragmented, because there is no warranty
340 	 *	that different fragments will go along one path. --ANK
341 	 */
342 	if (opt->ra) {
343 		u8 *ptr = skb->nh.raw + opt->ra;
344 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
345 			return 0;
346 	}
347 
348 	/*
349 	 *	check and decrement hop limit
350 	 */
351 	if (hdr->hop_limit <= 1) {
352 		/* Force the OUTPUT device to be used for the source address */
353 		skb->dev = dst->dev;
354 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
355 			    0, skb->dev);
356 
357 		kfree_skb(skb);
358 		return -ETIMEDOUT;
359 	}
360 
361 	if (!xfrm6_route_forward(skb)) {
362 		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
363 		goto drop;
364 	}
365 	dst = skb->dst;
366 
367 	/* IPv6 specs say nothing about it, but it is clear that we cannot
368 	   send redirects for source-routed frames.
369 	 */
370 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
371 		struct in6_addr *target = NULL;
372 		struct rt6_info *rt;
373 		struct neighbour *n = dst->neighbour;
374 
375 		/*
376 		 *	incoming and outgoing devices are the same;
377 		 *	send a redirect.
378 		 */
379 
380 		rt = (struct rt6_info *) dst;
381 		if ((rt->rt6i_flags & RTF_GATEWAY))
382 			target = (struct in6_addr*)&n->primary_key;
383 		else
384 			target = &hdr->daddr;
385 
386 		/* Limit redirects both by destination (here)
387 		   and by source (inside ndisc_send_redirect)
388 		 */
389 		if (xrlim_allow(dst, 1*HZ))
390 			ndisc_send_redirect(skb, n, target);
391 	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
392 						|IPV6_ADDR_LINKLOCAL)) {
393 		/* This check is security critical. */
394 		goto error;
395 	}
396 
397 	if (skb->len > dst_mtu(dst)) {
398 		/* Again, force the OUTPUT device to be used for the source address */
399 		skb->dev = dst->dev;
400 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
401 		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
402 		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
403 		kfree_skb(skb);
404 		return -EMSGSIZE;
405 	}
406 
407 	if (skb_cow(skb, dst->dev->hard_header_len)) {
408 		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
409 		goto drop;
410 	}
411 
412 	hdr = skb->nh.ipv6h;
413 
414 	/* Mangling the hop limit is delayed until after the skb COW */
415 
416 	hdr->hop_limit--;
417 
418 	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
419 	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
420 
421 error:
422 	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
423 drop:
424 	kfree_skb(skb);
425 	return -EINVAL;
426 }
427 
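/*
 *	Copy per-packet metadata (type, priority, dst, device, traffic
 *	control and netfilter state) from the original packet to a freshly
 *	built fragment.
 */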
428 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
429 {
430 	to->pkt_type = from->pkt_type;
431 	to->priority = from->priority;
432 	to->protocol = from->protocol;
433 	dst_release(to->dst);
434 	to->dst = dst_clone(from->dst);
435 	to->dev = from->dev;
436 
437 #ifdef CONFIG_NET_SCHED
438 	to->tc_index = from->tc_index;
439 #endif
440 #ifdef CONFIG_NETFILTER
441 	to->nfmark = from->nfmark;
442 	/* Connection association is the same as for the pre-frag packet */
443 	to->nfct = from->nfct;
444 	nf_conntrack_get(to->nfct);
445 	to->nfctinfo = from->nfctinfo;
446 #ifdef CONFIG_BRIDGE_NETFILTER
447 	nf_bridge_put(to->nf_bridge);
448 	to->nf_bridge = from->nf_bridge;
449 	nf_bridge_get(to->nf_bridge);
450 #endif
451 #endif
452 }
453 
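/*
 *	Walk the unfragmentable extension headers and return the offset at
 *	which a Fragment header has to be inserted; *nexthdr is left
 *	pointing at the nexthdr field the caller rewrites to
 *	NEXTHDR_FRAGMENT.
 */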
454 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
455 {
456 	u16 offset = sizeof(struct ipv6hdr);
457 	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
458 	unsigned int packet_len = skb->tail - skb->nh.raw;
459 	int found_rhdr = 0;
460 	*nexthdr = &skb->nh.ipv6h->nexthdr;
461 
462 	while (offset + 1 <= packet_len) {
463 
464 		switch (**nexthdr) {
465 
466 		case NEXTHDR_HOP:
467 		case NEXTHDR_ROUTING:
468 		case NEXTHDR_DEST:
469 			if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
470 			if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
471 			offset += ipv6_optlen(exthdr);
472 			*nexthdr = &exthdr->nexthdr;
473 			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
474 			break;
475 		default:
476 			return offset;
477 		}
478 	}
479 
480 	return offset;
481 }
482 
483 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
484 {
485 	struct net_device *dev;
486 	struct sk_buff *frag;
487 	struct rt6_info *rt = (struct rt6_info*)skb->dst;
488 	struct ipv6hdr *tmp_hdr;
489 	struct frag_hdr *fh;
490 	unsigned int mtu, hlen, left, len;
491 	u32 frag_id = 0;
492 	int ptr, offset = 0, err=0;
493 	u8 *prevhdr, nexthdr = 0;
494 
495 	dev = rt->u.dst.dev;
496 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
497 	nexthdr = *prevhdr;
498 
499 	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
500 
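	/* Fast path: the packet already carries a frag_list of properly
	 * sized chunks, so prepend the unfragmentable headers plus a
	 * Fragment header to each chunk instead of copying the payload.
	 */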
501 	if (skb_shinfo(skb)->frag_list) {
502 		int first_len = skb_pagelen(skb);
503 
504 		if (first_len - hlen > mtu ||
505 		    ((first_len - hlen) & 7) ||
506 		    skb_cloned(skb))
507 			goto slow_path;
508 
509 		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
510 			/* Correct geometry. */
511 			if (frag->len > mtu ||
512 			    ((frag->len & 7) && frag->next) ||
513 			    skb_headroom(frag) < hlen)
514 			    goto slow_path;
515 
516 			/* Partially cloned skb? */
517 			if (skb_shared(frag))
518 				goto slow_path;
519 
520 			BUG_ON(frag->sk);
521 			if (skb->sk) {
522 				sock_hold(skb->sk);
523 				frag->sk = skb->sk;
524 				frag->destructor = sock_wfree;
525 				skb->truesize -= frag->truesize;
526 			}
527 		}
528 
529 		err = 0;
530 		offset = 0;
531 		frag = skb_shinfo(skb)->frag_list;
532 		skb_shinfo(skb)->frag_list = NULL;
533 		/* BUILD HEADER */
534 
535 		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
536 		if (!tmp_hdr) {
537 			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
538 			return -ENOMEM;
539 		}
540 
541 		*prevhdr = NEXTHDR_FRAGMENT;
542 		memcpy(tmp_hdr, skb->nh.raw, hlen);
543 		__skb_pull(skb, hlen);
544 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
545 		skb->nh.raw = __skb_push(skb, hlen);
546 		memcpy(skb->nh.raw, tmp_hdr, hlen);
547 
548 		ipv6_select_ident(skb, fh);
549 		fh->nexthdr = nexthdr;
550 		fh->reserved = 0;
551 		fh->frag_off = htons(IP6_MF);
552 		frag_id = fh->identification;
553 
554 		first_len = skb_pagelen(skb);
555 		skb->data_len = first_len - skb_headlen(skb);
556 		skb->len = first_len;
557 		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
558 
559 
560 		for (;;) {
561 			/* Prepare header of the next frame,
562 			 * before the previous one goes down. */
563 			if (frag) {
564 				frag->ip_summed = CHECKSUM_NONE;
565 				frag->h.raw = frag->data;
566 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
567 				frag->nh.raw = __skb_push(frag, hlen);
568 				memcpy(frag->nh.raw, tmp_hdr, hlen);
569 				offset += skb->len - hlen - sizeof(struct frag_hdr);
570 				fh->nexthdr = nexthdr;
571 				fh->reserved = 0;
572 				fh->frag_off = htons(offset);
573 				if (frag->next != NULL)
574 					fh->frag_off |= htons(IP6_MF);
575 				fh->identification = frag_id;
576 				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
577 				ip6_copy_metadata(frag, skb);
578 			}
579 
580 			err = output(skb);
581 			if (err || !frag)
582 				break;
583 
584 			skb = frag;
585 			frag = skb->next;
586 			skb->next = NULL;
587 		}
588 
589 		if (tmp_hdr)
590 			kfree(tmp_hdr);
591 
592 		if (err == 0) {
593 			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
594 			return 0;
595 		}
596 
597 		while (frag) {
598 			skb = frag->next;
599 			kfree_skb(frag);
600 			frag = skb;
601 		}
602 
603 		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
604 		return err;
605 	}
606 
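	/* Slow path: allocate a new skb for every fragment and copy the
	 * payload into it piece by piece.
	 */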
607 slow_path:
608 	left = skb->len - hlen;		/* Space per frame */
609 	ptr = hlen;			/* Where to start from */
610 
611 	/*
612 	 *	Fragment the datagram.
613 	 */
614 
615 	*prevhdr = NEXTHDR_FRAGMENT;
616 
617 	/*
618 	 *	Keep copying data until we run out.
619 	 */
620 	while (left > 0) {
621 		len = left;
622 		/* IF: it doesn't fit, use 'mtu' - the data space left */
623 		if (len > mtu)
624 			len = mtu;
625 		/* IF: we are not sending up to and including the packet end
626 		   then align the next start on an eight-byte boundary */
627 		if (len < left)	{
628 			len &= ~7;
629 		}
630 		/*
631 		 *	Allocate buffer.
632 		 */
633 
634 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
635 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
636 			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
637 			err = -ENOMEM;
638 			goto fail;
639 		}
640 
641 		/*
642 		 *	Set up data on packet
643 		 */
644 
645 		ip6_copy_metadata(frag, skb);
646 		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
647 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
648 		frag->nh.raw = frag->data;
649 		fh = (struct frag_hdr*)(frag->data + hlen);
650 		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
651 
652 		/*
653 		 *	Charge the memory for the fragment to any owner
654 		 *	it might possess
655 		 */
656 		if (skb->sk)
657 			skb_set_owner_w(frag, skb->sk);
658 
659 		/*
660 		 *	Copy the packet header into the new buffer.
661 		 */
662 		memcpy(frag->nh.raw, skb->data, hlen);
663 
664 		/*
665 		 *	Build fragment header.
666 		 */
667 		fh->nexthdr = nexthdr;
668 		fh->reserved = 0;
669 		if (!frag_id) {
670 			ipv6_select_ident(skb, fh);
671 			frag_id = fh->identification;
672 		} else
673 			fh->identification = frag_id;
674 
675 		/*
676 		 *	Copy a block of the IP datagram.
677 		 */
678 		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
679 			BUG();
680 		left -= len;
681 
682 		fh->frag_off = htons(offset);
683 		if (left > 0)
684 			fh->frag_off |= htons(IP6_MF);
685 		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
686 
687 		ptr += len;
688 		offset += len;
689 
690 		/*
691 		 *	Put this fragment into the sending queue.
692 		 */
693 
694 		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
695 
696 		err = output(frag);
697 		if (err)
698 			goto fail;
699 	}
700 	kfree_skb(skb);
701 	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
702 	return err;
703 
704 fail:
705 	kfree_skb(skb);
706 	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
707 	return err;
708 }
709 
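/*
 *	Look up the dst entry for a flow.  A socket's cached route is reused
 *	when it is still valid for this destination and outgoing interface;
 *	otherwise a fresh route is taken from ip6_route_output().  A source
 *	address is selected if the flow left it unspecified.
 */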
710 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
711 {
712 	int err = 0;
713 
714 	*dst = NULL;
715 	if (sk) {
716 		struct ipv6_pinfo *np = inet6_sk(sk);
717 
718 		*dst = sk_dst_check(sk, np->dst_cookie);
719 		if (*dst) {
720 			struct rt6_info *rt = (struct rt6_info*)*dst;
721 
722 				/* Yes, checking route validity in the
723 				   unconnected case is not simple. Take into
724 				   account that we do not support routing by
725 				   source, TOS, or MSG_DONTROUTE 	--ANK (980726)
726 
727 				   1. If the route was a host route, check that
728 				      the cached destination is still current.
729 				      If it is a network route, we can still
730 				      check its validity using the saved pointer
731 				      to the last used address: daddr_cache.
732 				      We do not want to save the whole address now
733 				      (because the main consumer of this service
734 				       is TCP, which does not have this problem),
735 				      so this last trick works only on connected
736 				      sockets.
737 				   2. oif should also be the same.
738 				 */
739 
740 			if (((rt->rt6i_dst.plen != 128 ||
741 			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
742 			     && (np->daddr_cache == NULL ||
743 				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
744 			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
745 				dst_release(*dst);
746 				*dst = NULL;
747 			}
748 		}
749 	}
750 
751 	if (*dst == NULL)
752 		*dst = ip6_route_output(sk, fl);
753 
754 	if ((err = (*dst)->error))
755 		goto out_err_release;
756 
757 	if (ipv6_addr_any(&fl->fl6_src)) {
758 		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
759 
760 		if (err)
761 			goto out_err_release;
762 	}
763 
764 	return 0;
765 
766 out_err_release:
767 	dst_release(*dst);
768 	*dst = NULL;
769 	return err;
770 }
771 
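/*
 *	ip6_append_data() queues data on the socket's write queue as pending
 *	skbs sized to the path MTU.  The first call sets up the cork state
 *	(options, route, hop limit, traffic class); the datagram is only
 *	assembled and transmitted by ip6_push_pending_frames().
 */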
772 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
773 	int offset, int len, int odd, struct sk_buff *skb),
774 	void *from, int length, int transhdrlen,
775 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
776 	struct rt6_info *rt, unsigned int flags)
777 {
778 	struct inet_sock *inet = inet_sk(sk);
779 	struct ipv6_pinfo *np = inet6_sk(sk);
780 	struct sk_buff *skb;
781 	unsigned int maxfraglen, fragheaderlen;
782 	int exthdrlen;
783 	int hh_len;
784 	int mtu;
785 	int copy;
786 	int err;
787 	int offset = 0;
788 	int csummode = CHECKSUM_NONE;
789 
790 	if (flags&MSG_PROBE)
791 		return 0;
792 	if (skb_queue_empty(&sk->sk_write_queue)) {
793 		/*
794 		 * setup for corking
795 		 */
796 		if (opt) {
797 			if (np->cork.opt == NULL) {
798 				np->cork.opt = kmalloc(opt->tot_len,
799 						       sk->sk_allocation);
800 				if (unlikely(np->cork.opt == NULL))
801 					return -ENOBUFS;
802 			} else if (np->cork.opt->tot_len < opt->tot_len) {
803 				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
804 				return -EINVAL;
805 			}
806 			memcpy(np->cork.opt, opt, opt->tot_len);
807 			inet->cork.flags |= IPCORK_OPT;
808 			/* need source address above --miyazawa */
809 		}
810 		dst_hold(&rt->u.dst);
811 		np->cork.rt = rt;
812 		inet->cork.fl = *fl;
813 		np->cork.hop_limit = hlimit;
814 		np->cork.tclass = tclass;
815 		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
816 		if (dst_allfrag(rt->u.dst.path))
817 			inet->cork.flags |= IPCORK_ALLFRAG;
818 		inet->cork.length = 0;
819 		sk->sk_sndmsg_page = NULL;
820 		sk->sk_sndmsg_off = 0;
821 		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
822 		length += exthdrlen;
823 		transhdrlen += exthdrlen;
824 	} else {
825 		rt = np->cork.rt;
826 		fl = &inet->cork.fl;
827 		if (inet->cork.flags & IPCORK_OPT)
828 			opt = np->cork.opt;
829 		transhdrlen = 0;
830 		exthdrlen = 0;
831 		mtu = inet->cork.fragsize;
832 	}
833 
834 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
835 
836 	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
837 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
838 
839 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
840 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
841 			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
842 			return -EMSGSIZE;
843 		}
844 	}
845 
846 	/*
847 	 * Let's try using as much space as possible.
848 	 * Use MTU if total length of the message fits into the MTU.
849 	 * Otherwise, we need to reserve the fragment header and
850 	 * fragment alignment (= 8-15 octets, in total).
851 	 *
852 	 * Note that we may need to "move" the data from the tail
853 	 * of the buffer to the new fragment when we split
854 	 * the message.
855 	 *
856 	 * FIXME: It may be fragmented into multiple chunks
857 	 *        at once if non-fragmentable extension headers
858 	 *        are too large.
859 	 * --yoshfuji
860 	 */
861 
862 	inet->cork.length += length;
863 
864 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
865 		goto alloc_new_skb;
866 
867 	while (length > 0) {
868 		/* Check if the remaining data fits into current packet. */
869 		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
870 		if (copy < length)
871 			copy = maxfraglen - skb->len;
872 
873 		if (copy <= 0) {
874 			char *data;
875 			unsigned int datalen;
876 			unsigned int fraglen;
877 			unsigned int fraggap;
878 			unsigned int alloclen;
879 			struct sk_buff *skb_prev;
880 alloc_new_skb:
881 			skb_prev = skb;
882 
883 			/* There's no room in the current skb */
884 			if (skb_prev)
885 				fraggap = skb_prev->len - maxfraglen;
886 			else
887 				fraggap = 0;
888 
889 			/*
890 			 * If remaining data exceeds the mtu,
891 			 * we know we need more fragment(s).
892 			 */
893 			datalen = length + fraggap;
894 			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
895 				datalen = maxfraglen - fragheaderlen;
896 
897 			fraglen = datalen + fragheaderlen;
898 			if ((flags & MSG_MORE) &&
899 			    !(rt->u.dst.dev->features&NETIF_F_SG))
900 				alloclen = mtu;
901 			else
902 				alloclen = datalen + fragheaderlen;
903 
904 			/*
905 			 * The last fragment gets additional space at the tail.
906 			 * Note: we overallocate on fragments with MSG_MORE
907 			 * because we have no idea if we're the last one.
908 			 */
909 			if (datalen == length + fraggap)
910 				alloclen += rt->u.dst.trailer_len;
911 
912 			/*
913 			 * We just reserve space for the fragment header.
914 			 * Note: this may be overallocation if the message
915 			 * (without MSG_MORE) fits into the MTU.
916 			 */
917 			alloclen += sizeof(struct frag_hdr);
918 
919 			if (transhdrlen) {
920 				skb = sock_alloc_send_skb(sk,
921 						alloclen + hh_len,
922 						(flags & MSG_DONTWAIT), &err);
923 			} else {
924 				skb = NULL;
925 				if (atomic_read(&sk->sk_wmem_alloc) <=
926 				    2 * sk->sk_sndbuf)
927 					skb = sock_wmalloc(sk,
928 							   alloclen + hh_len, 1,
929 							   sk->sk_allocation);
930 				if (unlikely(skb == NULL))
931 					err = -ENOBUFS;
932 			}
933 			if (skb == NULL)
934 				goto error;
935 			/*
936 			 *	Fill in the control structures
937 			 */
938 			skb->ip_summed = csummode;
939 			skb->csum = 0;
940 			/* reserve for fragmentation */
941 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
942 
943 			/*
944 			 *	Find where to start putting bytes
945 			 */
946 			data = skb_put(skb, fraglen);
947 			skb->nh.raw = data + exthdrlen;
948 			data += fragheaderlen;
949 			skb->h.raw = data + exthdrlen;
950 
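			/* Move the part of the previous skb that sticks out
			 * past maxfraglen into this new skb, so every
			 * fragment but the last stays a multiple of 8 bytes,
			 * and adjust both checksums accordingly.
			 */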
951 			if (fraggap) {
952 				skb->csum = skb_copy_and_csum_bits(
953 					skb_prev, maxfraglen,
954 					data + transhdrlen, fraggap, 0);
955 				skb_prev->csum = csum_sub(skb_prev->csum,
956 							  skb->csum);
957 				data += fraggap;
958 				skb_trim(skb_prev, maxfraglen);
959 			}
960 			copy = datalen - transhdrlen - fraggap;
961 			if (copy < 0) {
962 				err = -EINVAL;
963 				kfree_skb(skb);
964 				goto error;
965 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
966 				err = -EFAULT;
967 				kfree_skb(skb);
968 				goto error;
969 			}
970 
971 			offset += copy;
972 			length -= datalen - fraggap;
973 			transhdrlen = 0;
974 			exthdrlen = 0;
975 			csummode = CHECKSUM_NONE;
976 
977 			/*
978 			 * Put the packet on the pending queue
979 			 */
980 			__skb_queue_tail(&sk->sk_write_queue, skb);
981 			continue;
982 		}
983 
984 		if (copy > length)
985 			copy = length;
986 
987 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
988 			unsigned int off;
989 
990 			off = skb->len;
991 			if (getfrag(from, skb_put(skb, copy),
992 						offset, copy, off, skb) < 0) {
993 				__skb_trim(skb, off);
994 				err = -EFAULT;
995 				goto error;
996 			}
997 		} else {
998 			int i = skb_shinfo(skb)->nr_frags;
999 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1000 			struct page *page = sk->sk_sndmsg_page;
1001 			int off = sk->sk_sndmsg_off;
1002 			unsigned int left;
1003 
1004 			if (page && (left = PAGE_SIZE - off) > 0) {
1005 				if (copy >= left)
1006 					copy = left;
1007 				if (page != frag->page) {
1008 					if (i == MAX_SKB_FRAGS) {
1009 						err = -EMSGSIZE;
1010 						goto error;
1011 					}
1012 					get_page(page);
1013 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1014 					frag = &skb_shinfo(skb)->frags[i];
1015 				}
1016 			} else if (i < MAX_SKB_FRAGS) {
1017 				if (copy > PAGE_SIZE)
1018 					copy = PAGE_SIZE;
1019 				page = alloc_pages(sk->sk_allocation, 0);
1020 				if (page == NULL) {
1021 					err = -ENOMEM;
1022 					goto error;
1023 				}
1024 				sk->sk_sndmsg_page = page;
1025 				sk->sk_sndmsg_off = 0;
1026 
1027 				skb_fill_page_desc(skb, i, page, 0, 0);
1028 				frag = &skb_shinfo(skb)->frags[i];
1029 				skb->truesize += PAGE_SIZE;
1030 				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1031 			} else {
1032 				err = -EMSGSIZE;
1033 				goto error;
1034 			}
1035 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1036 				err = -EFAULT;
1037 				goto error;
1038 			}
1039 			sk->sk_sndmsg_off += copy;
1040 			frag->size += copy;
1041 			skb->len += copy;
1042 			skb->data_len += copy;
1043 		}
1044 		offset += copy;
1045 		length -= copy;
1046 	}
1047 	return 0;
1048 error:
1049 	inet->cork.length -= length;
1050 	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1051 	return err;
1052 }
1053 
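/*
 *	Join all skbs pending on the write queue into a single packet, push
 *	the extension headers and the IPv6 header, attach the cached route
 *	and send the result through NF_IP6_LOCAL_OUT to dst_output().
 */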
1054 int ip6_push_pending_frames(struct sock *sk)
1055 {
1056 	struct sk_buff *skb, *tmp_skb;
1057 	struct sk_buff **tail_skb;
1058 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1059 	struct inet_sock *inet = inet_sk(sk);
1060 	struct ipv6_pinfo *np = inet6_sk(sk);
1061 	struct ipv6hdr *hdr;
1062 	struct ipv6_txoptions *opt = np->cork.opt;
1063 	struct rt6_info *rt = np->cork.rt;
1064 	struct flowi *fl = &inet->cork.fl;
1065 	unsigned char proto = fl->proto;
1066 	int err = 0;
1067 
1068 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1069 		goto out;
1070 	tail_skb = &(skb_shinfo(skb)->frag_list);
1071 
1072 	/* move skb->data to ip header from ext header */
1073 	if (skb->data < skb->nh.raw)
1074 		__skb_pull(skb, skb->nh.raw - skb->data);
1075 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1076 		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1077 		*tail_skb = tmp_skb;
1078 		tail_skb = &(tmp_skb->next);
1079 		skb->len += tmp_skb->len;
1080 		skb->data_len += tmp_skb->len;
1081 		skb->truesize += tmp_skb->truesize;
1082 		__sock_put(tmp_skb->sk);
1083 		tmp_skb->destructor = NULL;
1084 		tmp_skb->sk = NULL;
1085 	}
1086 
1087 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
1088 	__skb_pull(skb, skb->h.raw - skb->nh.raw);
1089 	if (opt && opt->opt_flen)
1090 		ipv6_push_frag_opts(skb, opt, &proto);
1091 	if (opt && opt->opt_nflen)
1092 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1093 
1094 	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1095 
1096 	*(u32*)hdr = fl->fl6_flowlabel |
1097 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1098 
1099 	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1100 		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1101 	else
1102 		hdr->payload_len = 0;
1103 	hdr->hop_limit = np->cork.hop_limit;
1104 	hdr->nexthdr = proto;
1105 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1106 	ipv6_addr_copy(&hdr->daddr, final_dst);
1107 
1108 	skb->dst = dst_clone(&rt->u.dst);
1109 	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1110 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1111 	if (err) {
1112 		if (err > 0)
1113 			err = np->recverr ? net_xmit_errno(err) : 0;
1114 		if (err)
1115 			goto error;
1116 	}
1117 
1118 out:
1119 	inet->cork.flags &= ~IPCORK_OPT;
1120 	if (np->cork.opt) {
1121 		kfree(np->cork.opt);
1122 		np->cork.opt = NULL;
1123 	}
1124 	if (np->cork.rt) {
1125 		dst_release(&np->cork.rt->u.dst);
1126 		np->cork.rt = NULL;
1127 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1128 	}
1129 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1130 	return err;
1131 error:
1132 	goto out;
1133 }
1134 
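/*
 *	Throw away everything queued by ip6_append_data() and reset the
 *	cork state.
 */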
1135 void ip6_flush_pending_frames(struct sock *sk)
1136 {
1137 	struct inet_sock *inet = inet_sk(sk);
1138 	struct ipv6_pinfo *np = inet6_sk(sk);
1139 	struct sk_buff *skb;
1140 
1141 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1142 		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1143 		kfree_skb(skb);
1144 	}
1145 
1146 	inet->cork.flags &= ~IPCORK_OPT;
1147 
1148 	if (np->cork.opt) {
1149 		kfree(np->cork.opt);
1150 		np->cork.opt = NULL;
1151 	}
1152 	if (np->cork.rt) {
1153 		dst_release(&np->cork.rt->u.dst);
1154 		np->cork.rt = NULL;
1155 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1156 	}
1157 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1158 }
1159