xref: /linux/net/ipv6/ip6_output.c (revision 9ce7677cfd7cd871adb457c80bea3b581b839641)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *	Based on linux/net/ipv4/ip_output.c
11  *
12  *	This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *	Changes:
18  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
19  *				extension headers are implemented.
20  *				route changes now work.
21  *				ip6_forward does not confuse sniffers.
22  *				etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *	Imran Patel	: 	frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *			:       add ip6_append_data and related functions
28  *				for datagram xmit
29  */
30 
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42 
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
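/*
 * Select the identification field for a new fragment header: a simple
 * global counter protected by a spinlock, wrapping around zero so that
 * the value 0 is never handed out (the slow path in ip6_fragment()
 * uses frag_id == 0 to mean "no id chosen yet").
 */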
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 {
63 	static u32 ipv6_fragmentation_id = 1;
64 	static DEFINE_SPINLOCK(ip6_id_lock);
65 
66 	spin_lock_bh(&ip6_id_lock);
67 	fhdr->identification = htonl(ipv6_fragmentation_id);
68 	if (++ipv6_fragmentation_id == 0)
69 		ipv6_fragmentation_id = 1;
70 	spin_unlock_bh(&ip6_id_lock);
71 }
72 
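/*
 * Final step of transmission: if the route's neighbour entry has a
 * cached hardware header, copy it in front of the packet and transmit
 * directly; otherwise let the neighbour output function resolve the
 * link-layer address and build the header. Packets with no usable
 * neighbour are counted as OUTNOROUTES and dropped.
 */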
73 static inline int ip6_output_finish(struct sk_buff *skb)
74 {
75 
76 	struct dst_entry *dst = skb->dst;
77 	struct hh_cache *hh = dst->hh;
78 
79 	if (hh) {
80 		int hh_alen;
81 
82 		read_lock_bh(&hh->hh_lock);
83 		hh_alen = HH_DATA_ALIGN(hh->hh_len);
84 		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85 		read_unlock_bh(&hh->hh_lock);
86 	        skb_push(skb, hh->hh_len);
87 		return hh->hh_output(skb);
88 	} else if (dst->neighbour)
89 		return dst->neighbour->output(skb);
90 
91 	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92 	kfree_skb(skb);
93 	return -EINVAL;
94 
95 }
96 
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100 	newskb->mac.raw = newskb->data;
101 	__skb_pull(newskb, newskb->nh.raw - newskb->data);
102 	newskb->pkt_type = PACKET_LOOPBACK;
103 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
104 	BUG_TRAP(newskb->dst);
105 
106 	netif_rx(newskb);
107 	return 0;
108 }
109 
110 
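/*
 * Second half of ip6_output(): for multicast destinations, loop a copy
 * of the packet back to the local stack when multicast loopback is
 * enabled and this host is a member of the group, then hand the packet
 * to the POST_ROUTING netfilter hook and finally to ip6_output_finish().
 */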
111 static int ip6_output2(struct sk_buff *skb)
112 {
113 	struct dst_entry *dst = skb->dst;
114 	struct net_device *dev = dst->dev;
115 
116 	skb->protocol = htons(ETH_P_IPV6);
117 	skb->dev = dev;
118 
119 	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120 		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121 
122 		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123 		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124 				&skb->nh.ipv6h->saddr)) {
125 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126 
127 			/* Do not check for IFF_ALLMULTI; multicast routing
128 			   is not supported in any case.
129 			 */
130 			if (newskb)
131 				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132 					newskb->dev,
133 					ip6_dev_loopback_xmit);
134 
135 			if (skb->nh.ipv6h->hop_limit == 0) {
136 				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137 				kfree_skb(skb);
138 				return 0;
139 			}
140 		}
141 
142 		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143 	}
144 
145 	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev, ip6_output_finish);
146 }
147 
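/*
 * Top-level output routine (typically installed as dst->output for IPv6
 * routes): fragment the packet when it exceeds the path MTU and UFO will
 * not segment it in hardware, or when the route requires a fragment
 * header on every packet (dst_allfrag()); otherwise send it straight on.
 */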
148 int ip6_output(struct sk_buff *skb)
149 {
150 	if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
151 				dst_allfrag(skb->dst))
152 		return ip6_fragment(skb, ip6_output2);
153 	else
154 		return ip6_output2(skb);
155 }
156 
157 /*
158  *	xmit an sk_buff (used by TCP)
159  */
160 
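/*
 * The caller is expected to have attached skb->dst and filled a flow
 * (struct flowi) describing the addresses and transport protocol.
 * A rough, hypothetical sketch of a call from a transport protocol:
 *
 *	fl.proto = IPPROTO_TCP;
 *	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
 *	ipv6_addr_copy(&fl.fl6_src, &np->saddr);
 *	err = ip6_xmit(sk, skb, &fl, np->opt, 0);
 */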
161 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
162 	     struct ipv6_txoptions *opt, int ipfragok)
163 {
164 	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
165 	struct in6_addr *first_hop = &fl->fl6_dst;
166 	struct dst_entry *dst = skb->dst;
167 	struct ipv6hdr *hdr;
168 	u8  proto = fl->proto;
169 	int seg_len = skb->len;
170 	int hlimit, tclass;
171 	u32 mtu;
172 
173 	if (opt) {
174 		int head_room;
175 
176 		/* First: exthdrs may take lots of space (~8K for now)
177 		   MAX_HEADER is not enough.
178 		 */
179 		head_room = opt->opt_nflen + opt->opt_flen;
180 		seg_len += head_room;
181 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
182 
183 		if (skb_headroom(skb) < head_room) {
184 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
185 			kfree_skb(skb);
186 			skb = skb2;
187 			if (skb == NULL) {
188 				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
189 				return -ENOBUFS;
190 			}
191 			if (sk)
192 				skb_set_owner_w(skb, sk);
193 		}
194 		if (opt->opt_flen)
195 			ipv6_push_frag_opts(skb, opt, &proto);
196 		if (opt->opt_nflen)
197 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
198 	}
199 
200 	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
201 
202 	/*
203 	 *	Fill in the IPv6 header
204 	 */
205 
206 	hlimit = -1;
207 	if (np)
208 		hlimit = np->hop_limit;
209 	if (hlimit < 0)
210 		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
211 	if (hlimit < 0)
212 		hlimit = ipv6_get_hoplimit(dst->dev);
213 
214 	tclass = -1;
215 	if (np)
216 		tclass = np->tclass;
217 	if (tclass < 0)
218 		tclass = 0;
219 
220 	*(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
221 
222 	hdr->payload_len = htons(seg_len);
223 	hdr->nexthdr = proto;
224 	hdr->hop_limit = hlimit;
225 
226 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
227 	ipv6_addr_copy(&hdr->daddr, first_hop);
228 
229 	mtu = dst_mtu(dst);
230 	if ((skb->len <= mtu) || ipfragok) {
231 		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
232 		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
233 				dst_output);
234 	}
235 
236 	if (net_ratelimit())
237 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
238 	skb->dev = dst->dev;
239 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
240 	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
241 	kfree_skb(skb);
242 	return -EMSGSIZE;
243 }
244 
245 /*
246  *	To avoid extra problems ND packets are sent through this
247  *	routine. It's code duplication, but I really want to avoid
248  *	extra checks since ipv6_build_header is used by TCP (which
249  *	is performance critical for us)
250  */
251 
252 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
253 	       struct in6_addr *saddr, struct in6_addr *daddr,
254 	       int proto, int len)
255 {
256 	struct ipv6_pinfo *np = inet6_sk(sk);
257 	struct ipv6hdr *hdr;
258 	int totlen;
259 
260 	skb->protocol = htons(ETH_P_IPV6);
261 	skb->dev = dev;
262 
263 	totlen = len + sizeof(struct ipv6hdr);
264 
265 	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
266 	skb->nh.ipv6h = hdr;
267 
268 	*(u32*)hdr = htonl(0x60000000);
269 
270 	hdr->payload_len = htons(len);
271 	hdr->nexthdr = proto;
272 	hdr->hop_limit = np->hop_limit;
273 
274 	ipv6_addr_copy(&hdr->saddr, saddr);
275 	ipv6_addr_copy(&hdr->daddr, daddr);
276 
277 	return 0;
278 }
279 
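/*
 * Deliver a packet carrying the IPv6 Router Alert option to every raw
 * socket that registered interest in this alert value (via
 * IPV6_ROUTER_ALERT) and is either unbound or bound to the incoming
 * device. Returns 1 if the packet was consumed by at least one socket.
 */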
280 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
281 {
282 	struct ip6_ra_chain *ra;
283 	struct sock *last = NULL;
284 
285 	read_lock(&ip6_ra_lock);
286 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
287 		struct sock *sk = ra->sk;
288 		if (sk && ra->sel == sel &&
289 		    (!sk->sk_bound_dev_if ||
290 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
291 			if (last) {
292 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
293 				if (skb2)
294 					rawv6_rcv(last, skb2);
295 			}
296 			last = sk;
297 		}
298 	}
299 
300 	if (last) {
301 		rawv6_rcv(last, skb);
302 		read_unlock(&ip6_ra_lock);
303 		return 1;
304 	}
305 	read_unlock(&ip6_ra_lock);
306 	return 0;
307 }
308 
309 static inline int ip6_forward_finish(struct sk_buff *skb)
310 {
311 	return dst_output(skb);
312 }
313 
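/*
 * Forwarding path: verify that forwarding is enabled and permitted by
 * xfrm policy, hand router-alert packets to interested sockets, check
 * and decrement the hop limit, emit redirects or Packet Too Big errors
 * where appropriate, and finally pass the packet to the NF_IP6_FORWARD
 * netfilter hook.
 */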
314 int ip6_forward(struct sk_buff *skb)
315 {
316 	struct dst_entry *dst = skb->dst;
317 	struct ipv6hdr *hdr = skb->nh.ipv6h;
318 	struct inet6_skb_parm *opt = IP6CB(skb);
319 
320 	if (ipv6_devconf.forwarding == 0)
321 		goto error;
322 
323 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
324 		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
325 		goto drop;
326 	}
327 
328 	skb->ip_summed = CHECKSUM_NONE;
329 
330 	/*
331 	 *	We do NOT do any processing on RA (router alert)
332 	 *	packets; we push them to user level AS IS,
333 	 *	without any warranty that the application will be
334 	 *	able to interpret them. The reason is that we
335 	 *	cannot do anything clever here.
336 	 *
337 	 *	We are not the end node, so if the packet contains
338 	 *	AH/ESP we cannot do anything about it.
339 	 *	Defragmentation would also be a mistake; RA packets
340 	 *	cannot be fragmented, because there is no guarantee
341 	 *	that different fragments will follow the same path. --ANK
342 	 */
343 	if (opt->ra) {
344 		u8 *ptr = skb->nh.raw + opt->ra;
345 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
346 			return 0;
347 	}
348 
349 	/*
350 	 *	check and decrement ttl
351 	 */
352 	if (hdr->hop_limit <= 1) {
353 		/* Force OUTPUT device used as source address */
354 		skb->dev = dst->dev;
355 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
356 			    0, skb->dev);
357 
358 		kfree_skb(skb);
359 		return -ETIMEDOUT;
360 	}
361 
362 	if (!xfrm6_route_forward(skb)) {
363 		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
364 		goto drop;
365 	}
366 	dst = skb->dst;
367 
368 	/* IPv6 specs say nothing about it, but it is clear that we cannot
369 	   send redirects to source routed frames.
370 	 */
371 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
372 		struct in6_addr *target = NULL;
373 		struct rt6_info *rt;
374 		struct neighbour *n = dst->neighbour;
375 
376 		/*
377 		 *	incoming and outgoing devices are the same
378 		 *	send a redirect.
379 		 */
380 
381 		rt = (struct rt6_info *) dst;
382 		if ((rt->rt6i_flags & RTF_GATEWAY))
383 			target = (struct in6_addr*)&n->primary_key;
384 		else
385 			target = &hdr->daddr;
386 
387 		/* Limit redirects both by destination (here)
388 		   and by source (inside ndisc_send_redirect)
389 		 */
390 		if (xrlim_allow(dst, 1*HZ))
391 			ndisc_send_redirect(skb, n, target);
392 	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
393 						|IPV6_ADDR_LINKLOCAL)) {
394 		/* This check is security critical. */
395 		goto error;
396 	}
397 
398 	if (skb->len > dst_mtu(dst)) {
399 		/* Again, force OUTPUT device used as source address */
400 		skb->dev = dst->dev;
401 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
402 		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
403 		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
404 		kfree_skb(skb);
405 		return -EMSGSIZE;
406 	}
407 
408 	if (skb_cow(skb, dst->dev->hard_header_len)) {
409 		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
410 		goto drop;
411 	}
412 
413 	hdr = skb->nh.ipv6h;
414 
415 	/* Decrementing the hop limit is delayed until after the skb COW */
416 
417 	hdr->hop_limit--;
418 
419 	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
420 	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
421 
422 error:
423 	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
424 drop:
425 	kfree_skb(skb);
426 	return -EINVAL;
427 }
428 
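/*
 * Copy per-packet metadata (priority, protocol, device, routing, traffic
 * control and netfilter state) from the original skb to a freshly
 * allocated fragment.
 */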
429 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
430 {
431 	to->pkt_type = from->pkt_type;
432 	to->priority = from->priority;
433 	to->protocol = from->protocol;
434 	dst_release(to->dst);
435 	to->dst = dst_clone(from->dst);
436 	to->dev = from->dev;
437 
438 #ifdef CONFIG_NET_SCHED
439 	to->tc_index = from->tc_index;
440 #endif
441 #ifdef CONFIG_NETFILTER
442 	to->nfmark = from->nfmark;
443 	/* Connection association is the same as for the pre-fragmentation packet */
444 	nf_conntrack_put(to->nfct);
445 	to->nfct = from->nfct;
446 	nf_conntrack_get(to->nfct);
447 	to->nfctinfo = from->nfctinfo;
448 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
449 	nf_conntrack_put_reasm(to->nfct_reasm);
450 	to->nfct_reasm = from->nfct_reasm;
451 	nf_conntrack_get_reasm(to->nfct_reasm);
452 #endif
453 #ifdef CONFIG_BRIDGE_NETFILTER
454 	nf_bridge_put(to->nf_bridge);
455 	to->nf_bridge = from->nf_bridge;
456 	nf_bridge_get(to->nf_bridge);
457 #endif
458 #endif
459 }
460 
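/*
 * Walk the extension header chain to find where a Fragment header must
 * be inserted: just after the per-fragment headers (Hop-by-Hop, Routing,
 * and any Destination Options header that precedes the Routing header).
 * Returns the offset from the start of the IPv6 header and points
 * *nexthdr at the preceding Next Header field so the caller can rewrite
 * it to NEXTHDR_FRAGMENT.
 */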
461 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
462 {
463 	u16 offset = sizeof(struct ipv6hdr);
464 	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
465 	unsigned int packet_len = skb->tail - skb->nh.raw;
466 	int found_rhdr = 0;
467 	*nexthdr = &skb->nh.ipv6h->nexthdr;
468 
469 	while (offset + 1 <= packet_len) {
470 
471 		switch (**nexthdr) {
472 
473 		case NEXTHDR_HOP:
474 		case NEXTHDR_ROUTING:
475 		case NEXTHDR_DEST:
476 			if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
477 			if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
478 			offset += ipv6_optlen(exthdr);
479 			*nexthdr = &exthdr->nexthdr;
480 			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
481 			break;
482 		default:
483 			return offset;
484 		}
485 	}
486 
487 	return offset;
488 }
489 
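/*
 * Fragment an outgoing packet. The fast path reuses the existing
 * frag_list chain when every fragment already has the right geometry
 * (fits the MTU, is a multiple of 8 bytes except for the last one, and
 * has headroom for the headers); otherwise the slow path allocates a new
 * skb for each fragment and copies the data into it.
 */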
490 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
491 {
492 	struct net_device *dev;
493 	struct sk_buff *frag;
494 	struct rt6_info *rt = (struct rt6_info*)skb->dst;
495 	struct ipv6hdr *tmp_hdr;
496 	struct frag_hdr *fh;
497 	unsigned int mtu, hlen, left, len;
498 	u32 frag_id = 0;
499 	int ptr, offset = 0, err=0;
500 	u8 *prevhdr, nexthdr = 0;
501 
502 	dev = rt->u.dst.dev;
503 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
504 	nexthdr = *prevhdr;
505 
506 	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
507 
508 	if (skb_shinfo(skb)->frag_list) {
509 		int first_len = skb_pagelen(skb);
510 
511 		if (first_len - hlen > mtu ||
512 		    ((first_len - hlen) & 7) ||
513 		    skb_cloned(skb))
514 			goto slow_path;
515 
516 		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
517 			/* Correct geometry. */
518 			if (frag->len > mtu ||
519 			    ((frag->len & 7) && frag->next) ||
520 			    skb_headroom(frag) < hlen)
521 			    goto slow_path;
522 
523 			/* Partially cloned skb? */
524 			if (skb_shared(frag))
525 				goto slow_path;
526 
527 			BUG_ON(frag->sk);
528 			if (skb->sk) {
529 				sock_hold(skb->sk);
530 				frag->sk = skb->sk;
531 				frag->destructor = sock_wfree;
532 				skb->truesize -= frag->truesize;
533 			}
534 		}
535 
536 		err = 0;
537 		offset = 0;
538 		frag = skb_shinfo(skb)->frag_list;
539 		skb_shinfo(skb)->frag_list = NULL;
540 		/* BUILD HEADER */
541 
542 		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
543 		if (!tmp_hdr) {
544 			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
545 			return -ENOMEM;
546 		}
547 
548 		*prevhdr = NEXTHDR_FRAGMENT;
549 		memcpy(tmp_hdr, skb->nh.raw, hlen);
550 		__skb_pull(skb, hlen);
551 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
552 		skb->nh.raw = __skb_push(skb, hlen);
553 		memcpy(skb->nh.raw, tmp_hdr, hlen);
554 
555 		ipv6_select_ident(skb, fh);
556 		fh->nexthdr = nexthdr;
557 		fh->reserved = 0;
558 		fh->frag_off = htons(IP6_MF);
559 		frag_id = fh->identification;
560 
561 		first_len = skb_pagelen(skb);
562 		skb->data_len = first_len - skb_headlen(skb);
563 		skb->len = first_len;
564 		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
565 
566 
567 		for (;;) {
568 			/* Prepare the header of the next frame
569 			 * before the previous one goes down. */
570 			if (frag) {
571 				frag->ip_summed = CHECKSUM_NONE;
572 				frag->h.raw = frag->data;
573 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
574 				frag->nh.raw = __skb_push(frag, hlen);
575 				memcpy(frag->nh.raw, tmp_hdr, hlen);
576 				offset += skb->len - hlen - sizeof(struct frag_hdr);
577 				fh->nexthdr = nexthdr;
578 				fh->reserved = 0;
579 				fh->frag_off = htons(offset);
580 				if (frag->next != NULL)
581 					fh->frag_off |= htons(IP6_MF);
582 				fh->identification = frag_id;
583 				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
584 				ip6_copy_metadata(frag, skb);
585 			}
586 
587 			err = output(skb);
588 			if (err || !frag)
589 				break;
590 
591 			skb = frag;
592 			frag = skb->next;
593 			skb->next = NULL;
594 		}
595 
596 		kfree(tmp_hdr);
597 
598 		if (err == 0) {
599 			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
600 			return 0;
601 		}
602 
603 		while (frag) {
604 			skb = frag->next;
605 			kfree_skb(frag);
606 			frag = skb;
607 		}
608 
609 		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
610 		return err;
611 	}
612 
613 slow_path:
614 	left = skb->len - hlen;		/* Space per frame */
615 	ptr = hlen;			/* Where to start from */
616 
617 	/*
618 	 *	Fragment the datagram.
619 	 */
620 
621 	*prevhdr = NEXTHDR_FRAGMENT;
622 
623 	/*
624 	 *	Keep copying data until we run out.
625 	 */
626 	while(left > 0)	{
627 		len = left;
628 		/* IF: it doesn't fit, use 'mtu' - the data space left */
629 		if (len > mtu)
630 			len = mtu;
631 		/* IF: we are not sending up to and including the packet end
632 		   then align the next start on an eight byte boundary */
633 		if (len < left)	{
634 			len &= ~7;
635 		}
636 		/*
637 		 *	Allocate buffer.
638 		 */
639 
640 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
641 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
642 			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
643 			err = -ENOMEM;
644 			goto fail;
645 		}
646 
647 		/*
648 		 *	Set up data on packet
649 		 */
650 
651 		ip6_copy_metadata(frag, skb);
652 		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
653 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
654 		frag->nh.raw = frag->data;
655 		fh = (struct frag_hdr*)(frag->data + hlen);
656 		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
657 
658 		/*
659 		 *	Charge the memory for the fragment to any owner
660 		 *	it might possess
661 		 */
662 		if (skb->sk)
663 			skb_set_owner_w(frag, skb->sk);
664 
665 		/*
666 		 *	Copy the packet header into the new buffer.
667 		 */
668 		memcpy(frag->nh.raw, skb->data, hlen);
669 
670 		/*
671 		 *	Build fragment header.
672 		 */
673 		fh->nexthdr = nexthdr;
674 		fh->reserved = 0;
675 		if (!frag_id) {
676 			ipv6_select_ident(skb, fh);
677 			frag_id = fh->identification;
678 		} else
679 			fh->identification = frag_id;
680 
681 		/*
682 		 *	Copy a block of the IP datagram.
683 		 */
684 		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
685 			BUG();
686 		left -= len;
687 
688 		fh->frag_off = htons(offset);
689 		if (left > 0)
690 			fh->frag_off |= htons(IP6_MF);
691 		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
692 
693 		ptr += len;
694 		offset += len;
695 
696 		/*
697 		 *	Put this fragment into the sending queue.
698 		 */
699 
700 		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
701 
702 		err = output(frag);
703 		if (err)
704 			goto fail;
705 	}
706 	kfree_skb(skb);
707 	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
708 	return err;
709 
710 fail:
711 	kfree_skb(skb);
712 	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
713 	return err;
714 }
715 
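/*
 * Resolve the destination route for a flow: reuse the socket's cached
 * dst when it is still valid for this destination and outgoing
 * interface, otherwise perform a fresh routing lookup, and fill in the
 * flow's source address if the caller left it unspecified.
 */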
716 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
717 {
718 	int err = 0;
719 
720 	*dst = NULL;
721 	if (sk) {
722 		struct ipv6_pinfo *np = inet6_sk(sk);
723 
724 		*dst = sk_dst_check(sk, np->dst_cookie);
725 		if (*dst) {
726 			struct rt6_info *rt = (struct rt6_info*)*dst;
727 
728 				/* Yes, checking route validity in the unconnected
729 				   case is not very simple. Take into account
730 				   that we do not support routing by source, TOS,
731 				   or MSG_DONTROUTE 		--ANK (980726)
732 
733 				   1. If the route was a host route, check that the
734 				      cached destination is current.
735 				      If it is a network route, we may still
736 				      check its validity using a saved pointer
737 				      to the last used address: daddr_cache.
738 				      We do not want to save the whole address now
739 				      (because the main consumer of this service
740 				       is tcp, which does not have this problem),
741 				      so this last trick works only on connected
742 				      sockets.
743 				   2. oif should also be the same.
744 				 */
745 
746 			if (((rt->rt6i_dst.plen != 128 ||
747 			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
748 			     && (np->daddr_cache == NULL ||
749 				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
750 			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
751 				dst_release(*dst);
752 				*dst = NULL;
753 			}
754 		}
755 	}
756 
757 	if (*dst == NULL)
758 		*dst = ip6_route_output(sk, fl);
759 
760 	if ((err = (*dst)->error))
761 		goto out_err_release;
762 
763 	if (ipv6_addr_any(&fl->fl6_src)) {
764 		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
765 
766 		if (err)
767 			goto out_err_release;
768 	}
769 
770 	return 0;
771 
772 out_err_release:
773 	dst_release(*dst);
774 	*dst = NULL;
775 	return err;
776 }
777 
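/*
 * UFO (UDP fragmentation offload) variant of ip6_append_data(): build a
 * single oversized skb, record the per-fragment size and the fragment id
 * in the shared info, and let the device split it into fragments on
 * transmit.
 */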
778 static inline int ip6_ufo_append_data(struct sock *sk,
779 			int getfrag(void *from, char *to, int offset, int len,
780 			int odd, struct sk_buff *skb),
781 			void *from, int length, int hh_len, int fragheaderlen,
782 			int transhdrlen, int mtu,unsigned int flags)
783 
784 {
785 	struct sk_buff *skb;
786 	int err;
787 
788 	/* The network device supports UDP large send offload, so
789 	 * create a single skb containing the complete
790 	 * UDP datagram.
791 	 */
792 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
793 		skb = sock_alloc_send_skb(sk,
794 			hh_len + fragheaderlen + transhdrlen + 20,
795 			(flags & MSG_DONTWAIT), &err);
796 		if (skb == NULL)
797 			return -ENOMEM;
798 
799 		/* reserve space for Hardware header */
800 		skb_reserve(skb, hh_len);
801 
802 		/* create space for UDP/IP header */
803 		skb_put(skb,fragheaderlen + transhdrlen);
804 
805 		/* initialize network header pointer */
806 		skb->nh.raw = skb->data;
807 
808 		/* initialize protocol header pointer */
809 		skb->h.raw = skb->data + fragheaderlen;
810 
811 		skb->ip_summed = CHECKSUM_HW;
812 		skb->csum = 0;
813 		sk->sk_sndmsg_off = 0;
814 	}
815 
816 	err = skb_append_datato_frags(sk,skb, getfrag, from,
817 				      (length - transhdrlen));
818 	if (!err) {
819 		struct frag_hdr fhdr;
820 
821 		/* Specify the length of each IP datagram fragment */
822 		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
823 						sizeof(struct frag_hdr);
824 		ipv6_select_ident(skb, &fhdr);
825 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
826 		__skb_queue_tail(&sk->sk_write_queue, skb);
827 
828 		return 0;
829 	}
830 	/* There is not enough support to do UDP LSO,
831 	 * so follow the normal path.
832 	 */
833 	kfree_skb(skb);
834 
835 	return err;
836 }
837 
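/*
 * Append data to the socket's pending output queue ("corking"). The
 * first call sets up the cork state (options, route, hop limit, traffic
 * class); later calls keep appending until the caller either pushes the
 * queue out with ip6_push_pending_frames() or discards it with
 * ip6_flush_pending_frames(). A hypothetical datagram sender might do
 * roughly:
 *
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, len,
 *			      sizeof(struct udphdr), hlimit, tclass,
 *			      opt, fl, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 */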
838 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
839 	int offset, int len, int odd, struct sk_buff *skb),
840 	void *from, int length, int transhdrlen,
841 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
842 	struct rt6_info *rt, unsigned int flags)
843 {
844 	struct inet_sock *inet = inet_sk(sk);
845 	struct ipv6_pinfo *np = inet6_sk(sk);
846 	struct sk_buff *skb;
847 	unsigned int maxfraglen, fragheaderlen;
848 	int exthdrlen;
849 	int hh_len;
850 	int mtu;
851 	int copy;
852 	int err;
853 	int offset = 0;
854 	int csummode = CHECKSUM_NONE;
855 
856 	if (flags&MSG_PROBE)
857 		return 0;
858 	if (skb_queue_empty(&sk->sk_write_queue)) {
859 		/*
860 		 * setup for corking
861 		 */
862 		if (opt) {
863 			if (np->cork.opt == NULL) {
864 				np->cork.opt = kmalloc(opt->tot_len,
865 						       sk->sk_allocation);
866 				if (unlikely(np->cork.opt == NULL))
867 					return -ENOBUFS;
868 			} else if (np->cork.opt->tot_len < opt->tot_len) {
869 				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
870 				return -EINVAL;
871 			}
872 			memcpy(np->cork.opt, opt, opt->tot_len);
873 			inet->cork.flags |= IPCORK_OPT;
874 			/* need source address above miyazawa */
875 		}
876 		dst_hold(&rt->u.dst);
877 		np->cork.rt = rt;
878 		inet->cork.fl = *fl;
879 		np->cork.hop_limit = hlimit;
880 		np->cork.tclass = tclass;
881 		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
882 		if (dst_allfrag(rt->u.dst.path))
883 			inet->cork.flags |= IPCORK_ALLFRAG;
884 		inet->cork.length = 0;
885 		sk->sk_sndmsg_page = NULL;
886 		sk->sk_sndmsg_off = 0;
887 		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
888 		length += exthdrlen;
889 		transhdrlen += exthdrlen;
890 	} else {
891 		rt = np->cork.rt;
892 		fl = &inet->cork.fl;
893 		if (inet->cork.flags & IPCORK_OPT)
894 			opt = np->cork.opt;
895 		transhdrlen = 0;
896 		exthdrlen = 0;
897 		mtu = inet->cork.fragsize;
898 	}
899 
900 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
901 
902 	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
903 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
904 
905 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
906 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
907 			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
908 			return -EMSGSIZE;
909 		}
910 	}
911 
912 	/*
913 	 * Let's try using as much space as possible.
914 	 * Use MTU if total length of the message fits into the MTU.
915 	 * Otherwise, we need to reserve fragment header and
916 	 * fragment alignment (= 8-15 octets, in total).
917 	 *
918 	 * Note that we may need to "move" the data from the tail
919 	 * of the buffer to the new fragment when we split
920 	 * the message.
921 	 *
922 	 * FIXME: It may be fragmented into multiple chunks
923 	 *        at once if non-fragmentable extension headers
924 	 *        are too large.
925 	 * --yoshfuji
926 	 */
927 
928 	inet->cork.length += length;
929 	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
930 	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
931 
932 		if(ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
933 				fragheaderlen, transhdrlen, mtu, flags))
934 			goto error;
935 
936 		return 0;
937 	}
938 
939 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
940 		goto alloc_new_skb;
941 
942 	while (length > 0) {
943 		/* Check if the remaining data fits into current packet. */
944 		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
945 		if (copy < length)
946 			copy = maxfraglen - skb->len;
947 
948 		if (copy <= 0) {
949 			char *data;
950 			unsigned int datalen;
951 			unsigned int fraglen;
952 			unsigned int fraggap;
953 			unsigned int alloclen;
954 			struct sk_buff *skb_prev;
955 alloc_new_skb:
956 			skb_prev = skb;
957 
958 			/* There's no room in the current skb */
959 			if (skb_prev)
960 				fraggap = skb_prev->len - maxfraglen;
961 			else
962 				fraggap = 0;
963 
964 			/*
965 			 * If remaining data exceeds the mtu,
966 			 * we know we need more fragment(s).
967 			 */
968 			datalen = length + fraggap;
969 			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
970 				datalen = maxfraglen - fragheaderlen;
971 
972 			fraglen = datalen + fragheaderlen;
973 			if ((flags & MSG_MORE) &&
974 			    !(rt->u.dst.dev->features&NETIF_F_SG))
975 				alloclen = mtu;
976 			else
977 				alloclen = datalen + fragheaderlen;
978 
979 			/*
980 			 * The last fragment gets additional space at tail.
981 			 * Note: we overallocate on fragments with MSG_MORE
982 			 * because we have no idea if we're the last one.
983 			 */
984 			if (datalen == length + fraggap)
985 				alloclen += rt->u.dst.trailer_len;
986 
987 			/*
988 			 * We just reserve space for fragment header.
989 			 * Note: this may be overallocation if the message
990 			 * (without MSG_MORE) fits into the MTU.
991 			 */
992 			alloclen += sizeof(struct frag_hdr);
993 
994 			if (transhdrlen) {
995 				skb = sock_alloc_send_skb(sk,
996 						alloclen + hh_len,
997 						(flags & MSG_DONTWAIT), &err);
998 			} else {
999 				skb = NULL;
1000 				if (atomic_read(&sk->sk_wmem_alloc) <=
1001 				    2 * sk->sk_sndbuf)
1002 					skb = sock_wmalloc(sk,
1003 							   alloclen + hh_len, 1,
1004 							   sk->sk_allocation);
1005 				if (unlikely(skb == NULL))
1006 					err = -ENOBUFS;
1007 			}
1008 			if (skb == NULL)
1009 				goto error;
1010 			/*
1011 			 *	Fill in the control structures
1012 			 */
1013 			skb->ip_summed = csummode;
1014 			skb->csum = 0;
1015 			/* reserve for fragmentation */
1016 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1017 
1018 			/*
1019 			 *	Find where to start putting bytes
1020 			 */
1021 			data = skb_put(skb, fraglen);
1022 			skb->nh.raw = data + exthdrlen;
1023 			data += fragheaderlen;
1024 			skb->h.raw = data + exthdrlen;
1025 
1026 			if (fraggap) {
1027 				skb->csum = skb_copy_and_csum_bits(
1028 					skb_prev, maxfraglen,
1029 					data + transhdrlen, fraggap, 0);
1030 				skb_prev->csum = csum_sub(skb_prev->csum,
1031 							  skb->csum);
1032 				data += fraggap;
1033 				skb_trim(skb_prev, maxfraglen);
1034 			}
1035 			copy = datalen - transhdrlen - fraggap;
1036 			if (copy < 0) {
1037 				err = -EINVAL;
1038 				kfree_skb(skb);
1039 				goto error;
1040 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1041 				err = -EFAULT;
1042 				kfree_skb(skb);
1043 				goto error;
1044 			}
1045 
1046 			offset += copy;
1047 			length -= datalen - fraggap;
1048 			transhdrlen = 0;
1049 			exthdrlen = 0;
1050 			csummode = CHECKSUM_NONE;
1051 
1052 			/*
1053 			 * Put the packet on the pending queue
1054 			 */
1055 			__skb_queue_tail(&sk->sk_write_queue, skb);
1056 			continue;
1057 		}
1058 
1059 		if (copy > length)
1060 			copy = length;
1061 
1062 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1063 			unsigned int off;
1064 
1065 			off = skb->len;
1066 			if (getfrag(from, skb_put(skb, copy),
1067 						offset, copy, off, skb) < 0) {
1068 				__skb_trim(skb, off);
1069 				err = -EFAULT;
1070 				goto error;
1071 			}
1072 		} else {
1073 			int i = skb_shinfo(skb)->nr_frags;
1074 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1075 			struct page *page = sk->sk_sndmsg_page;
1076 			int off = sk->sk_sndmsg_off;
1077 			unsigned int left;
1078 
1079 			if (page && (left = PAGE_SIZE - off) > 0) {
1080 				if (copy >= left)
1081 					copy = left;
1082 				if (page != frag->page) {
1083 					if (i == MAX_SKB_FRAGS) {
1084 						err = -EMSGSIZE;
1085 						goto error;
1086 					}
1087 					get_page(page);
1088 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1089 					frag = &skb_shinfo(skb)->frags[i];
1090 				}
1091 			} else if(i < MAX_SKB_FRAGS) {
1092 				if (copy > PAGE_SIZE)
1093 					copy = PAGE_SIZE;
1094 				page = alloc_pages(sk->sk_allocation, 0);
1095 				if (page == NULL) {
1096 					err = -ENOMEM;
1097 					goto error;
1098 				}
1099 				sk->sk_sndmsg_page = page;
1100 				sk->sk_sndmsg_off = 0;
1101 
1102 				skb_fill_page_desc(skb, i, page, 0, 0);
1103 				frag = &skb_shinfo(skb)->frags[i];
1104 				skb->truesize += PAGE_SIZE;
1105 				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1106 			} else {
1107 				err = -EMSGSIZE;
1108 				goto error;
1109 			}
1110 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1111 				err = -EFAULT;
1112 				goto error;
1113 			}
1114 			sk->sk_sndmsg_off += copy;
1115 			frag->size += copy;
1116 			skb->len += copy;
1117 			skb->data_len += copy;
1118 		}
1119 		offset += copy;
1120 		length -= copy;
1121 	}
1122 	return 0;
1123 error:
1124 	inet->cork.length -= length;
1125 	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1126 	return err;
1127 }
1128 
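/*
 * Coalesce all skbs queued by ip6_append_data() into a single packet
 * (the first skb plus a frag_list), prepend the extension headers and
 * the IPv6 header, and send the result through the LOCAL_OUT netfilter
 * hook. The cork state is released whether or not the transmit succeeds.
 */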
1129 int ip6_push_pending_frames(struct sock *sk)
1130 {
1131 	struct sk_buff *skb, *tmp_skb;
1132 	struct sk_buff **tail_skb;
1133 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1134 	struct inet_sock *inet = inet_sk(sk);
1135 	struct ipv6_pinfo *np = inet6_sk(sk);
1136 	struct ipv6hdr *hdr;
1137 	struct ipv6_txoptions *opt = np->cork.opt;
1138 	struct rt6_info *rt = np->cork.rt;
1139 	struct flowi *fl = &inet->cork.fl;
1140 	unsigned char proto = fl->proto;
1141 	int err = 0;
1142 
1143 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1144 		goto out;
1145 	tail_skb = &(skb_shinfo(skb)->frag_list);
1146 
1147 	/* move skb->data to ip header from ext header */
1148 	if (skb->data < skb->nh.raw)
1149 		__skb_pull(skb, skb->nh.raw - skb->data);
1150 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1151 		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1152 		*tail_skb = tmp_skb;
1153 		tail_skb = &(tmp_skb->next);
1154 		skb->len += tmp_skb->len;
1155 		skb->data_len += tmp_skb->len;
1156 		skb->truesize += tmp_skb->truesize;
1157 		__sock_put(tmp_skb->sk);
1158 		tmp_skb->destructor = NULL;
1159 		tmp_skb->sk = NULL;
1160 	}
1161 
1162 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
1163 	__skb_pull(skb, skb->h.raw - skb->nh.raw);
1164 	if (opt && opt->opt_flen)
1165 		ipv6_push_frag_opts(skb, opt, &proto);
1166 	if (opt && opt->opt_nflen)
1167 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1168 
1169 	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1170 
1171 	*(u32*)hdr = fl->fl6_flowlabel |
1172 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1173 
1174 	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1175 		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1176 	else
1177 		hdr->payload_len = 0;
1178 	hdr->hop_limit = np->cork.hop_limit;
1179 	hdr->nexthdr = proto;
1180 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1181 	ipv6_addr_copy(&hdr->daddr, final_dst);
1182 
1183 	skb->dst = dst_clone(&rt->u.dst);
1184 	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1185 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1186 	if (err) {
1187 		if (err > 0)
1188 			err = np->recverr ? net_xmit_errno(err) : 0;
1189 		if (err)
1190 			goto error;
1191 	}
1192 
1193 out:
1194 	inet->cork.flags &= ~IPCORK_OPT;
1195 	kfree(np->cork.opt);
1196 	np->cork.opt = NULL;
1197 	if (np->cork.rt) {
1198 		dst_release(&np->cork.rt->u.dst);
1199 		np->cork.rt = NULL;
1200 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1201 	}
1202 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1203 	return err;
1204 error:
1205 	goto out;
1206 }
1207 
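/*
 * Drop everything queued by ip6_append_data() without sending it and
 * release the cork state.
 */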
1208 void ip6_flush_pending_frames(struct sock *sk)
1209 {
1210 	struct inet_sock *inet = inet_sk(sk);
1211 	struct ipv6_pinfo *np = inet6_sk(sk);
1212 	struct sk_buff *skb;
1213 
1214 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1215 		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1216 		kfree_skb(skb);
1217 	}
1218 
1219 	inet->cork.flags &= ~IPCORK_OPT;
1220 
1221 	kfree(np->cork.opt);
1222 	np->cork.opt = NULL;
1223 	if (np->cork.rt) {
1224 		dst_release(&np->cork.rt->u.dst);
1225 		np->cork.rt = NULL;
1226 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1227 	}
1228 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1229 }
1230