xref: /linux/net/ipv4/ip_output.c (revision 858259cf7d1c443c836a2022b78cb281f0a9b95e)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		The Internet Protocol (IP) output module.
7  *
8  * Version:	$Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Donald Becker, <becker@super.org>
13  *		Alan Cox, <Alan.Cox@linux.org>
14  *		Richard Underwood
15  *		Stefan Becker, <stefanb@yello.ping.de>
16  *		Jorge Cwik, <jorge@laser.satlink.net>
17  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18  *		Hirokazu Takahashi, <taka@valinux.co.jp>
19  *
20  *	See ip_input.c for original log
21  *
22  *	Fixes:
23  *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
24  *		Mike Kilburn	:	htons() missing in ip_build_xmit.
25  *		Bradford Johnson:	Fix faulty handling of some frames when
26  *					no route is found.
27  *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
28  *					(in case the packet is not accepted by
29  *					output firewall rules)
30  *		Mike McLagan	:	Routing by source
31  *		Alexey Kuznetsov:	use new route cache
32  *		Andi Kleen:		Fix broken PMTU recovery and remove
33  *					some redundant tests.
34  *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
35  *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
36  *		Andi Kleen	:	Split fast and slow ip_build_xmit path
37  *					for decreased register pressure on x86
38  *					and more readability.
39  *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
40  *					silently drop skb instead of failing with -EPERM.
41  *		Detlev Wengorz	:	Copy protocol for fragments.
42  *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
43  *					datagrams.
44  *		Hirokazu Takahashi:	sendfile() on UDP works now.
45  */
46 
47 #include <asm/uaccess.h>
48 #include <asm/system.h>
49 #include <linux/module.h>
50 #include <linux/types.h>
51 #include <linux/kernel.h>
52 #include <linux/sched.h>
53 #include <linux/mm.h>
54 #include <linux/string.h>
55 #include <linux/errno.h>
56 #include <linux/config.h>
57 
58 #include <linux/socket.h>
59 #include <linux/sockios.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/etherdevice.h>
64 #include <linux/proc_fs.h>
65 #include <linux/stat.h>
66 #include <linux/init.h>
67 
68 #include <net/snmp.h>
69 #include <net/ip.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <linux/skbuff.h>
73 #include <net/sock.h>
74 #include <net/arp.h>
75 #include <net/icmp.h>
76 #include <net/checksum.h>
77 #include <net/inetpeer.h>
79 #include <linux/igmp.h>
80 #include <linux/netfilter_ipv4.h>
81 #include <linux/netfilter_bridge.h>
82 #include <linux/mroute.h>
83 #include <linux/netlink.h>
84 #include <linux/tcp.h>
85 
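/* Default TTL for locally generated IP packets; tunable through the
 * ip_default_ttl sysctl (/proc/sys/net/ipv4/ip_default_ttl).
 */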
86 int sysctl_ip_default_ttl = IPDEFTTL;
87 
88 /* Generate a checksum for an outgoing IP datagram. */
89 __inline__ void ip_send_check(struct iphdr *iph)
90 {
91 	iph->check = 0;
92 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
93 }
94 
95 /* dev_loopback_xmit for use with netfilter. */
96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
97 {
98 	newskb->mac.raw = newskb->data;
99 	__skb_pull(newskb, newskb->nh.raw - newskb->data);
100 	newskb->pkt_type = PACKET_LOOPBACK;
101 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
102 	BUG_TRAP(newskb->dst);
103 	netif_rx(newskb);
104 	return 0;
105 }
106 
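/* Pick the TTL for an outgoing packet: use the socket's unicast TTL if it
 * has been set, otherwise fall back to the route's hop-limit metric.
 */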
107 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
108 {
109 	int ttl = inet->uc_ttl;
110 
111 	if (ttl < 0)
112 		ttl = dst_metric(dst, RTAX_HOPLIMIT);
113 	return ttl;
114 }
115 
116 /*
117  *		Add an IP header to a skbuff and send it out.
118  *
119  */
120 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
121 			  u32 saddr, u32 daddr, struct ip_options *opt)
122 {
123 	struct inet_sock *inet = inet_sk(sk);
124 	struct rtable *rt = (struct rtable *)skb->dst;
125 	struct iphdr *iph;
126 
127 	/* Build the IP header. */
128 	if (opt)
129 		iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
130 	else
131 		iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
132 
133 	iph->version  = 4;
134 	iph->ihl      = 5;
135 	iph->tos      = inet->tos;
136 	if (ip_dont_fragment(sk, &rt->u.dst))
137 		iph->frag_off = htons(IP_DF);
138 	else
139 		iph->frag_off = 0;
140 	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
141 	iph->daddr    = rt->rt_dst;
142 	iph->saddr    = rt->rt_src;
143 	iph->protocol = sk->sk_protocol;
144 	iph->tot_len  = htons(skb->len);
145 	ip_select_ident(iph, &rt->u.dst, sk);
146 	skb->nh.iph   = iph;
147 
148 	if (opt && opt->optlen) {
149 		iph->ihl += opt->optlen>>2;
150 		ip_options_build(skb, opt, daddr, rt, 0);
151 	}
152 	ip_send_check(iph);
153 
154 	skb->priority = sk->sk_priority;
155 
156 	/* Send it out. */
157 	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
158 		       dst_output);
159 }
160 
161 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
162 
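/* Final link-layer step: make sure the skb has enough headroom for the
 * hard header, then transmit it through the cached hard header (hh) if one
 * exists, or through the neighbour's output function otherwise.
 */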
163 static inline int ip_finish_output2(struct sk_buff *skb)
164 {
165 	struct dst_entry *dst = skb->dst;
166 	struct hh_cache *hh = dst->hh;
167 	struct net_device *dev = dst->dev;
168 	int hh_len = LL_RESERVED_SPACE(dev);
169 
170 	/* Be paranoid, rather than too clever. */
171 	if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
172 		struct sk_buff *skb2;
173 
174 		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
175 		if (skb2 == NULL) {
176 			kfree_skb(skb);
177 			return -ENOMEM;
178 		}
179 		if (skb->sk)
180 			skb_set_owner_w(skb2, skb->sk);
181 		kfree_skb(skb);
182 		skb = skb2;
183 	}
184 
185 	if (hh) {
186 		int hh_alen;
187 
188 		read_lock_bh(&hh->hh_lock);
189 		hh_alen = HH_DATA_ALIGN(hh->hh_len);
190 		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
191 		read_unlock_bh(&hh->hh_lock);
192 		skb_push(skb, hh->hh_len);
193 		return hh->hh_output(skb);
194 	} else if (dst->neighbour)
195 		return dst->neighbour->output(skb);
196 
197 	if (net_ratelimit())
198 		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
199 	kfree_skb(skb);
200 	return -EINVAL;
201 }
202 
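/* Stamp the outgoing device and protocol on the skb and run it through the
 * POST_ROUTING netfilter hook on its way to ip_finish_output2().
 */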
203 static inline int ip_finish_output(struct sk_buff *skb)
204 {
205 	struct net_device *dev = skb->dst->dev;
206 
207 	skb->dev = dev;
208 	skb->protocol = htons(ETH_P_IP);
209 
210 	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
211 		       ip_finish_output2);
212 }
213 
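/*
 *	Output path for multicast and broadcast packets: loop copies back to
 *	local listeners where required, then send the original out on the
 *	wire, fragmenting it first if it exceeds the path MTU.
 */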
214 int ip_mc_output(struct sk_buff *skb)
215 {
216 	struct sock *sk = skb->sk;
217 	struct rtable *rt = (struct rtable*)skb->dst;
218 	struct net_device *dev = rt->u.dst.dev;
219 
220 	/*
221 	 *	If the indicated interface is up and running, send the packet.
222 	 */
223 	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
224 
225 	skb->dev = dev;
226 	skb->protocol = htons(ETH_P_IP);
227 
228 	/*
229 	 *	Multicasts are looped back for other local users
230 	 */
231 
232 	if (rt->rt_flags&RTCF_MULTICAST) {
233 		if ((!sk || inet_sk(sk)->mc_loop)
234 #ifdef CONFIG_IP_MROUTE
235 		/* Small optimization: do not loop back non-local frames
236 		   that were returned after forwarding; they will be dropped
237 		   by ip_mr_input in any case.
238 		   Note that local frames are looped back to be delivered
239 		   to local recipients.
240 
241 		   This check is duplicated in ip_mr_input at the moment.
242 		 */
243 		    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
244 #endif
245 		) {
246 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
247 			if (newskb)
248 				NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
249 					newskb->dev,
250 					ip_dev_loopback_xmit);
251 		}
252 
253 		/* Multicasts with ttl 0 must not go beyond the host */
254 
255 		if (skb->nh.iph->ttl == 0) {
256 			kfree_skb(skb);
257 			return 0;
258 		}
259 	}
260 
261 	if (rt->rt_flags&RTCF_BROADCAST) {
262 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
263 		if (newskb)
264 			NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
265 				newskb->dev, ip_dev_loopback_xmit);
266 	}
267 
268 	if (skb->len > dst_mtu(&rt->u.dst))
269 		return ip_fragment(skb, ip_finish_output);
270 	else
271 		return ip_finish_output(skb);
272 }
273 
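/*
 *	Standard output path for unicast packets: fragment if the packet
 *	exceeds the path MTU and is not being segmented by the hardware
 *	(TSO/UFO), then hand it to ip_finish_output().
 */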
274 int ip_output(struct sk_buff *skb)
275 {
276 	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
277 
278 	if (skb->len > dst_mtu(skb->dst) &&
279 		!(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
280 		return ip_fragment(skb, ip_finish_output);
281 	else
282 		return ip_finish_output(skb);
283 }
284 
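/*
 *	Queue a packet for transmission on a connected socket: route it
 *	(unless the caller, e.g. SCTP, has routed it already), build the IP
 *	header and hand the result to the LOCAL_OUT netfilter hook.  A
 *	non-zero ipfragok keeps DF clear so the packet may be fragmented.
 */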
285 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
286 {
287 	struct sock *sk = skb->sk;
288 	struct inet_sock *inet = inet_sk(sk);
289 	struct ip_options *opt = inet->opt;
290 	struct rtable *rt;
291 	struct iphdr *iph;
292 
293 	/* Skip all of this if the packet is already routed,
294 	 * e.g. by something like SCTP.
295 	 */
296 	rt = (struct rtable *) skb->dst;
297 	if (rt != NULL)
298 		goto packet_routed;
299 
300 	/* Make sure we can route this packet. */
301 	rt = (struct rtable *)__sk_dst_check(sk, 0);
302 	if (rt == NULL) {
303 		u32 daddr;
304 
305 		/* Use correct destination address if we have options. */
306 		daddr = inet->daddr;
307 		if(opt && opt->srr)
308 			daddr = opt->faddr;
309 
310 		{
311 			struct flowi fl = { .oif = sk->sk_bound_dev_if,
312 					    .nl_u = { .ip4_u =
313 						      { .daddr = daddr,
314 							.saddr = inet->saddr,
315 							.tos = RT_CONN_FLAGS(sk) } },
316 					    .proto = sk->sk_protocol,
317 					    .uli_u = { .ports =
318 						       { .sport = inet->sport,
319 							 .dport = inet->dport } } };
320 
321 			/* If this fails, the retransmit mechanism of the transport
322 			 * layer will keep trying until the route appears or the
323 			 * connection times itself out.
324 			 */
325 			if (ip_route_output_flow(&rt, &fl, sk, 0))
326 				goto no_route;
327 		}
328 		sk_setup_caps(sk, &rt->u.dst);
329 	}
330 	skb->dst = dst_clone(&rt->u.dst);
331 
332 packet_routed:
333 	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
334 		goto no_route;
335 
336 	/* OK, we know where to send it, allocate and build IP header. */
337 	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
338 	*((__u16 *)iph)	= htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
339 	iph->tot_len = htons(skb->len);
340 	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
341 		iph->frag_off = htons(IP_DF);
342 	else
343 		iph->frag_off = 0;
344 	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
345 	iph->protocol = sk->sk_protocol;
346 	iph->saddr    = rt->rt_src;
347 	iph->daddr    = rt->rt_dst;
348 	skb->nh.iph   = iph;
349 	/* The transport layer sets skb->h.foo itself. */
350 
351 	if (opt && opt->optlen) {
352 		iph->ihl += opt->optlen >> 2;
353 		ip_options_build(skb, opt, inet->daddr, rt, 0);
354 	}
355 
356 	ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
357 
358 	/* Add an IP checksum. */
359 	ip_send_check(iph);
360 
361 	skb->priority = sk->sk_priority;
362 
363 	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
364 		       dst_output);
365 
366 no_route:
367 	IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
368 	kfree_skb(skb);
369 	return -EHOSTUNREACH;
370 }
371 
372 
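/* Copy per-packet metadata (type, priority, dst, netfilter state, ...)
 * from the original skb onto a freshly built fragment.
 */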
373 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
374 {
375 	to->pkt_type = from->pkt_type;
376 	to->priority = from->priority;
377 	to->protocol = from->protocol;
378 	dst_release(to->dst);
379 	to->dst = dst_clone(from->dst);
380 	to->dev = from->dev;
381 
382 	/* Copy the flags to each fragment. */
383 	IPCB(to)->flags = IPCB(from)->flags;
384 
385 #ifdef CONFIG_NET_SCHED
386 	to->tc_index = from->tc_index;
387 #endif
388 #ifdef CONFIG_NETFILTER
389 	to->nfmark = from->nfmark;
390 	/* Connection association is same as pre-frag packet */
391 	nf_conntrack_put(to->nfct);
392 	to->nfct = from->nfct;
393 	nf_conntrack_get(to->nfct);
394 	to->nfctinfo = from->nfctinfo;
395 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
396 	to->ipvs_property = from->ipvs_property;
397 #endif
398 #ifdef CONFIG_BRIDGE_NETFILTER
399 	nf_bridge_put(to->nf_bridge);
400 	to->nf_bridge = from->nf_bridge;
401 	nf_bridge_get(to->nf_bridge);
402 #endif
403 #endif
404 }
405 
406 /*
407  *	This IP datagram is too large to be sent in one piece.  Break it up into
408  *	smaller pieces (each of a size equal to the IP header plus a block
409  *	of the data of the original IP datagram) that will still fit in a
410  *	single device frame, and queue each such frame for sending.
411  */
412 
413 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
414 {
415 	struct iphdr *iph;
416 	int raw = 0;
417 	int ptr;
418 	struct net_device *dev;
419 	struct sk_buff *skb2;
420 	unsigned int mtu, hlen, left, len, ll_rs;
421 	int offset;
422 	int not_last_frag;
423 	struct rtable *rt = (struct rtable*)skb->dst;
424 	int err = 0;
425 
426 	dev = rt->u.dst.dev;
427 
428 	/*
429 	 *	Point into the IP datagram header.
430 	 */
431 
432 	iph = skb->nh.iph;
433 
434 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
435 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
436 			  htonl(dst_mtu(&rt->u.dst)));
437 		kfree_skb(skb);
438 		return -EMSGSIZE;
439 	}
440 
441 	/*
442 	 *	Setup starting values.
443 	 */
444 
445 	hlen = iph->ihl * 4;
446 	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
447 
448 	/* When a frag_list is given, use it. First, check its validity:
449 	 * some transformers could create a wrong frag_list or break an
450 	 * existing one; that is not prohibited. In this case fall back to copying.
451 	 *
452 	 * LATER: this step can be merged into the real generation of fragments;
453 	 * we can switch to copying when we see the first bad fragment.
454 	 */
455 	if (skb_shinfo(skb)->frag_list) {
456 		struct sk_buff *frag;
457 		int first_len = skb_pagelen(skb);
458 
459 		if (first_len - hlen > mtu ||
460 		    ((first_len - hlen) & 7) ||
461 		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
462 		    skb_cloned(skb))
463 			goto slow_path;
464 
465 		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
466 			/* Correct geometry. */
467 			if (frag->len > mtu ||
468 			    ((frag->len & 7) && frag->next) ||
469 			    skb_headroom(frag) < hlen)
470 			    goto slow_path;
471 
472 			/* Partially cloned skb? */
473 			if (skb_shared(frag))
474 				goto slow_path;
475 
476 			BUG_ON(frag->sk);
477 			if (skb->sk) {
478 				sock_hold(skb->sk);
479 				frag->sk = skb->sk;
480 				frag->destructor = sock_wfree;
481 				skb->truesize -= frag->truesize;
482 			}
483 		}
484 
485 		/* Everything is OK. Generate! */
486 
487 		err = 0;
488 		offset = 0;
489 		frag = skb_shinfo(skb)->frag_list;
490 		skb_shinfo(skb)->frag_list = NULL;
491 		skb->data_len = first_len - skb_headlen(skb);
492 		skb->len = first_len;
493 		iph->tot_len = htons(first_len);
494 		iph->frag_off = htons(IP_MF);
495 		ip_send_check(iph);
496 
497 		for (;;) {
498 			/* Prepare the header of the next frame
499 			 * before the previous one goes down. */
500 			if (frag) {
501 				frag->ip_summed = CHECKSUM_NONE;
502 				frag->h.raw = frag->data;
503 				frag->nh.raw = __skb_push(frag, hlen);
504 				memcpy(frag->nh.raw, iph, hlen);
505 				iph = frag->nh.iph;
506 				iph->tot_len = htons(frag->len);
507 				ip_copy_metadata(frag, skb);
508 				if (offset == 0)
509 					ip_options_fragment(frag);
510 				offset += skb->len - hlen;
511 				iph->frag_off = htons(offset>>3);
512 				if (frag->next != NULL)
513 					iph->frag_off |= htons(IP_MF);
514 				/* Ready, complete checksum */
515 				ip_send_check(iph);
516 			}
517 
518 			err = output(skb);
519 
520 			if (err || !frag)
521 				break;
522 
523 			skb = frag;
524 			frag = skb->next;
525 			skb->next = NULL;
526 		}
527 
528 		if (err == 0) {
529 			IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
530 			return 0;
531 		}
532 
533 		while (frag) {
534 			skb = frag->next;
535 			kfree_skb(frag);
536 			frag = skb;
537 		}
538 		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
539 		return err;
540 	}
541 
542 slow_path:
543 	left = skb->len - hlen;		/* Space per frame */
544 	ptr = raw + hlen;		/* Where to start from */
545 
546 #ifdef CONFIG_BRIDGE_NETFILTER
547 	/* For bridged IP traffic encapsulated inside e.g. a VLAN header,
548 	 * we need to make room for the encapsulating header. */
549 	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
550 	mtu -= nf_bridge_pad(skb);
551 #else
552 	ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
553 #endif
554 	/*
555 	 *	Fragment the datagram.
556 	 */
557 
558 	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
559 	not_last_frag = iph->frag_off & htons(IP_MF);
560 
561 	/*
562 	 *	Keep copying data until we run out.
563 	 */
564 
565 	while(left > 0)	{
566 		len = left;
567 		/* IF: it doesn't fit, use 'mtu' - the data space left */
568 		if (len > mtu)
569 			len = mtu;
570 		/* IF: we are not sending up to and including the packet end,
571 		   then align the next start on an eight-byte boundary */
572 		if (len < left)	{
573 			len &= ~7;
574 		}
575 		/*
576 		 *	Allocate buffer.
577 		 */
578 
579 		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
580 			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
581 			err = -ENOMEM;
582 			goto fail;
583 		}
584 
585 		/*
586 		 *	Set up data on packet
587 		 */
588 
589 		ip_copy_metadata(skb2, skb);
590 		skb_reserve(skb2, ll_rs);
591 		skb_put(skb2, len + hlen);
592 		skb2->nh.raw = skb2->data;
593 		skb2->h.raw = skb2->data + hlen;
594 
595 		/*
596 		 *	Charge the memory for the fragment to any owner
597 		 *	it might possess
598 		 */
599 
600 		if (skb->sk)
601 			skb_set_owner_w(skb2, skb->sk);
602 
603 		/*
604 		 *	Copy the packet header into the new buffer.
605 		 */
606 
607 		memcpy(skb2->nh.raw, skb->data, hlen);
608 
609 		/*
610 		 *	Copy a block of the IP datagram.
611 		 */
612 		if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
613 			BUG();
614 		left -= len;
615 
616 		/*
617 		 *	Fill in the new header fields.
618 		 */
619 		iph = skb2->nh.iph;
620 		iph->frag_off = htons((offset >> 3));
621 
622 		/* ANK: a dirty but effective trick. Update the options only if
623 		 * the segment to be fragmented was THE FIRST (otherwise the
624 		 * options are already fixed) and do it ONCE
625 		 * on the initial skb, so that all the following fragments
626 		 * will inherit the fixed options.
627 		 */
628 		if (offset == 0)
629 			ip_options_fragment(skb);
630 
631 		/*
632 		 *	Added AC : If we are fragmenting a fragment that's not the
633 		 *		   last fragment, then keep the MF bit set on each piece
634 		 */
635 		if (left > 0 || not_last_frag)
636 			iph->frag_off |= htons(IP_MF);
637 		ptr += len;
638 		offset += len;
639 
640 		/*
641 		 *	Put this fragment into the sending queue.
642 		 */
643 
644 		IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
645 
646 		iph->tot_len = htons(len + hlen);
647 
648 		ip_send_check(iph);
649 
650 		err = output(skb2);
651 		if (err)
652 			goto fail;
653 	}
654 	kfree_skb(skb);
655 	IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
656 	return err;
657 
658 fail:
659 	kfree_skb(skb);
660 	IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
661 	return err;
662 }
663 
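/* Generic getfrag callback for ip_append_data(): copy a chunk of the
 * caller's iovec into the skb, accumulating a software checksum unless the
 * hardware will checksum the packet (CHECKSUM_HW).
 */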
664 int
665 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
666 {
667 	struct iovec *iov = from;
668 
669 	if (skb->ip_summed == CHECKSUM_HW) {
670 		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
671 			return -EFAULT;
672 	} else {
673 		unsigned int csum = 0;
674 		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
675 			return -EFAULT;
676 		skb->csum = csum_block_add(skb->csum, csum, odd);
677 	}
678 	return 0;
679 }
680 
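/* Checksum 'copy' bytes of the given page starting at 'offset'; used when
 * appending page data that still needs a software checksum.
 */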
681 static inline unsigned int
682 csum_page(struct page *page, int offset, int copy)
683 {
684 	char *kaddr;
685 	unsigned int csum;
686 	kaddr = kmap(page);
687 	csum = csum_partial(kaddr + offset, copy, 0);
688 	kunmap(page);
689 	return csum;
690 }
691 
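/* Append data for a device capable of UDP fragmentation offload (UFO):
 * build one large skb and leave the actual fragmentation to the hardware.
 */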
692 inline int ip_ufo_append_data(struct sock *sk,
693 			int getfrag(void *from, char *to, int offset, int len,
694 			       int odd, struct sk_buff *skb),
695 			void *from, int length, int hh_len, int fragheaderlen,
696 			int transhdrlen, int mtu,unsigned int flags)
697 {
698 	struct sk_buff *skb;
699 	int err;
700 
701 	/* The network device supports UDP fragmentation offload, so
702 	 * create one single skb packet containing the complete
703 	 * UDP datagram.
704 	 */
705 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
706 		skb = sock_alloc_send_skb(sk,
707 			hh_len + fragheaderlen + transhdrlen + 20,
708 			(flags & MSG_DONTWAIT), &err);
709 
710 		if (skb == NULL)
711 			return err;
712 
713 		/* reserve space for Hardware header */
714 		skb_reserve(skb, hh_len);
715 
716 		/* create space for UDP/IP header */
717 		skb_put(skb,fragheaderlen + transhdrlen);
718 
719 		/* initialize network header pointer */
720 		skb->nh.raw = skb->data;
721 
722 		/* initialize protocol header pointer */
723 		skb->h.raw = skb->data + fragheaderlen;
724 
725 		skb->ip_summed = CHECKSUM_HW;
726 		skb->csum = 0;
727 		sk->sk_sndmsg_off = 0;
728 	}
729 
730 	err = skb_append_datato_frags(sk,skb, getfrag, from,
731 			       (length - transhdrlen));
732 	if (!err) {
733 		/* Specify the length of each IP datagram fragment. */
734 		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
735 		__skb_queue_tail(&sk->sk_write_queue, skb);
736 
737 		return 0;
738 	}
739 	/* There is not enough support to do UFO,
740 	 * so follow the normal path.
741 	 */
742 	kfree_skb(skb);
743 	return err;
744 }
745 
746 /*
747  *	ip_append_data() and ip_append_page() can make one large IP datagram
748  *	from many pieces of data. Each piece will be held on the socket
749  *	until ip_push_pending_frames() is called. Each piece can be a page
750  *	or non-page data.
751  *
752  *	Not only UDP but also other transport protocols (e.g. raw sockets)
753  *	can potentially use this interface.
754  *
755  *	LATER: length must be adjusted by the pad at the tail, when required.
756  */
757 int ip_append_data(struct sock *sk,
758 		   int getfrag(void *from, char *to, int offset, int len,
759 			       int odd, struct sk_buff *skb),
760 		   void *from, int length, int transhdrlen,
761 		   struct ipcm_cookie *ipc, struct rtable *rt,
762 		   unsigned int flags)
763 {
764 	struct inet_sock *inet = inet_sk(sk);
765 	struct sk_buff *skb;
766 
767 	struct ip_options *opt = NULL;
768 	int hh_len;
769 	int exthdrlen;
770 	int mtu;
771 	int copy;
772 	int err;
773 	int offset = 0;
774 	unsigned int maxfraglen, fragheaderlen;
775 	int csummode = CHECKSUM_NONE;
776 
777 	if (flags&MSG_PROBE)
778 		return 0;
779 
780 	if (skb_queue_empty(&sk->sk_write_queue)) {
781 		/*
782 		 * setup for corking.
783 		 */
784 		opt = ipc->opt;
785 		if (opt) {
786 			if (inet->cork.opt == NULL) {
787 				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
788 				if (unlikely(inet->cork.opt == NULL))
789 					return -ENOBUFS;
790 			}
791 			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
792 			inet->cork.flags |= IPCORK_OPT;
793 			inet->cork.addr = ipc->addr;
794 		}
795 		dst_hold(&rt->u.dst);
796 		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
797 		inet->cork.rt = rt;
798 		inet->cork.length = 0;
799 		sk->sk_sndmsg_page = NULL;
800 		sk->sk_sndmsg_off = 0;
801 		if ((exthdrlen = rt->u.dst.header_len) != 0) {
802 			length += exthdrlen;
803 			transhdrlen += exthdrlen;
804 		}
805 	} else {
806 		rt = inet->cork.rt;
807 		if (inet->cork.flags & IPCORK_OPT)
808 			opt = inet->cork.opt;
809 
810 		transhdrlen = 0;
811 		exthdrlen = 0;
812 		mtu = inet->cork.fragsize;
813 	}
814 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
815 
816 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
817 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
818 
819 	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
820 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
821 		return -EMSGSIZE;
822 	}
823 
824 	/*
825 	 * transhdrlen > 0 means that this is the first fragment and we wish
826 	 * it not to be fragmented in the future.
827 	 */
828 	if (transhdrlen &&
829 	    length + fragheaderlen <= mtu &&
830 	    rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
831 	    !exthdrlen)
832 		csummode = CHECKSUM_HW;
833 
834 	inet->cork.length += length;
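	/* Large UDP sends on a UFO-capable device take the offload path. */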
835 	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
836 			(rt->u.dst.dev->features & NETIF_F_UFO)) {
837 
838 		if(ip_ufo_append_data(sk, getfrag, from, length, hh_len,
839 			       fragheaderlen, transhdrlen, mtu, flags))
840 			goto error;
841 
842 		return 0;
843 	}
844 
845 	/* So, what's going on in the loop below?
846 	 *
847 	 * We use the calculated fragment length to generate a chain of skbs;
848 	 * each segment is an IP fragment ready for sending to the network after
849 	 * an appropriate IP header is added.
850 	 */
851 
852 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
853 		goto alloc_new_skb;
854 
855 	while (length > 0) {
856 		/* Check if the remaining data fits into current packet. */
857 		copy = mtu - skb->len;
858 		if (copy < length)
859 			copy = maxfraglen - skb->len;
860 		if (copy <= 0) {
861 			char *data;
862 			unsigned int datalen;
863 			unsigned int fraglen;
864 			unsigned int fraggap;
865 			unsigned int alloclen;
866 			struct sk_buff *skb_prev;
867 alloc_new_skb:
868 			skb_prev = skb;
869 			if (skb_prev)
870 				fraggap = skb_prev->len - maxfraglen;
871 			else
872 				fraggap = 0;
873 
874 			/*
875 			 * If remaining data exceeds the mtu,
876 			 * we know we need more fragment(s).
877 			 */
878 			datalen = length + fraggap;
879 			if (datalen > mtu - fragheaderlen)
880 				datalen = maxfraglen - fragheaderlen;
881 			fraglen = datalen + fragheaderlen;
882 
883 			if ((flags & MSG_MORE) &&
884 			    !(rt->u.dst.dev->features&NETIF_F_SG))
885 				alloclen = mtu;
886 			else
887 				alloclen = datalen + fragheaderlen;
888 
889 			/* The last fragment gets additional space at the tail.
890 			 * Note that with MSG_MORE we overallocate on fragments,
891 			 * because we have no idea which fragment will be
892 			 * the last.
893 			 */
894 			if (datalen == length)
895 				alloclen += rt->u.dst.trailer_len;
896 
897 			if (transhdrlen) {
898 				skb = sock_alloc_send_skb(sk,
899 						alloclen + hh_len + 15,
900 						(flags & MSG_DONTWAIT), &err);
901 			} else {
902 				skb = NULL;
903 				if (atomic_read(&sk->sk_wmem_alloc) <=
904 				    2 * sk->sk_sndbuf)
905 					skb = sock_wmalloc(sk,
906 							   alloclen + hh_len + 15, 1,
907 							   sk->sk_allocation);
908 				if (unlikely(skb == NULL))
909 					err = -ENOBUFS;
910 			}
911 			if (skb == NULL)
912 				goto error;
913 
914 			/*
915 			 *	Fill in the control structures
916 			 */
917 			skb->ip_summed = csummode;
918 			skb->csum = 0;
919 			skb_reserve(skb, hh_len);
920 
921 			/*
922 			 *	Find where to start putting bytes.
923 			 */
924 			data = skb_put(skb, fraglen);
925 			skb->nh.raw = data + exthdrlen;
926 			data += fragheaderlen;
927 			skb->h.raw = data + exthdrlen;
928 
929 			if (fraggap) {
930 				skb->csum = skb_copy_and_csum_bits(
931 					skb_prev, maxfraglen,
932 					data + transhdrlen, fraggap, 0);
933 				skb_prev->csum = csum_sub(skb_prev->csum,
934 							  skb->csum);
935 				data += fraggap;
936 				skb_trim(skb_prev, maxfraglen);
937 			}
938 
939 			copy = datalen - transhdrlen - fraggap;
940 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
941 				err = -EFAULT;
942 				kfree_skb(skb);
943 				goto error;
944 			}
945 
946 			offset += copy;
947 			length -= datalen - fraggap;
948 			transhdrlen = 0;
949 			exthdrlen = 0;
950 			csummode = CHECKSUM_NONE;
951 
952 			/*
953 			 * Put the packet on the pending queue.
954 			 */
955 			__skb_queue_tail(&sk->sk_write_queue, skb);
956 			continue;
957 		}
958 
959 		if (copy > length)
960 			copy = length;
961 
962 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
963 			unsigned int off;
964 
965 			off = skb->len;
966 			if (getfrag(from, skb_put(skb, copy),
967 					offset, copy, off, skb) < 0) {
968 				__skb_trim(skb, off);
969 				err = -EFAULT;
970 				goto error;
971 			}
972 		} else {
973 			int i = skb_shinfo(skb)->nr_frags;
974 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
975 			struct page *page = sk->sk_sndmsg_page;
976 			int off = sk->sk_sndmsg_off;
977 			unsigned int left;
978 
979 			if (page && (left = PAGE_SIZE - off) > 0) {
980 				if (copy >= left)
981 					copy = left;
982 				if (page != frag->page) {
983 					if (i == MAX_SKB_FRAGS) {
984 						err = -EMSGSIZE;
985 						goto error;
986 					}
987 					get_page(page);
988 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
989 					frag = &skb_shinfo(skb)->frags[i];
990 				}
991 			} else if (i < MAX_SKB_FRAGS) {
992 				if (copy > PAGE_SIZE)
993 					copy = PAGE_SIZE;
994 				page = alloc_pages(sk->sk_allocation, 0);
995 				if (page == NULL)  {
996 					err = -ENOMEM;
997 					goto error;
998 				}
999 				sk->sk_sndmsg_page = page;
1000 				sk->sk_sndmsg_off = 0;
1001 
1002 				skb_fill_page_desc(skb, i, page, 0, 0);
1003 				frag = &skb_shinfo(skb)->frags[i];
1004 				skb->truesize += PAGE_SIZE;
1005 				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1006 			} else {
1007 				err = -EMSGSIZE;
1008 				goto error;
1009 			}
1010 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1011 				err = -EFAULT;
1012 				goto error;
1013 			}
1014 			sk->sk_sndmsg_off += copy;
1015 			frag->size += copy;
1016 			skb->len += copy;
1017 			skb->data_len += copy;
1018 		}
1019 		offset += copy;
1020 		length -= copy;
1021 	}
1022 
1023 	return 0;
1024 
1025 error:
1026 	inet->cork.length -= length;
1027 	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1028 	return err;
1029 }
1030 
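/*
 *	Zero-copy companion to ip_append_data(): append (part of) a page to
 *	the pending queue started by a preceding ip_append_data() call.
 *	Requires scatter-gather support on the outgoing device.
 */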
1031 ssize_t	ip_append_page(struct sock *sk, struct page *page,
1032 		       int offset, size_t size, int flags)
1033 {
1034 	struct inet_sock *inet = inet_sk(sk);
1035 	struct sk_buff *skb;
1036 	struct rtable *rt;
1037 	struct ip_options *opt = NULL;
1038 	int hh_len;
1039 	int mtu;
1040 	int len;
1041 	int err;
1042 	unsigned int maxfraglen, fragheaderlen, fraggap;
1043 
1044 	if (inet->hdrincl)
1045 		return -EPERM;
1046 
1047 	if (flags&MSG_PROBE)
1048 		return 0;
1049 
1050 	if (skb_queue_empty(&sk->sk_write_queue))
1051 		return -EINVAL;
1052 
1053 	rt = inet->cork.rt;
1054 	if (inet->cork.flags & IPCORK_OPT)
1055 		opt = inet->cork.opt;
1056 
1057 	if (!(rt->u.dst.dev->features&NETIF_F_SG))
1058 		return -EOPNOTSUPP;
1059 
1060 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1061 	mtu = inet->cork.fragsize;
1062 
1063 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1064 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1065 
1066 	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1067 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1068 		return -EMSGSIZE;
1069 	}
1070 
1071 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1072 		return -EINVAL;
1073 
1074 	inet->cork.length += size;
1075 	if ((sk->sk_protocol == IPPROTO_UDP) &&
1076 	    (rt->u.dst.dev->features & NETIF_F_UFO))
1077 		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
1078 
1079 
1080 	while (size > 0) {
1081 		int i;
1082 
1083 		if (skb_shinfo(skb)->ufo_size)
1084 			len = size;
1085 		else {
1086 
1087 			/* Check if the remaining data fits into current packet. */
1088 			len = mtu - skb->len;
1089 			if (len < size)
1090 				len = maxfraglen - skb->len;
1091 		}
1092 		if (len <= 0) {
1093 			struct sk_buff *skb_prev;
1094 			char *data;
1095 			struct iphdr *iph;
1096 			int alloclen;
1097 
1098 			skb_prev = skb;
1099 			fraggap = skb_prev->len - maxfraglen;
1100 
1101 			alloclen = fragheaderlen + hh_len + fraggap + 15;
1102 			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1103 			if (unlikely(!skb)) {
1104 				err = -ENOBUFS;
1105 				goto error;
1106 			}
1107 
1108 			/*
1109 			 *	Fill in the control structures
1110 			 */
1111 			skb->ip_summed = CHECKSUM_NONE;
1112 			skb->csum = 0;
1113 			skb_reserve(skb, hh_len);
1114 
1115 			/*
1116 			 *	Find where to start putting bytes.
1117 			 */
1118 			data = skb_put(skb, fragheaderlen + fraggap);
1119 			skb->nh.iph = iph = (struct iphdr *)data;
1120 			data += fragheaderlen;
1121 			skb->h.raw = data;
1122 
1123 			if (fraggap) {
1124 				skb->csum = skb_copy_and_csum_bits(
1125 					skb_prev, maxfraglen,
1126 					data, fraggap, 0);
1127 				skb_prev->csum = csum_sub(skb_prev->csum,
1128 							  skb->csum);
1129 				skb_trim(skb_prev, maxfraglen);
1130 			}
1131 
1132 			/*
1133 			 * Put the packet on the pending queue.
1134 			 */
1135 			__skb_queue_tail(&sk->sk_write_queue, skb);
1136 			continue;
1137 		}
1138 
1139 		i = skb_shinfo(skb)->nr_frags;
1140 		if (len > size)
1141 			len = size;
1142 		if (skb_can_coalesce(skb, i, page, offset)) {
1143 			skb_shinfo(skb)->frags[i-1].size += len;
1144 		} else if (i < MAX_SKB_FRAGS) {
1145 			get_page(page);
1146 			skb_fill_page_desc(skb, i, page, offset, len);
1147 		} else {
1148 			err = -EMSGSIZE;
1149 			goto error;
1150 		}
1151 
1152 		if (skb->ip_summed == CHECKSUM_NONE) {
1153 			unsigned int csum;
1154 			csum = csum_page(page, offset, len);
1155 			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1156 		}
1157 
1158 		skb->len += len;
1159 		skb->data_len += len;
1160 		offset += len;
1161 		size -= len;
1162 	}
1163 	return 0;
1164 
1165 error:
1166 	inet->cork.length -= size;
1167 	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1168 	return err;
1169 }
1170 
1171 /*
1172  *	Combine all pending IP fragments on the socket into one IP datagram
1173  *	and push it out.
1174  */
1175 int ip_push_pending_frames(struct sock *sk)
1176 {
1177 	struct sk_buff *skb, *tmp_skb;
1178 	struct sk_buff **tail_skb;
1179 	struct inet_sock *inet = inet_sk(sk);
1180 	struct ip_options *opt = NULL;
1181 	struct rtable *rt = inet->cork.rt;
1182 	struct iphdr *iph;
1183 	int df = 0;
1184 	__u8 ttl;
1185 	int err = 0;
1186 
1187 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1188 		goto out;
1189 	tail_skb = &(skb_shinfo(skb)->frag_list);
1190 
1191 	/* move skb->data to ip header from ext header */
1192 	/* Move skb->data from the ext header to the IP header */
1193 		__skb_pull(skb, skb->nh.raw - skb->data);
1194 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1195 		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1196 		*tail_skb = tmp_skb;
1197 		tail_skb = &(tmp_skb->next);
1198 		skb->len += tmp_skb->len;
1199 		skb->data_len += tmp_skb->len;
1200 		skb->truesize += tmp_skb->truesize;
1201 		__sock_put(tmp_skb->sk);
1202 		tmp_skb->destructor = NULL;
1203 		tmp_skb->sk = NULL;
1204 	}
1205 
1206 	/* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we allow
1207 	 * the frame generated here to be fragmented. No matter how transforms
1208 	 * change the size of the packet, it will come out.
1209 	 */
1210 	if (inet->pmtudisc != IP_PMTUDISC_DO)
1211 		skb->local_df = 1;
1212 
1213 	/* The DF bit is set when we want to see DF on outgoing frames.
1214 	 * If local_df is set too, we still allow this frame to be fragmented
1215 	 * locally. */
1216 	if (inet->pmtudisc == IP_PMTUDISC_DO ||
1217 	    (skb->len <= dst_mtu(&rt->u.dst) &&
1218 	     ip_dont_fragment(sk, &rt->u.dst)))
1219 		df = htons(IP_DF);
1220 
1221 	if (inet->cork.flags & IPCORK_OPT)
1222 		opt = inet->cork.opt;
1223 
1224 	if (rt->rt_type == RTN_MULTICAST)
1225 		ttl = inet->mc_ttl;
1226 	else
1227 		ttl = ip_select_ttl(inet, &rt->u.dst);
1228 
1229 	iph = (struct iphdr *)skb->data;
1230 	iph->version = 4;
1231 	iph->ihl = 5;
1232 	if (opt) {
1233 		iph->ihl += opt->optlen>>2;
1234 		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1235 	}
1236 	iph->tos = inet->tos;
1237 	iph->tot_len = htons(skb->len);
1238 	iph->frag_off = df;
1239 	if (!df) {
1240 		__ip_select_ident(iph, &rt->u.dst, 0);
1241 	} else {
1242 		iph->id = htons(inet->id++);
1243 	}
1244 	iph->ttl = ttl;
1245 	iph->protocol = sk->sk_protocol;
1246 	iph->saddr = rt->rt_src;
1247 	iph->daddr = rt->rt_dst;
1248 	ip_send_check(iph);
1249 
1250 	skb->priority = sk->sk_priority;
1251 	skb->dst = dst_clone(&rt->u.dst);
1252 
1253 	/* Netfilter gets the whole, not yet fragmented skb. */
1254 	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1255 		      skb->dst->dev, dst_output);
1256 	if (err) {
1257 		if (err > 0)
1258 			err = inet->recverr ? net_xmit_errno(err) : 0;
1259 		if (err)
1260 			goto error;
1261 	}
1262 
1263 out:
1264 	inet->cork.flags &= ~IPCORK_OPT;
1265 	if (inet->cork.opt) {
1266 		kfree(inet->cork.opt);
1267 		inet->cork.opt = NULL;
1268 	}
1269 	if (inet->cork.rt) {
1270 		ip_rt_put(inet->cork.rt);
1271 		inet->cork.rt = NULL;
1272 	}
1273 	return err;
1274 
1275 error:
1276 	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1277 	goto out;
1278 }
1279 
1280 /*
1281  *	Throw away all pending data on the socket.
1282  */
1283 void ip_flush_pending_frames(struct sock *sk)
1284 {
1285 	struct inet_sock *inet = inet_sk(sk);
1286 	struct sk_buff *skb;
1287 
1288 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1289 		kfree_skb(skb);
1290 
1291 	inet->cork.flags &= ~IPCORK_OPT;
1292 	if (inet->cork.opt) {
1293 		kfree(inet->cork.opt);
1294 		inet->cork.opt = NULL;
1295 	}
1296 	if (inet->cork.rt) {
1297 		ip_rt_put(inet->cork.rt);
1298 		inet->cork.rt = NULL;
1299 	}
1300 }
1301 
1302 
1303 /*
1304  *	Fetch data from kernel space and fill in checksum if needed.
1305  */
1306 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1307 			      int len, int odd, struct sk_buff *skb)
1308 {
1309 	unsigned int csum;
1310 
1311 	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1312 	skb->csum = csum_block_add(skb->csum, csum, odd);
1313 	return 0;
1314 }
1315 
1316 /*
1317  *	Generic function to send a packet as a reply to another packet.
1318  *	Used to send TCP resets so far. ICMP should use this function too.
1319  *
1320  *	Should run single-threaded per socket because it uses the sock
1321  *	structure to pass arguments.
1322  *
1323  *	LATER: switch from ip_build_xmit to ip_append_*
1324  */
1325 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1326 		   unsigned int len)
1327 {
1328 	struct inet_sock *inet = inet_sk(sk);
1329 	struct {
1330 		struct ip_options	opt;
1331 		char			data[40];
1332 	} replyopts;
1333 	struct ipcm_cookie ipc;
1334 	u32 daddr;
1335 	struct rtable *rt = (struct rtable*)skb->dst;
1336 
1337 	if (ip_options_echo(&replyopts.opt, skb))
1338 		return;
1339 
1340 	daddr = ipc.addr = rt->rt_src;
1341 	ipc.opt = NULL;
1342 
1343 	if (replyopts.opt.optlen) {
1344 		ipc.opt = &replyopts.opt;
1345 
1346 		if (ipc.opt->srr)
1347 			daddr = replyopts.opt.faddr;
1348 	}
1349 
1350 	{
1351 		struct flowi fl = { .nl_u = { .ip4_u =
1352 					      { .daddr = daddr,
1353 						.saddr = rt->rt_spec_dst,
1354 						.tos = RT_TOS(skb->nh.iph->tos) } },
1355 				    /* Not quite clean, but right. */
1356 				    .uli_u = { .ports =
1357 					       { .sport = skb->h.th->dest,
1358 					         .dport = skb->h.th->source } },
1359 				    .proto = sk->sk_protocol };
1360 		if (ip_route_output_key(&rt, &fl))
1361 			return;
1362 	}
1363 
1364 	/* And let IP do all the hard work.
1365 
1366 	   This chunk is not reentrant, hence the spinlock.
1367 	   Note that it relies on the fact that this function is called
1368 	   with BHs locally disabled and that sk cannot already be spinlocked.
1369 	 */
1370 	bh_lock_sock(sk);
1371 	inet->tos = skb->nh.iph->tos;
1372 	sk->sk_priority = skb->priority;
1373 	sk->sk_protocol = skb->nh.iph->protocol;
1374 	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1375 		       &ipc, rt, MSG_DONTWAIT);
1376 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1377 		if (arg->csumoffset >= 0)
1378 			*((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1379 		skb->ip_summed = CHECKSUM_NONE;
1380 		ip_push_pending_frames(sk);
1381 	}
1382 
1383 	bh_unlock_sock(sk);
1384 
1385 	ip_rt_put(rt);
1386 }
1387 
1388 void __init ip_init(void)
1389 {
1390 	ip_rt_init();
1391 	inet_initpeers();
1392 
1393 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1394 	igmp_mc_proc_init();
1395 #endif
1396 }
1397 
1398 EXPORT_SYMBOL(ip_fragment);
1399 EXPORT_SYMBOL(ip_generic_getfrag);
1400 EXPORT_SYMBOL(ip_queue_xmit);
1401 EXPORT_SYMBOL(ip_send_check);
1402