xref: /linux/net/ipv4/ip_output.c (revision 9ce7677cfd7cd871adb457c80bea3b581b839641)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		The Internet Protocol (IP) output module.
7  *
8  * Version:	$Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Donald Becker, <becker@super.org>
13  *		Alan Cox, <Alan.Cox@linux.org>
14  *		Richard Underwood
15  *		Stefan Becker, <stefanb@yello.ping.de>
16  *		Jorge Cwik, <jorge@laser.satlink.net>
17  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18  *		Hirokazu Takahashi, <taka@valinux.co.jp>
19  *
20  *	See ip_input.c for original log
21  *
22  *	Fixes:
23  *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
24  *		Mike Kilburn	:	htons() missing in ip_build_xmit.
25  *		Bradford Johnson:	Fix faulty handling of some frames when
26  *					no route is found.
27  *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 28  *					(in case a packet is not accepted by
 29  *					output firewall rules)
30  *		Mike McLagan	:	Routing by source
31  *		Alexey Kuznetsov:	use new route cache
32  *		Andi Kleen:		Fix broken PMTU recovery and remove
33  *					some redundant tests.
34  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
35  *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
36  *		Andi Kleen	:	Split fast and slow ip_build_xmit path
37  *					for decreased register pressure on x86
 38  *					and more readability.
39  *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
40  *					silently drop skb instead of failing with -EPERM.
41  *		Detlev Wengorz	:	Copy protocol for fragments.
42  *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
43  *					datagrams.
44  *		Hirokazu Takahashi:	sendfile() on UDP works now.
45  */
46 
47 #include <asm/uaccess.h>
48 #include <asm/system.h>
49 #include <linux/module.h>
50 #include <linux/types.h>
51 #include <linux/kernel.h>
52 #include <linux/sched.h>
53 #include <linux/mm.h>
54 #include <linux/string.h>
55 #include <linux/errno.h>
56 #include <linux/config.h>
57 
58 #include <linux/socket.h>
59 #include <linux/sockios.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/etherdevice.h>
64 #include <linux/proc_fs.h>
65 #include <linux/stat.h>
66 #include <linux/init.h>
67 
68 #include <net/snmp.h>
69 #include <net/ip.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <linux/skbuff.h>
73 #include <net/sock.h>
74 #include <net/arp.h>
75 #include <net/icmp.h>
76 #include <net/checksum.h>
77 #include <net/inetpeer.h>
79 #include <linux/igmp.h>
80 #include <linux/netfilter_ipv4.h>
81 #include <linux/netfilter_bridge.h>
82 #include <linux/mroute.h>
83 #include <linux/netlink.h>
84 #include <linux/tcp.h>
85 
86 int sysctl_ip_default_ttl = IPDEFTTL;
87 
88 /* Generate a checksum for an outgoing IP datagram. */
89 __inline__ void ip_send_check(struct iphdr *iph)
90 {
91 	iph->check = 0;
92 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
93 }
94 
95 /* dev_loopback_xmit for use with netfilter. */
96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
97 {
98 	newskb->mac.raw = newskb->data;
99 	__skb_pull(newskb, newskb->nh.raw - newskb->data);
100 	newskb->pkt_type = PACKET_LOOPBACK;
101 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
102 	BUG_TRAP(newskb->dst);
103 	netif_rx(newskb);
104 	return 0;
105 }
106 
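/* Pick the TTL for a unicast packet: use the value configured on the
 * socket with IP_TTL if one was set (uc_ttl >= 0), otherwise fall back
 * to the route's hop-limit metric, i.e. the system default.
 */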
107 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
108 {
109 	int ttl = inet->uc_ttl;
110 
111 	if (ttl < 0)
112 		ttl = dst_metric(dst, RTAX_HOPLIMIT);
113 	return ttl;
114 }
115 
116 /*
117  *		Add an IP header to a skbuff and send it out.
118  *
119  */
120 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
121 			  u32 saddr, u32 daddr, struct ip_options *opt)
122 {
123 	struct inet_sock *inet = inet_sk(sk);
124 	struct rtable *rt = (struct rtable *)skb->dst;
125 	struct iphdr *iph;
126 
127 	/* Build the IP header. */
128 	if (opt)
129 		iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
130 	else
131 		iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
132 
133 	iph->version  = 4;
134 	iph->ihl      = 5;
135 	iph->tos      = inet->tos;
136 	if (ip_dont_fragment(sk, &rt->u.dst))
137 		iph->frag_off = htons(IP_DF);
138 	else
139 		iph->frag_off = 0;
140 	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
141 	iph->daddr    = rt->rt_dst;
142 	iph->saddr    = rt->rt_src;
143 	iph->protocol = sk->sk_protocol;
144 	iph->tot_len  = htons(skb->len);
145 	ip_select_ident(iph, &rt->u.dst, sk);
146 	skb->nh.iph   = iph;
147 
148 	if (opt && opt->optlen) {
149 		iph->ihl += opt->optlen>>2;
150 		ip_options_build(skb, opt, daddr, rt, 0);
151 	}
152 	ip_send_check(iph);
153 
154 	skb->priority = sk->sk_priority;
155 
156 	/* Send it out. */
157 	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
158 		       dst_output);
159 }
160 
161 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
162 
163 static inline int ip_finish_output2(struct sk_buff *skb)
164 {
165 	struct dst_entry *dst = skb->dst;
166 	struct hh_cache *hh = dst->hh;
167 	struct net_device *dev = dst->dev;
168 	int hh_len = LL_RESERVED_SPACE(dev);
169 
170 	/* Be paranoid, rather than too clever. */
171 	if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
172 		struct sk_buff *skb2;
173 
174 		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
175 		if (skb2 == NULL) {
176 			kfree_skb(skb);
177 			return -ENOMEM;
178 		}
179 		if (skb->sk)
180 			skb_set_owner_w(skb2, skb->sk);
181 		kfree_skb(skb);
182 		skb = skb2;
183 	}
184 
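	/* If the route carries a cached link-layer header, prepend it and
	 * transmit directly; otherwise hand the packet to the neighbour
	 * entry, whose output routine resolves the address first.
	 */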
185 	if (hh) {
186 		int hh_alen;
187 
188 		read_lock_bh(&hh->hh_lock);
189 		hh_alen = HH_DATA_ALIGN(hh->hh_len);
190 		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
191 		read_unlock_bh(&hh->hh_lock);
192 		skb_push(skb, hh->hh_len);
193 		return hh->hh_output(skb);
194 	} else if (dst->neighbour)
195 		return dst->neighbour->output(skb);
196 
197 	if (net_ratelimit())
198 		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
199 	kfree_skb(skb);
200 	return -EINVAL;
201 }
202 
203 static inline int ip_finish_output(struct sk_buff *skb)
204 {
205 	struct net_device *dev = skb->dst->dev;
206 
207 	skb->dev = dev;
208 	skb->protocol = htons(ETH_P_IP);
209 
210 	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
211 		       ip_finish_output2);
212 }
213 
214 int ip_mc_output(struct sk_buff *skb)
215 {
216 	struct sock *sk = skb->sk;
217 	struct rtable *rt = (struct rtable*)skb->dst;
218 	struct net_device *dev = rt->u.dst.dev;
219 
220 	/*
221 	 *	If the indicated interface is up and running, send the packet.
222 	 */
223 	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
224 
225 	skb->dev = dev;
226 	skb->protocol = htons(ETH_P_IP);
227 
228 	/*
229 	 *	Multicasts are looped back for other local users
230 	 */
231 
232 	if (rt->rt_flags&RTCF_MULTICAST) {
233 		if ((!sk || inet_sk(sk)->mc_loop)
234 #ifdef CONFIG_IP_MROUTE
235 		/* Small optimization: do not loop back non-local frames
236 		   that were returned after forwarding; they will be dropped
237 		   by ip_mr_input in any case.
238 		   Note that local frames are looped back to be delivered
239 		   to local recipients.
240 
241 		   This check is duplicated in ip_mr_input at the moment.
242 		 */
243 		    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
244 #endif
245 		) {
246 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
247 			if (newskb)
248 				NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
249 					newskb->dev,
250 					ip_dev_loopback_xmit);
251 		}
252 
253 		/* Multicasts with ttl 0 must not go beyond the host */
254 
255 		if (skb->nh.iph->ttl == 0) {
256 			kfree_skb(skb);
257 			return 0;
258 		}
259 	}
260 
261 	if (rt->rt_flags&RTCF_BROADCAST) {
262 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
263 		if (newskb)
264 			NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
265 				newskb->dev, ip_dev_loopback_xmit);
266 	}
267 
268 	if (skb->len > dst_mtu(&rt->u.dst))
269 		return ip_fragment(skb, ip_finish_output);
270 	else
271 		return ip_finish_output(skb);
272 }
273 
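/* Common output path for locally generated unicast packets: fragment
 * when the packet exceeds the path MTU, unless it is a UFO/TSO
 * super-packet that the device will segment itself.
 */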
274 int ip_output(struct sk_buff *skb)
275 {
276 	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
277 
278 	if (skb->len > dst_mtu(skb->dst) &&
279 		!(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
280 		return ip_fragment(skb, ip_finish_output);
281 	else
282 		return ip_finish_output(skb);
283 }
284 
285 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
286 {
287 	struct sock *sk = skb->sk;
288 	struct inet_sock *inet = inet_sk(sk);
289 	struct ip_options *opt = inet->opt;
290 	struct rtable *rt;
291 	struct iphdr *iph;
292 
293 	/* Skip all of this if the packet is already routed,
294 	 * e.g. by a transport like SCTP.
295 	 */
296 	rt = (struct rtable *) skb->dst;
297 	if (rt != NULL)
298 		goto packet_routed;
299 
300 	/* Make sure we can route this packet. */
301 	rt = (struct rtable *)__sk_dst_check(sk, 0);
302 	if (rt == NULL) {
303 		u32 daddr;
304 
305 		/* Use correct destination address if we have options. */
306 		daddr = inet->daddr;
307 		if(opt && opt->srr)
308 			daddr = opt->faddr;
309 
310 		{
311 			struct flowi fl = { .oif = sk->sk_bound_dev_if,
312 					    .nl_u = { .ip4_u =
313 						      { .daddr = daddr,
314 							.saddr = inet->saddr,
315 							.tos = RT_CONN_FLAGS(sk) } },
316 					    .proto = sk->sk_protocol,
317 					    .uli_u = { .ports =
318 						       { .sport = inet->sport,
319 							 .dport = inet->dport } } };
320 
321 			/* If this fails, the retransmit mechanism of the transport
322 			 * layer will keep trying until the route appears or the
323 			 * connection times itself out.
324 			 */
325 			if (ip_route_output_flow(&rt, &fl, sk, 0))
326 				goto no_route;
327 		}
328 		sk_setup_caps(sk, &rt->u.dst);
329 	}
330 	skb->dst = dst_clone(&rt->u.dst);
331 
332 packet_routed:
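	/* With a strict source route the next hop must already be the final
	 * destination; if the route goes via a different gateway we cannot
	 * send the packet.
	 */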
333 	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
334 		goto no_route;
335 
336 	/* OK, we know where to send it, allocate and build IP header. */
337 	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
338 	*((__u16 *)iph)	= htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
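	/* The 16-bit store just above fills in version (4), header length
	 * (5 words) and TOS in one go.
	 */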
339 	iph->tot_len = htons(skb->len);
340 	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
341 		iph->frag_off = htons(IP_DF);
342 	else
343 		iph->frag_off = 0;
344 	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
345 	iph->protocol = sk->sk_protocol;
346 	iph->saddr    = rt->rt_src;
347 	iph->daddr    = rt->rt_dst;
348 	skb->nh.iph   = iph;
349 	/* The transport layer sets skb->h.foo itself. */
350 
351 	if (opt && opt->optlen) {
352 		iph->ihl += opt->optlen >> 2;
353 		ip_options_build(skb, opt, inet->daddr, rt, 0);
354 	}
355 
356 	ip_select_ident_more(iph, &rt->u.dst, sk,
357 			     (skb_shinfo(skb)->tso_segs ?: 1) - 1);
358 
359 	/* Add an IP checksum. */
360 	ip_send_check(iph);
361 
362 	skb->priority = sk->sk_priority;
363 
364 	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
365 		       dst_output);
366 
367 no_route:
368 	IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
369 	kfree_skb(skb);
370 	return -EHOSTUNREACH;
371 }
372 
373 
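/* Propagate per-packet metadata (priority, routing, traffic control and
 * netfilter/conntrack state) from the original skb to a freshly built
 * fragment.
 */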
374 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
375 {
376 	to->pkt_type = from->pkt_type;
377 	to->priority = from->priority;
378 	to->protocol = from->protocol;
379 	dst_release(to->dst);
380 	to->dst = dst_clone(from->dst);
381 	to->dev = from->dev;
382 
383 	/* Copy the flags to each fragment. */
384 	IPCB(to)->flags = IPCB(from)->flags;
385 
386 #ifdef CONFIG_NET_SCHED
387 	to->tc_index = from->tc_index;
388 #endif
389 #ifdef CONFIG_NETFILTER
390 	to->nfmark = from->nfmark;
391 	/* Connection association is same as pre-frag packet */
392 	nf_conntrack_put(to->nfct);
393 	to->nfct = from->nfct;
394 	nf_conntrack_get(to->nfct);
395 	to->nfctinfo = from->nfctinfo;
396 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
397 	to->ipvs_property = from->ipvs_property;
398 #endif
399 #ifdef CONFIG_BRIDGE_NETFILTER
400 	nf_bridge_put(to->nf_bridge);
401 	to->nf_bridge = from->nf_bridge;
402 	nf_bridge_get(to->nf_bridge);
403 #endif
404 #endif
405 }
406 
407 /*
408  *	This IP datagram is too large to be sent in one piece.  Break it up into
409  *	smaller pieces (each consisting of an IP header plus a block of the
410  *	data of the original IP datagram) that will still fit in a
411  *	single device frame, and queue such frames for sending.
412  */
413 
414 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
415 {
416 	struct iphdr *iph;
417 	int raw = 0;
418 	int ptr;
419 	struct net_device *dev;
420 	struct sk_buff *skb2;
421 	unsigned int mtu, hlen, left, len, ll_rs;
422 	int offset;
423 	int not_last_frag;
424 	struct rtable *rt = (struct rtable*)skb->dst;
425 	int err = 0;
426 
427 	dev = rt->u.dst.dev;
428 
429 	/*
430 	 *	Point into the IP datagram header.
431 	 */
432 
433 	iph = skb->nh.iph;
434 
435 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
436 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
437 			  htonl(dst_mtu(&rt->u.dst)));
438 		kfree_skb(skb);
439 		return -EMSGSIZE;
440 	}
441 
442 	/*
443 	 *	Setup starting values.
444 	 */
445 
446 	hlen = iph->ihl * 4;
447 	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
448 
449 	/* When a frag_list is given, use it. First, check its validity:
450 	 * some transformers could create a wrong frag_list or break an
451 	 * existing one; that is not prohibited. In this case fall back to copying.
452 	 *
453 	 * LATER: this step can be merged into the real generation of fragments;
454 	 * we can switch to copying when we see the first bad fragment.
455 	 */
456 	if (skb_shinfo(skb)->frag_list) {
457 		struct sk_buff *frag;
458 		int first_len = skb_pagelen(skb);
459 
460 		if (first_len - hlen > mtu ||
461 		    ((first_len - hlen) & 7) ||
462 		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
463 		    skb_cloned(skb))
464 			goto slow_path;
465 
466 		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
467 			/* Correct geometry. */
468 			if (frag->len > mtu ||
469 			    ((frag->len & 7) && frag->next) ||
470 			    skb_headroom(frag) < hlen)
471 			    goto slow_path;
472 
473 			/* Partially cloned skb? */
474 			if (skb_shared(frag))
475 				goto slow_path;
476 
477 			BUG_ON(frag->sk);
478 			if (skb->sk) {
479 				sock_hold(skb->sk);
480 				frag->sk = skb->sk;
481 				frag->destructor = sock_wfree;
482 				skb->truesize -= frag->truesize;
483 			}
484 		}
485 
486 		/* Everything is OK. Generate! */
487 
488 		err = 0;
489 		offset = 0;
490 		frag = skb_shinfo(skb)->frag_list;
491 		skb_shinfo(skb)->frag_list = NULL;
492 		skb->data_len = first_len - skb_headlen(skb);
493 		skb->len = first_len;
494 		iph->tot_len = htons(first_len);
495 		iph->frag_off = htons(IP_MF);
496 		ip_send_check(iph);
497 
498 		for (;;) {
499 			/* Prepare the header of the next frame
500 			 * before the previous one goes out. */
501 			if (frag) {
502 				frag->ip_summed = CHECKSUM_NONE;
503 				frag->h.raw = frag->data;
504 				frag->nh.raw = __skb_push(frag, hlen);
505 				memcpy(frag->nh.raw, iph, hlen);
506 				iph = frag->nh.iph;
507 				iph->tot_len = htons(frag->len);
508 				ip_copy_metadata(frag, skb);
509 				if (offset == 0)
510 					ip_options_fragment(frag);
511 				offset += skb->len - hlen;
512 				iph->frag_off = htons(offset>>3);
513 				if (frag->next != NULL)
514 					iph->frag_off |= htons(IP_MF);
515 				/* Ready, complete checksum */
516 				ip_send_check(iph);
517 			}
518 
519 			err = output(skb);
520 
521 			if (err || !frag)
522 				break;
523 
524 			skb = frag;
525 			frag = skb->next;
526 			skb->next = NULL;
527 		}
528 
529 		if (err == 0) {
530 			IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
531 			return 0;
532 		}
533 
534 		while (frag) {
535 			skb = frag->next;
536 			kfree_skb(frag);
537 			frag = skb;
538 		}
539 		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
540 		return err;
541 	}
542 
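	/* Slow path: allocate a brand new skb for every fragment and copy
	 * the corresponding block of data into it.
	 */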
543 slow_path:
544 	left = skb->len - hlen;		/* Space per frame */
545 	ptr = raw + hlen;		/* Where to start from */
546 
547 #ifdef CONFIG_BRIDGE_NETFILTER
548 	/* for bridged IP traffic encapsulated inside e.g. a VLAN header,
549 	 * we need to make room for the encapsulating header */
550 	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
551 	mtu -= nf_bridge_pad(skb);
552 #else
553 	ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
554 #endif
555 	/*
556 	 *	Fragment the datagram.
557 	 */
558 
559 	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
560 	not_last_frag = iph->frag_off & htons(IP_MF);
561 
562 	/*
563 	 *	Keep copying data until we run out.
564 	 */
565 
566 	while(left > 0)	{
567 		len = left;
568 		/* IF: it doesn't fit, use 'mtu' - the data space left */
569 		if (len > mtu)
570 			len = mtu;
571 		/* IF: we are not sending up to and including the packet end
572 		   then align the next start on an eight-byte boundary */
573 		if (len < left)	{
574 			len &= ~7;
575 		}
576 		/*
577 		 *	Allocate buffer.
578 		 */
579 
580 		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
581 			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
582 			err = -ENOMEM;
583 			goto fail;
584 		}
585 
586 		/*
587 		 *	Set up data on packet
588 		 */
589 
590 		ip_copy_metadata(skb2, skb);
591 		skb_reserve(skb2, ll_rs);
592 		skb_put(skb2, len + hlen);
593 		skb2->nh.raw = skb2->data;
594 		skb2->h.raw = skb2->data + hlen;
595 
596 		/*
597 		 *	Charge the memory for the fragment to any owner
598 		 *	it might possess
599 		 */
600 
601 		if (skb->sk)
602 			skb_set_owner_w(skb2, skb->sk);
603 
604 		/*
605 		 *	Copy the packet header into the new buffer.
606 		 */
607 
608 		memcpy(skb2->nh.raw, skb->data, hlen);
609 
610 		/*
611 		 *	Copy a block of the IP datagram.
612 		 */
613 		if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
614 			BUG();
615 		left -= len;
616 
617 		/*
618 		 *	Fill in the new header fields.
619 		 */
620 		iph = skb2->nh.iph;
621 		iph->frag_off = htons((offset >> 3));
622 
623 		/* ANK: dirty, but effective trick. Upgrade options only if
624 		 * the segment to be fragmented was THE FIRST (otherwise,
625 		 * options are already fixed) and make it ONCE
626 		 * on the initial skb, so that all the following fragments
627 		 * will inherit fixed options.
628 		 */
629 		if (offset == 0)
630 			ip_options_fragment(skb);
631 
632 		/*
633 		 *	Added AC : If we are fragmenting a fragment that's not the
634 		 *		   last fragment then keep MF set on each fragment
635 		 */
636 		if (left > 0 || not_last_frag)
637 			iph->frag_off |= htons(IP_MF);
638 		ptr += len;
639 		offset += len;
640 
641 		/*
642 		 *	Put this fragment into the sending queue.
643 		 */
644 
645 		IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
646 
647 		iph->tot_len = htons(len + hlen);
648 
649 		ip_send_check(iph);
650 
651 		err = output(skb2);
652 		if (err)
653 			goto fail;
654 	}
655 	kfree_skb(skb);
656 	IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
657 	return err;
658 
659 fail:
660 	kfree_skb(skb);
661 	IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
662 	return err;
663 }
664 
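/* Copy payload from a user-space iovec into an skb. When the device will
 * checksum the packet (CHECKSUM_HW) a plain copy is enough; otherwise
 * checksum while copying and fold the partial sum into skb->csum.
 */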
665 int
666 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
667 {
668 	struct iovec *iov = from;
669 
670 	if (skb->ip_summed == CHECKSUM_HW) {
671 		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
672 			return -EFAULT;
673 	} else {
674 		unsigned int csum = 0;
675 		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
676 			return -EFAULT;
677 		skb->csum = csum_block_add(skb->csum, csum, odd);
678 	}
679 	return 0;
680 }
681 
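/* Checksum 'copy' bytes of a (possibly highmem) page starting at 'offset'. */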
682 static inline unsigned int
683 csum_page(struct page *page, int offset, int copy)
684 {
685 	char *kaddr;
686 	unsigned int csum;
687 	kaddr = kmap(page);
688 	csum = csum_partial(kaddr + offset, copy, 0);
689 	kunmap(page);
690 	return csum;
691 }
692 
693 static inline int ip_ufo_append_data(struct sock *sk,
694 			int getfrag(void *from, char *to, int offset, int len,
695 			       int odd, struct sk_buff *skb),
696 			void *from, int length, int hh_len, int fragheaderlen,
697 			int transhdrlen, int mtu,unsigned int flags)
698 {
699 	struct sk_buff *skb;
700 	int err;
701 
702 	/* The network device supports UDP fragmentation offload, so
703 	 * create one single skb packet containing the complete
704 	 * UDP datagram.
705 	 */
706 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
707 		skb = sock_alloc_send_skb(sk,
708 			hh_len + fragheaderlen + transhdrlen + 20,
709 			(flags & MSG_DONTWAIT), &err);
710 
711 		if (skb == NULL)
712 			return err;
713 
714 		/* reserve space for Hardware header */
715 		skb_reserve(skb, hh_len);
716 
717 		/* create space for UDP/IP header */
718 		skb_put(skb,fragheaderlen + transhdrlen);
719 
720 		/* initialize network header pointer */
721 		skb->nh.raw = skb->data;
722 
723 		/* initialize protocol header pointer */
724 		skb->h.raw = skb->data + fragheaderlen;
725 
726 		skb->ip_summed = CHECKSUM_HW;
727 		skb->csum = 0;
728 		sk->sk_sndmsg_off = 0;
729 	}
730 
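	/* Append the payload as page fragments; the device will later cut
	 * the single large skb into MTU-sized UDP fragments (see the
	 * ufo_size setting below).
	 */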
731 	err = skb_append_datato_frags(sk,skb, getfrag, from,
732 			       (length - transhdrlen));
733 	if (!err) {
734 		/* specify the length of each IP datagram fragment */
735 		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
736 		__skb_queue_tail(&sk->sk_write_queue, skb);
737 
738 		return 0;
739 	}
740 	/* There is not enough support to do UFO,
741 	 * so follow the normal path.
742 	 */
743 	kfree_skb(skb);
744 	return err;
745 }
746 
747 /*
748  *	ip_append_data() and ip_append_page() can make one large IP datagram
749  *	from many pieces of data. Each piece will be held on the socket
750  *	until ip_push_pending_frames() is called. Each piece can be a page
751  *	or non-page data.
752  *
753  *	Not only UDP; other transport protocols - e.g. raw sockets - can
754  *	potentially use this interface as well.
755  *
756  *	LATER: length must be adjusted by pad at tail, when it is required.
757  */
758 int ip_append_data(struct sock *sk,
759 		   int getfrag(void *from, char *to, int offset, int len,
760 			       int odd, struct sk_buff *skb),
761 		   void *from, int length, int transhdrlen,
762 		   struct ipcm_cookie *ipc, struct rtable *rt,
763 		   unsigned int flags)
764 {
765 	struct inet_sock *inet = inet_sk(sk);
766 	struct sk_buff *skb;
767 
768 	struct ip_options *opt = NULL;
769 	int hh_len;
770 	int exthdrlen;
771 	int mtu;
772 	int copy;
773 	int err;
774 	int offset = 0;
775 	unsigned int maxfraglen, fragheaderlen;
776 	int csummode = CHECKSUM_NONE;
777 
778 	if (flags&MSG_PROBE)
779 		return 0;
780 
781 	if (skb_queue_empty(&sk->sk_write_queue)) {
782 		/*
783 		 * Set up for corking.
784 		 */
785 		opt = ipc->opt;
786 		if (opt) {
787 			if (inet->cork.opt == NULL) {
788 				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
789 				if (unlikely(inet->cork.opt == NULL))
790 					return -ENOBUFS;
791 			}
792 			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
793 			inet->cork.flags |= IPCORK_OPT;
794 			inet->cork.addr = ipc->addr;
795 		}
796 		dst_hold(&rt->u.dst);
797 		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
798 		inet->cork.rt = rt;
799 		inet->cork.length = 0;
800 		sk->sk_sndmsg_page = NULL;
801 		sk->sk_sndmsg_off = 0;
802 		if ((exthdrlen = rt->u.dst.header_len) != 0) {
803 			length += exthdrlen;
804 			transhdrlen += exthdrlen;
805 		}
806 	} else {
807 		rt = inet->cork.rt;
808 		if (inet->cork.flags & IPCORK_OPT)
809 			opt = inet->cork.opt;
810 
811 		transhdrlen = 0;
812 		exthdrlen = 0;
813 		mtu = inet->cork.fragsize;
814 	}
815 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
816 
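	/* Every fragment but the last must carry a multiple of 8 bytes of
	 * payload, so round the usable space per fragment down to an
	 * 8-byte boundary before adding the header length back.
	 */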
817 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
818 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
819 
820 	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
821 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
822 		return -EMSGSIZE;
823 	}
824 
825 	/*
826 	 * transhdrlen > 0 means that this is the first fragment and we wish
827 	 * it not to be fragmented later.
828 	 */
829 	if (transhdrlen &&
830 	    length + fragheaderlen <= mtu &&
831 	    rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
832 	    !exthdrlen)
833 		csummode = CHECKSUM_HW;
834 
835 	inet->cork.length += length;
836 	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
837 			(rt->u.dst.dev->features & NETIF_F_UFO)) {
838 
839 		if(ip_ufo_append_data(sk, getfrag, from, length, hh_len,
840 			       fragheaderlen, transhdrlen, mtu, flags))
841 			goto error;
842 
843 		return 0;
844 	}
845 
846 	/* So, what's going on in the loop below?
847 	 *
848 	 * We use the calculated fragment length to generate a chain of skbs;
849 	 * each segment is an IP fragment ready for sending to the network
850 	 * after adding the appropriate IP header.
851 	 */
852 
853 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
854 		goto alloc_new_skb;
855 
856 	while (length > 0) {
857 		/* Check if the remaining data fits into current packet. */
858 		copy = mtu - skb->len;
859 		if (copy < length)
860 			copy = maxfraglen - skb->len;
861 		if (copy <= 0) {
862 			char *data;
863 			unsigned int datalen;
864 			unsigned int fraglen;
865 			unsigned int fraggap;
866 			unsigned int alloclen;
867 			struct sk_buff *skb_prev;
868 alloc_new_skb:
869 			skb_prev = skb;
870 			if (skb_prev)
871 				fraggap = skb_prev->len - maxfraglen;
872 			else
873 				fraggap = 0;
874 
875 			/*
876 			 * If remaining data exceeds the mtu,
877 			 * we know we need more fragment(s).
878 			 */
879 			datalen = length + fraggap;
880 			if (datalen > mtu - fragheaderlen)
881 				datalen = maxfraglen - fragheaderlen;
882 			fraglen = datalen + fragheaderlen;
883 
884 			if ((flags & MSG_MORE) &&
885 			    !(rt->u.dst.dev->features&NETIF_F_SG))
886 				alloclen = mtu;
887 			else
888 				alloclen = datalen + fragheaderlen;
889 
890 			/* The last fragment gets additional space at tail.
891 			 * Note that with MSG_MORE we overallocate on fragments,
892 			 * because we have no idea which fragment will be
893 			 * the last.
894 			 */
895 			if (datalen == length)
896 				alloclen += rt->u.dst.trailer_len;
897 
898 			if (transhdrlen) {
899 				skb = sock_alloc_send_skb(sk,
900 						alloclen + hh_len + 15,
901 						(flags & MSG_DONTWAIT), &err);
902 			} else {
903 				skb = NULL;
904 				if (atomic_read(&sk->sk_wmem_alloc) <=
905 				    2 * sk->sk_sndbuf)
906 					skb = sock_wmalloc(sk,
907 							   alloclen + hh_len + 15, 1,
908 							   sk->sk_allocation);
909 				if (unlikely(skb == NULL))
910 					err = -ENOBUFS;
911 			}
912 			if (skb == NULL)
913 				goto error;
914 
915 			/*
916 			 *	Fill in the control structures
917 			 */
918 			skb->ip_summed = csummode;
919 			skb->csum = 0;
920 			skb_reserve(skb, hh_len);
921 
922 			/*
923 			 *	Find where to start putting bytes.
924 			 */
925 			data = skb_put(skb, fraglen);
926 			skb->nh.raw = data + exthdrlen;
927 			data += fragheaderlen;
928 			skb->h.raw = data + exthdrlen;
929 
930 			if (fraggap) {
931 				skb->csum = skb_copy_and_csum_bits(
932 					skb_prev, maxfraglen,
933 					data + transhdrlen, fraggap, 0);
934 				skb_prev->csum = csum_sub(skb_prev->csum,
935 							  skb->csum);
936 				data += fraggap;
937 				skb_trim(skb_prev, maxfraglen);
938 			}
939 
940 			copy = datalen - transhdrlen - fraggap;
941 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
942 				err = -EFAULT;
943 				kfree_skb(skb);
944 				goto error;
945 			}
946 
947 			offset += copy;
948 			length -= datalen - fraggap;
949 			transhdrlen = 0;
950 			exthdrlen = 0;
951 			csummode = CHECKSUM_NONE;
952 
953 			/*
954 			 * Put the packet on the pending queue.
955 			 */
956 			__skb_queue_tail(&sk->sk_write_queue, skb);
957 			continue;
958 		}
959 
960 		if (copy > length)
961 			copy = length;
962 
963 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
964 			unsigned int off;
965 
966 			off = skb->len;
967 			if (getfrag(from, skb_put(skb, copy),
968 					offset, copy, off, skb) < 0) {
969 				__skb_trim(skb, off);
970 				err = -EFAULT;
971 				goto error;
972 			}
973 		} else {
974 			int i = skb_shinfo(skb)->nr_frags;
975 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
976 			struct page *page = sk->sk_sndmsg_page;
977 			int off = sk->sk_sndmsg_off;
978 			unsigned int left;
979 
980 			if (page && (left = PAGE_SIZE - off) > 0) {
981 				if (copy >= left)
982 					copy = left;
983 				if (page != frag->page) {
984 					if (i == MAX_SKB_FRAGS) {
985 						err = -EMSGSIZE;
986 						goto error;
987 					}
988 					get_page(page);
989 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
990 					frag = &skb_shinfo(skb)->frags[i];
991 				}
992 			} else if (i < MAX_SKB_FRAGS) {
993 				if (copy > PAGE_SIZE)
994 					copy = PAGE_SIZE;
995 				page = alloc_pages(sk->sk_allocation, 0);
996 				if (page == NULL)  {
997 					err = -ENOMEM;
998 					goto error;
999 				}
1000 				sk->sk_sndmsg_page = page;
1001 				sk->sk_sndmsg_off = 0;
1002 
1003 				skb_fill_page_desc(skb, i, page, 0, 0);
1004 				frag = &skb_shinfo(skb)->frags[i];
1005 				skb->truesize += PAGE_SIZE;
1006 				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1007 			} else {
1008 				err = -EMSGSIZE;
1009 				goto error;
1010 			}
1011 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1012 				err = -EFAULT;
1013 				goto error;
1014 			}
1015 			sk->sk_sndmsg_off += copy;
1016 			frag->size += copy;
1017 			skb->len += copy;
1018 			skb->data_len += copy;
1019 		}
1020 		offset += copy;
1021 		length -= copy;
1022 	}
1023 
1024 	return 0;
1025 
1026 error:
1027 	inet->cork.length -= length;
1028 	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1029 	return err;
1030 }
1031 
1032 ssize_t	ip_append_page(struct sock *sk, struct page *page,
1033 		       int offset, size_t size, int flags)
1034 {
1035 	struct inet_sock *inet = inet_sk(sk);
1036 	struct sk_buff *skb;
1037 	struct rtable *rt;
1038 	struct ip_options *opt = NULL;
1039 	int hh_len;
1040 	int mtu;
1041 	int len;
1042 	int err;
1043 	unsigned int maxfraglen, fragheaderlen, fraggap;
1044 
1045 	if (inet->hdrincl)
1046 		return -EPERM;
1047 
1048 	if (flags&MSG_PROBE)
1049 		return 0;
1050 
1051 	if (skb_queue_empty(&sk->sk_write_queue))
1052 		return -EINVAL;
1053 
1054 	rt = inet->cork.rt;
1055 	if (inet->cork.flags & IPCORK_OPT)
1056 		opt = inet->cork.opt;
1057 
1058 	if (!(rt->u.dst.dev->features&NETIF_F_SG))
1059 		return -EOPNOTSUPP;
1060 
1061 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1062 	mtu = inet->cork.fragsize;
1063 
1064 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1065 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1066 
1067 	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1068 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1069 		return -EMSGSIZE;
1070 	}
1071 
1072 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1073 		return -EINVAL;
1074 
1075 	inet->cork.length += size;
1076 	if ((sk->sk_protocol == IPPROTO_UDP) &&
1077 	    (rt->u.dst.dev->features & NETIF_F_UFO))
1078 		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
1079 
1080 
1081 	while (size > 0) {
1082 		int i;
1083 
1084 		if (skb_shinfo(skb)->ufo_size)
1085 			len = size;
1086 		else {
1087 
1088 			/* Check if the remaining data fits into current packet. */
1089 			len = mtu - skb->len;
1090 			if (len < size)
1091 				len = maxfraglen - skb->len;
1092 		}
1093 		if (len <= 0) {
1094 			struct sk_buff *skb_prev;
1095 			char *data;
1096 			struct iphdr *iph;
1097 			int alloclen;
1098 
1099 			skb_prev = skb;
1100 			fraggap = skb_prev->len - maxfraglen;
1101 
1102 			alloclen = fragheaderlen + hh_len + fraggap + 15;
1103 			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1104 			if (unlikely(!skb)) {
1105 				err = -ENOBUFS;
1106 				goto error;
1107 			}
1108 
1109 			/*
1110 			 *	Fill in the control structures
1111 			 */
1112 			skb->ip_summed = CHECKSUM_NONE;
1113 			skb->csum = 0;
1114 			skb_reserve(skb, hh_len);
1115 
1116 			/*
1117 			 *	Find where to start putting bytes.
1118 			 */
1119 			data = skb_put(skb, fragheaderlen + fraggap);
1120 			skb->nh.iph = iph = (struct iphdr *)data;
1121 			data += fragheaderlen;
1122 			skb->h.raw = data;
1123 
1124 			if (fraggap) {
1125 				skb->csum = skb_copy_and_csum_bits(
1126 					skb_prev, maxfraglen,
1127 					data, fraggap, 0);
1128 				skb_prev->csum = csum_sub(skb_prev->csum,
1129 							  skb->csum);
1130 				skb_trim(skb_prev, maxfraglen);
1131 			}
1132 
1133 			/*
1134 			 * Put the packet on the pending queue.
1135 			 */
1136 			__skb_queue_tail(&sk->sk_write_queue, skb);
1137 			continue;
1138 		}
1139 
1140 		i = skb_shinfo(skb)->nr_frags;
1141 		if (len > size)
1142 			len = size;
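		/* If the new data is contiguous with the last page fragment,
		 * simply grow that fragment; otherwise attach the page as a
		 * new fragment slot.
		 */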
1143 		if (skb_can_coalesce(skb, i, page, offset)) {
1144 			skb_shinfo(skb)->frags[i-1].size += len;
1145 		} else if (i < MAX_SKB_FRAGS) {
1146 			get_page(page);
1147 			skb_fill_page_desc(skb, i, page, offset, len);
1148 		} else {
1149 			err = -EMSGSIZE;
1150 			goto error;
1151 		}
1152 
1153 		if (skb->ip_summed == CHECKSUM_NONE) {
1154 			unsigned int csum;
1155 			csum = csum_page(page, offset, len);
1156 			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1157 		}
1158 
1159 		skb->len += len;
1160 		skb->data_len += len;
1161 		offset += len;
1162 		size -= len;
1163 	}
1164 	return 0;
1165 
1166 error:
1167 	inet->cork.length -= size;
1168 	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1169 	return err;
1170 }
1171 
1172 /*
1173  *	Combine all pending IP fragments on the socket into one IP datagram
1174  *	and push it out.
1175  */
1176 int ip_push_pending_frames(struct sock *sk)
1177 {
1178 	struct sk_buff *skb, *tmp_skb;
1179 	struct sk_buff **tail_skb;
1180 	struct inet_sock *inet = inet_sk(sk);
1181 	struct ip_options *opt = NULL;
1182 	struct rtable *rt = inet->cork.rt;
1183 	struct iphdr *iph;
1184 	int df = 0;
1185 	__u8 ttl;
1186 	int err = 0;
1187 
1188 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1189 		goto out;
1190 	tail_skb = &(skb_shinfo(skb)->frag_list);
1191 
1192 	/* move skb->data to ip header from ext header */
1193 	if (skb->data < skb->nh.raw)
1194 		__skb_pull(skb, skb->nh.raw - skb->data);
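	/* Chain every remaining queued skb onto the first skb's frag_list,
	 * so that one IP header (built below) describes the whole datagram.
	 */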
1195 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1196 		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1197 		*tail_skb = tmp_skb;
1198 		tail_skb = &(tmp_skb->next);
1199 		skb->len += tmp_skb->len;
1200 		skb->data_len += tmp_skb->len;
1201 		skb->truesize += tmp_skb->truesize;
1202 		__sock_put(tmp_skb->sk);
1203 		tmp_skb->destructor = NULL;
1204 		tmp_skb->sk = NULL;
1205 	}
1206 
1207 	/* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we allow
1208 	 * the frame generated here to be fragmented. No matter how transforms
1209 	 * change the size of the packet, it will come out.
1210 	 */
1211 	if (inet->pmtudisc != IP_PMTUDISC_DO)
1212 		skb->local_df = 1;
1213 
1214 	/* The DF bit is set when we want to see DF on outgoing frames.
1215 	 * If local_df is set too, we still allow this frame to be
1216 	 * fragmented locally. */
1217 	if (inet->pmtudisc == IP_PMTUDISC_DO ||
1218 	    (skb->len <= dst_mtu(&rt->u.dst) &&
1219 	     ip_dont_fragment(sk, &rt->u.dst)))
1220 		df = htons(IP_DF);
1221 
1222 	if (inet->cork.flags & IPCORK_OPT)
1223 		opt = inet->cork.opt;
1224 
1225 	if (rt->rt_type == RTN_MULTICAST)
1226 		ttl = inet->mc_ttl;
1227 	else
1228 		ttl = ip_select_ttl(inet, &rt->u.dst);
1229 
1230 	iph = (struct iphdr *)skb->data;
1231 	iph->version = 4;
1232 	iph->ihl = 5;
1233 	if (opt) {
1234 		iph->ihl += opt->optlen>>2;
1235 		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1236 	}
1237 	iph->tos = inet->tos;
1238 	iph->tot_len = htons(skb->len);
1239 	iph->frag_off = df;
1240 	if (!df) {
1241 		__ip_select_ident(iph, &rt->u.dst, 0);
1242 	} else {
1243 		iph->id = htons(inet->id++);
1244 	}
1245 	iph->ttl = ttl;
1246 	iph->protocol = sk->sk_protocol;
1247 	iph->saddr = rt->rt_src;
1248 	iph->daddr = rt->rt_dst;
1249 	ip_send_check(iph);
1250 
1251 	skb->priority = sk->sk_priority;
1252 	skb->dst = dst_clone(&rt->u.dst);
1253 
1254 	/* Netfilter gets the whole, not yet fragmented skb. */
1255 	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1256 		      skb->dst->dev, dst_output);
1257 	if (err) {
1258 		if (err > 0)
1259 			err = inet->recverr ? net_xmit_errno(err) : 0;
1260 		if (err)
1261 			goto error;
1262 	}
1263 
1264 out:
1265 	inet->cork.flags &= ~IPCORK_OPT;
1266 	kfree(inet->cork.opt);
1267 	inet->cork.opt = NULL;
1268 	if (inet->cork.rt) {
1269 		ip_rt_put(inet->cork.rt);
1270 		inet->cork.rt = NULL;
1271 	}
1272 	return err;
1273 
1274 error:
1275 	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1276 	goto out;
1277 }
1278 
1279 /*
1280  *	Throw away all pending data on the socket.
1281  */
1282 void ip_flush_pending_frames(struct sock *sk)
1283 {
1284 	struct inet_sock *inet = inet_sk(sk);
1285 	struct sk_buff *skb;
1286 
1287 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1288 		kfree_skb(skb);
1289 
1290 	inet->cork.flags &= ~IPCORK_OPT;
1291 	kfree(inet->cork.opt);
1292 	inet->cork.opt = NULL;
1293 	if (inet->cork.rt) {
1294 		ip_rt_put(inet->cork.rt);
1295 		inet->cork.rt = NULL;
1296 	}
1297 }
1298 
1299 
1300 /*
1301  *	Fetch data from kernel space and fill in checksum if needed.
1302  */
1303 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1304 			      int len, int odd, struct sk_buff *skb)
1305 {
1306 	unsigned int csum;
1307 
1308 	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1309 	skb->csum = csum_block_add(skb->csum, csum, odd);
1310 	return 0;
1311 }
1312 
1313 /*
1314  *	Generic function to send a packet as a reply to another packet.
1315  *	Used to send TCP resets so far. ICMP should use this function too.
1316  *
1317  *	Should run single-threaded per socket because it uses the sock
1318  *	structure to pass arguments.
1319  *
1320  *	LATER: switch from ip_build_xmit to ip_append_*
1321  */
1322 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1323 		   unsigned int len)
1324 {
1325 	struct inet_sock *inet = inet_sk(sk);
1326 	struct {
1327 		struct ip_options	opt;
1328 		char			data[40];
1329 	} replyopts;
1330 	struct ipcm_cookie ipc;
1331 	u32 daddr;
1332 	struct rtable *rt = (struct rtable*)skb->dst;
1333 
1334 	if (ip_options_echo(&replyopts.opt, skb))
1335 		return;
1336 
1337 	daddr = ipc.addr = rt->rt_src;
1338 	ipc.opt = NULL;
1339 
1340 	if (replyopts.opt.optlen) {
1341 		ipc.opt = &replyopts.opt;
1342 
1343 		if (ipc.opt->srr)
1344 			daddr = replyopts.opt.faddr;
1345 	}
1346 
1347 	{
1348 		struct flowi fl = { .nl_u = { .ip4_u =
1349 					      { .daddr = daddr,
1350 						.saddr = rt->rt_spec_dst,
1351 						.tos = RT_TOS(skb->nh.iph->tos) } },
1352 				    /* Not quite clean, but right. */
1353 				    .uli_u = { .ports =
1354 					       { .sport = skb->h.th->dest,
1355 					         .dport = skb->h.th->source } },
1356 				    .proto = sk->sk_protocol };
1357 		if (ip_route_output_key(&rt, &fl))
1358 			return;
1359 	}
1360 
1361 	/* And let IP do all the hard work.
1362 
1363 	   This chunk is not reentrant, hence the spinlock.
1364 	   Note that it relies on the fact that this function is called
1365 	   with BHs locally disabled and that sk cannot already be locked.
1366 	 */
1367 	bh_lock_sock(sk);
1368 	inet->tos = skb->nh.iph->tos;
1369 	sk->sk_priority = skb->priority;
1370 	sk->sk_protocol = skb->nh.iph->protocol;
1371 	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1372 		       &ipc, rt, MSG_DONTWAIT);
1373 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1374 		if (arg->csumoffset >= 0)
1375 			*((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1376 		skb->ip_summed = CHECKSUM_NONE;
1377 		ip_push_pending_frames(sk);
1378 	}
1379 
1380 	bh_unlock_sock(sk);
1381 
1382 	ip_rt_put(rt);
1383 }
1384 
1385 void __init ip_init(void)
1386 {
1387 	ip_rt_init();
1388 	inet_initpeers();
1389 
1390 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1391 	igmp_mc_proc_init();
1392 #endif
1393 }
1394 
1395 EXPORT_SYMBOL(ip_fragment);
1396 EXPORT_SYMBOL(ip_generic_getfrag);
1397 EXPORT_SYMBOL(ip_queue_xmit);
1398 EXPORT_SYMBOL(ip_send_check);
1399