xref: /linux/net/ipv4/ip_output.c (revision 36ca1195ad7f760a6af3814cb002bd3a3d4b4db1)
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>

/*
 *      Shall we try to damage output packets if routing dev changes?
 */

int sysctl_ip_dynaddr;
int sysctl_ip_default_ttl = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
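
/*
 * Illustrative sketch (editorial, not part of the original file):
 * ip_send_check() must run after every header field is final, since any
 * later change (TTL, fragment offset, ...) invalidates the checksum.
 * ip_fast_csum() sums iph->ihl 32-bit words, so options are covered
 * automatically once ihl has been adjusted:
 *
 *	iph->ttl      = 64;
 *	iph->protocol = IPPROTO_UDP;
 *	iph->tot_len  = htons(skb->len);
 *	ip_send_check(iph);
 *
 * A header with a correct check field re-checksums to zero, which is
 * how receivers verify it.
 */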

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

#ifdef CONFIG_NETFILTER_DEBUG
	nf_debug_ip_loopback_xmit(newskb);
#endif
	nf_reset(newskb);
	netif_rx(newskb);
	return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = dst_metric(dst, RTAX_HOPLIMIT);
	return ttl;
}
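
/*
 * Editorial note (hedged): uc_ttl stays negative unless the application
 * set an explicit unicast TTL, so most sockets fall through to the
 * RTAX_HOPLIMIT route metric, which is seeded from
 * sysctl_ip_default_ttl. A userspace override would look roughly like:
 *
 *	int ttl = 200;
 *	setsockopt(fd, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl));
 */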

/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  u32 saddr, u32 daddr, struct ip_options *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;

	/* Build the IP header. */
	if (opt)
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
	else
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	iph->tot_len  = htons(skb->len);
	ip_select_ident(iph, &rt->u.dst, sk);
	skb->nh.iph   = iph;

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}
	ip_send_check(iph);

	skb->priority = sk->sk_priority;

	/* Send it out. */
	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       dst_output);
}
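
/*
 * Usage sketch (editorial): callers hand in an skb that is already
 * routed (skb->dst set) and carries only the transport payload; TCP,
 * for instance, sends SYN-ACK segments this way. Hypothetical call:
 *
 *	skb->dst = dst_clone(&rt->u.dst);
 *	ip_build_and_send_pkt(skb, sk, rt->rt_src, rt->rt_dst, opt);
 *
 * Note that the saddr/daddr arguments feed only the option builder;
 * the header itself takes its addresses from the route (rt_src/rt_dst).
 */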

static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;
	struct net_device *dev = dst->dev;
	int hh_len = LL_RESERVED_SPACE(dev);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

#ifdef CONFIG_NETFILTER_DEBUG
	nf_debug_ip_finish_output2(skb);
#endif /*CONFIG_NETFILTER_DEBUG*/

	nf_reset(skb);

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}
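
/*
 * Editorial note: the two exits above are the fast and slow link-layer
 * paths. With a valid hh_cache the prebuilt hardware header is simply
 * memcpy'd in front of the IP header and hh_output() (typically
 * dev_queue_xmit) is called; without one, the neighbour entry's output
 * hook must resolve the L2 address first (ARP on Ethernet) and may
 * queue the packet until resolution completes.
 */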

int ip_finish_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
		       ip_finish_output2);
}

int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable*)skb->dst;
	struct net_device *dev = rt->u.dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
		) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (skb->nh.iph->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
				newskb->dev, ip_dev_loopback_xmit);
	}

	if (skb->len > dst_mtu(&rt->u.dst))
		return ip_fragment(skb, ip_finish_output);
	else
		return ip_finish_output(skb);
}

int ip_output(struct sk_buff *skb)
{
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
		return ip_fragment(skb, ip_finish_output);
	else
		return ip_finish_output(skb);
}
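
/*
 * Editorial sketch of the dispatch: dst_output() simply invokes
 * skb->dst->output(skb), which the routing code points at ip_output()
 * for unicast routes and ip_mc_output() for multicast/broadcast ones.
 * The local-output chain is therefore roughly:
 *
 *	ip_queue_xmit()
 *	    --NF_IP_LOCAL_OUT--> dst_output()
 *	    -> ip_output()
 *	    --NF_IP_POST_ROUTING--> ip_finish_output2()
 *
 * TSO frames (tso_size != 0) bypass ip_fragment() here because the
 * device or the software segmentation path splits them later.
 */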

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = inet->opt;
	struct rtable *rt;
	struct iphdr *iph;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rt = (struct rtable *) skb->dst;
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		u32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->daddr;
		if (opt && opt->srr)
			daddr = opt->faddr;

		{
			struct flowi fl = { .oif = sk->sk_bound_dev_if,
					    .nl_u = { .ip4_u =
						      { .daddr = daddr,
							.saddr = inet->saddr,
							.tos = RT_CONN_FLAGS(sk) } },
					    .proto = sk->sk_protocol,
					    .uli_u = { .ports =
						       { .sport = inet->sport,
							 .dport = inet->dport } } };

			/* If this fails, the transport layer's retransmit
			 * mechanism will keep trying until the route appears
			 * or the connection times itself out.
			 */
			if (ip_route_output_flow(&rt, &fl, sk, 0))
				goto no_route;
		}
		__sk_dst_set(sk, &rt->u.dst);
		tcp_v4_setup_caps(sk, &rt->u.dst);
	}
	skb->dst = dst_clone(&rt->u.dst);

packet_routed:
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	*((__u16 *)iph)	= htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	iph->tot_len = htons(skb->len);
	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	skb->nh.iph   = iph;
	/* The transport layer has set skb->h.foo itself. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);

	/* Add an IP checksum. */
	ip_send_check(iph);

	skb->priority = sk->sk_priority;

	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       dst_output);

no_route:
	IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
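
/*
 * Usage sketch (editorial): a connected transport builds the skb down
 * to its own header, attaches the socket, and lets ip_queue_xmit()
 * supply routing and the IP header; this is how TCP transmits data
 * segments. Hypothetical call site:
 *
 *	skb->h.th = th;			// transport header already built
 *	skb->sk   = sk;
 *	err = ip_queue_xmit(skb, 0);	// 0: honour PMTU, set DF if due
 *
 * The *(__u16 *)iph store above writes version, ihl and tos in one
 * 16-bit assignment: the byte 0x45 followed by the tos byte.
 */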


static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	to->security = from->security;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	to->nfcache = from->nfcache;
	/* Connection association is same as pre-frag packet */
	nf_conntrack_put(to->nfct);
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#ifdef CONFIG_NETFILTER_DEBUG
	to->nf_debug = from->nf_debug;
#endif
#endif
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each consisting of an IP header plus a block of
 *	the original datagram's data) that will fit into a single device
 *	frame, and queue such frames for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
	struct iphdr *iph;
	int raw = 0;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	int not_last_frag;
	struct rtable *rt = (struct rtable*)skb->dst;
	int err = 0;

	dev = rt->u.dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = skb->nh.iph;

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(dst_mtu(&rt->u.dst)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */

	/* When a frag_list is given, use it. First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited. In such a case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged with the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *frag;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
			    goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one has gone down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				frag->h.raw = frag->data;
				frag->nh.raw = __skb_push(frag, hlen);
				memcpy(frag->nh.raw, iph, hlen);
				iph = frag->nh.iph;
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = raw + hlen;		/* Where to start from */

#ifdef CONFIG_BRIDGE_NETFILTER
	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
	mtu -= nf_bridge_pad(skb);
#else
	ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
#endif
	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb2->nh.raw = skb2->data;
		skb2->h.raw = skb2->data + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		memcpy(skb2->nh.raw, skb->data, hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = skb2->nh.iph;
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Update the options only if
		 * the segment being fragmented was THE FIRST (otherwise the
		 * options are already fixed), and do it ONCE on the initial
		 * skb, so that all the following fragments will inherit the
		 * fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep the MF bit set on each
		 *		   fragment.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);

		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}
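
/*
 * Worked example (editorial): on a 1500-byte MTU route with a 20-byte
 * header (hlen = 20), the per-fragment data space is
 * mtu = 1500 - 20 = 1480 bytes, already a multiple of 8, so the
 * "len &= ~7" alignment changes nothing. A 4000-byte payload thus
 * leaves as three fragments carrying 1480, 1480 and 1040 data bytes,
 * with frag_off values 0, 185 and 370 (offsets are counted in 8-byte
 * units: 1480/8 = 185, 2960/8 = 370); MF is set on all but the last.
 */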

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_HW) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		unsigned int csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
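
/*
 * Editorial note: this is the stock getfrag callback for user data.
 * `from` is the sender's iovec; when the device will checksum the
 * packet (CHECKSUM_HW) a plain copy suffices, otherwise the checksum
 * is accumulated while copying. A hypothetical sendmsg() path would
 * pass it straight to ip_append_data():
 *
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov,
 *			     len, 0, &ipc, rt, msg->msg_flags);
 */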

static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	unsigned int csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece is held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Transport protocols other than UDP - e.g. raw sockets - can
 *	potentially use this interface as well.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable *rt,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = NULL;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking.
		 */
		opt = ipc->opt;
		if (opt) {
			if (inet->cork.opt == NULL) {
				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
				if (unlikely(inet->cork.opt == NULL))
					return -ENOBUFS;
			}
			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
			inet->cork.flags |= IPCORK_OPT;
			inet->cork.addr = ipc->addr;
		}
		dst_hold(&rt->u.dst);
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		inet->cork.rt = rt;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		if ((exthdrlen = rt->u.dst.header_len) != 0) {
			length += exthdrlen;
			transhdrlen += exthdrlen;
		}
	} else {
		rt = inet->cork.rt;
		if (inet->cork.flags & IPCORK_OPT)
			opt = inet->cork.opt;

		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}
	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it not to be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
	    !exthdrlen)
		csummode = CHECKSUM_HW;

	inet->cork.length += length;

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chain of skbs;
	 * each of them is an IP fragment ready to be sent to the network
	 * once the appropriate IP header has been added.
	 */

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length)
				alloclen += rt->u.dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}
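
/*
 * Usage sketch (editorial, modelled loosely on udp_sendmsg()): append
 * one or more chunks while the socket is corked, then flush:
 *
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov,
 *			     ulen, sizeof(struct udphdr), &ipc, rt,
 *			     corked ? msg->msg_flags | MSG_MORE
 *				    : msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip_push_pending_frames(sk);
 *
 * `corked` and `ulen` are hypothetical names; the point is that data
 * accumulates on sk_write_queue until ip_push_pending_frames() runs.
 */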

ssize_t	ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = inet->cork.rt;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->u.dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
	mtu = inet->cork.fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	inet->cork.length += size;

	while (size > 0) {
		int i;

		/* Check if the remaining data fits into current packet. */
		len = mtu - skb->len;
		if (len < size)
			len = maxfraglen - skb->len;
		if (len <= 0) {
			struct sk_buff *skb_prev;
			char *data;
			struct iphdr *iph;
			int alloclen;

			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fragheaderlen + fraggap);
			skb->nh.iph = iph = (struct iphdr *)data;
			data += fragheaderlen;
			skb->h.raw = data;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				skb_trim(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			unsigned int csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		offset += len;
		size -= len;
	}
	return 0;

error:
	inet->cork.length -= size;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}
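
/*
 * Editorial note: this is the zero-copy path behind sendfile() on a
 * corked UDP socket (cf. the changelog entry above): the page is
 * referenced rather than copied, which is why scatter/gather support
 * (NETIF_F_SG) is mandatory, and why the caller must have started the
 * datagram with ip_append_data() first (the queue must be non-empty).
 */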

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = inet->cork.rt;
	struct iphdr *iph;
	int df = 0;
	__u8 ttl;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to the ip header (past any ext header) */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO),
	 * we allow fragmenting the frame generated here. No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc != IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* The DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc == IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->u.dst) &&
	     ip_dont_fragment(sk, &rt->u.dst)))
		df = htons(IP_DF);

	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->u.dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->tot_len = htons(skb->len);
	iph->frag_off = df;
	if (!df) {
		__ip_select_ident(iph, &rt->u.dst, 0);
	} else {
		iph->id = htons(inet->id++);
	}
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;
	ip_send_check(iph);

	skb->priority = sk->sk_priority;
	skb->dst = dst_clone(&rt->u.dst);

	/* Netfilter gets the whole, not yet fragmented skb. */
	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
		      skb->dst->dev, dst_output);
	if (err) {
		if (err > 0)
			err = inet->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	if (inet->cork.opt) {
		kfree(inet->cork.opt);
		inet->cork.opt = NULL;
	}
	if (inet->cork.rt) {
		ip_rt_put(inet->cork.rt);
		inet->cork.rt = NULL;
	}
	return err;

error:
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

/*
 *	Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
		kfree_skb(skb);

	inet->cork.flags &= ~IPCORK_OPT;
	if (inet->cork.opt) {
		kfree(inet->cork.opt);
		inet->cork.opt = NULL;
	}
	if (inet->cork.rt) {
		ip_rt_put(inet->cork.rt);
		inet->cork.rt = NULL;
	}
}


/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	unsigned int csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as a reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 *
 *	LATER: switch from ip_build_xmit to ip_append_*
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct {
		struct ip_options	opt;
		char			data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	u32 daddr;
	struct rtable *rt = (struct rtable*)skb->dst;

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	{
		struct flowi fl = { .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = rt->rt_spec_dst,
						.tos = RT_TOS(skb->nh.iph->tos) } },
				    /* Not quite clean, but right. */
				    .uli_u = { .ports =
					       { .sport = skb->h.th->dest,
					         .dport = skb->h.th->source } },
				    .proto = sk->sk_protocol };
		if (ip_route_output_key(&rt, &fl))
			return;
	}

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence the spinlock.
	   Note that it uses the fact that this function is called
	   with locally disabled BH and that sk cannot already be
	   spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = skb->nh.iph->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = skb->nh.iph->protocol;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}
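
/*
 * Usage sketch (editorial, hedged): TCP's reset path fills a struct
 * ip_reply_arg with the prebuilt reply header and a partial checksum,
 * then calls this on a dedicated control socket, roughly:
 *
 *	struct ip_reply_arg arg;
 *	arg.iov[0].iov_base = &rep;		// reply TCP header
 *	arg.iov[0].iov_len  = sizeof(rep);
 *	arg.csum = csum_partial(&rep, sizeof(rep), 0);
 *	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 *	ip_send_reply(ctl_sk, skb, &arg, sizeof(rep));
 *
 * `rep` and `ctl_sk` are hypothetical names; csumoffset is counted in
 * 16-bit words, matching the (u16 *) arithmetic above, and the reply's
 * addresses come from the packet being answered, swapped, as built in
 * the flowi above.
 */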

/*
 *	IP protocol layer initialiser
 */

static struct packet_type ip_packet_type = {
	.type = __constant_htons(ETH_P_IP),
	.func = ip_rcv,
};

/*
 *	IP registers the packet type and then calls the subprotocol initialisers
 */

void __init ip_init(void)
{
	dev_add_pack(&ip_packet_type);

	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_finish_output);
EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);

#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_ip_default_ttl);
#endif