xref: /linux/net/ipv4/ip_output.c (revision a33f32244d8550da8b4a26e277ce07d5c6d158b5)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		The Internet Protocol (IP) output module.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Donald Becker, <becker@super.org>
11  *		Alan Cox, <Alan.Cox@linux.org>
12  *		Richard Underwood
13  *		Stefan Becker, <stefanb@yello.ping.de>
14  *		Jorge Cwik, <jorge@laser.satlink.net>
15  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16  *		Hirokazu Takahashi, <taka@valinux.co.jp>
17  *
18  *	See ip_input.c for original log
19  *
20  *	Fixes:
21  *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22  *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23  *		Bradford Johnson:	Fix faulty handling of some frames when
24  *					no route is found.
25  *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26  *					(in case a packet is not accepted by
27  *					output firewall rules)
28  *		Mike McLagan	:	Routing by source
29  *		Alexey Kuznetsov:	use new route cache
30  *		Andi Kleen:		Fix broken PMTU recovery and remove
31  *					some redundant tests.
32  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
33  *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34  *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35  *					for decreased register pressure on x86
36  *					and more readability.
37  *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38  *					silently drop skb instead of failing with -EPERM.
39  *		Detlev Wengorz	:	Copy protocol for fragments.
40  *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41  *					datagrams.
42  *		Hirokazu Takahashi:	sendfile() on UDP works now.
43  */
44 
45 #include <asm/uaccess.h>
46 #include <asm/system.h>
47 #include <linux/module.h>
48 #include <linux/types.h>
49 #include <linux/kernel.h>
50 #include <linux/mm.h>
51 #include <linux/string.h>
52 #include <linux/errno.h>
53 #include <linux/highmem.h>
54 #include <linux/slab.h>
55 
56 #include <linux/socket.h>
57 #include <linux/sockios.h>
58 #include <linux/in.h>
59 #include <linux/inet.h>
60 #include <linux/netdevice.h>
61 #include <linux/etherdevice.h>
62 #include <linux/proc_fs.h>
63 #include <linux/stat.h>
64 #include <linux/init.h>
65 
66 #include <net/snmp.h>
67 #include <net/ip.h>
68 #include <net/protocol.h>
69 #include <net/route.h>
70 #include <net/xfrm.h>
71 #include <linux/skbuff.h>
72 #include <net/sock.h>
73 #include <net/arp.h>
74 #include <net/icmp.h>
75 #include <net/checksum.h>
76 #include <net/inetpeer.h>
77 #include <linux/igmp.h>
78 #include <linux/netfilter_ipv4.h>
79 #include <linux/netfilter_bridge.h>
80 #include <linux/mroute.h>
81 #include <linux/netlink.h>
82 #include <linux/tcp.h>
83 
84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85 
86 /* Generate a checksum for an outgoing IP datagram. */
87 __inline__ void ip_send_check(struct iphdr *iph)
88 {
89 	iph->check = 0;
90 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
91 }
92 
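/*
 *	Fill in the total length and checksum of the IP header, then pass the
 *	skb through the NF_INET_LOCAL_OUT netfilter hook with dst_output() as
 *	the continuation function.
 */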
93 int __ip_local_out(struct sk_buff *skb)
94 {
95 	struct iphdr *iph = ip_hdr(skb);
96 
97 	iph->tot_len = htons(skb->len);
98 	ip_send_check(iph);
99 	return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
100 		       dst_output);
101 }
102 
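/*
 *	Run the LOCAL_OUT hook via __ip_local_out(); if the hook accepted the
 *	packet (return value 1), hand it on to dst_output().
 */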
103 int ip_local_out(struct sk_buff *skb)
104 {
105 	int err;
106 
107 	err = __ip_local_out(skb);
108 	if (likely(err == 1))
109 		err = dst_output(skb);
110 
111 	return err;
112 }
113 EXPORT_SYMBOL_GPL(ip_local_out);
114 
115 /* dev_loopback_xmit for use with netfilter. */
116 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
117 {
118 	skb_reset_mac_header(newskb);
119 	__skb_pull(newskb, skb_network_offset(newskb));
120 	newskb->pkt_type = PACKET_LOOPBACK;
121 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
122 	WARN_ON(!skb_dst(newskb));
123 	netif_rx_ni(newskb);
124 	return 0;
125 }
126 
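/* Use the socket's unicast TTL if one has been set, otherwise fall back to
 * the hop-limit metric of the route.
 */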
127 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
128 {
129 	int ttl = inet->uc_ttl;
130 
131 	if (ttl < 0)
132 		ttl = dst_metric(dst, RTAX_HOPLIMIT);
133 	return ttl;
134 }
135 
136 /*
137  *		Add an IP header to an skbuff and send it out.
138  *
139  */
140 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
141 			  __be32 saddr, __be32 daddr, struct ip_options *opt)
142 {
143 	struct inet_sock *inet = inet_sk(sk);
144 	struct rtable *rt = skb_rtable(skb);
145 	struct iphdr *iph;
146 
147 	/* Build the IP header. */
148 	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
149 	skb_reset_network_header(skb);
150 	iph = ip_hdr(skb);
151 	iph->version  = 4;
152 	iph->ihl      = 5;
153 	iph->tos      = inet->tos;
154 	if (ip_dont_fragment(sk, &rt->u.dst))
155 		iph->frag_off = htons(IP_DF);
156 	else
157 		iph->frag_off = 0;
158 	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
159 	iph->daddr    = rt->rt_dst;
160 	iph->saddr    = rt->rt_src;
161 	iph->protocol = sk->sk_protocol;
162 	ip_select_ident(iph, &rt->u.dst, sk);
163 
164 	if (opt && opt->optlen) {
165 		iph->ihl += opt->optlen>>2;
166 		ip_options_build(skb, opt, daddr, rt, 0);
167 	}
168 
169 	skb->priority = sk->sk_priority;
170 	skb->mark = sk->sk_mark;
171 
172 	/* Send it out. */
173 	return ip_local_out(skb);
174 }
175 
176 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
177 
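/*
 *	Last step of the output path: account multicast/broadcast statistics,
 *	reallocate the skb if there is not enough headroom for the link-layer
 *	header, then hand the packet to the cached hardware header (dst->hh)
 *	or the neighbour output function.
 */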
178 static inline int ip_finish_output2(struct sk_buff *skb)
179 {
180 	struct dst_entry *dst = skb_dst(skb);
181 	struct rtable *rt = (struct rtable *)dst;
182 	struct net_device *dev = dst->dev;
183 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
184 
185 	if (rt->rt_type == RTN_MULTICAST) {
186 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
187 	} else if (rt->rt_type == RTN_BROADCAST)
188 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
189 
190 	/* Be paranoid, rather than too clever. */
191 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
192 		struct sk_buff *skb2;
193 
194 		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
195 		if (skb2 == NULL) {
196 			kfree_skb(skb);
197 			return -ENOMEM;
198 		}
199 		if (skb->sk)
200 			skb_set_owner_w(skb2, skb->sk);
201 		kfree_skb(skb);
202 		skb = skb2;
203 	}
204 
205 	if (dst->hh)
206 		return neigh_hh_output(dst->hh, skb);
207 	else if (dst->neighbour)
208 		return dst->neighbour->output(skb);
209 
210 	if (net_ratelimit())
211 		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
212 	kfree_skb(skb);
213 	return -EINVAL;
214 }
215 
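/* MTU used for fragmentation decisions: the device MTU when the socket is
 * probing the path MTU (IP_PMTUDISC_PROBE), otherwise the dst's MTU.
 */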
216 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
217 {
218 	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
219 
220 	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
221 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
222 }
223 
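/*
 *	Called from the POST_ROUTING hook: re-run dst_output() if an xfrm
 *	policy was attached after SNAT, fragment packets that exceed the MTU
 *	and are not GSO, and pass everything else to ip_finish_output2().
 */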
224 static int ip_finish_output(struct sk_buff *skb)
225 {
226 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
227 	/* Policy lookup after SNAT yielded a new policy */
228 	if (skb_dst(skb)->xfrm != NULL) {
229 		IPCB(skb)->flags |= IPSKB_REROUTED;
230 		return dst_output(skb);
231 	}
232 #endif
233 	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
234 		return ip_fragment(skb, ip_finish_output2);
235 	else
236 		return ip_finish_output2(skb);
237 }
238 
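/*
 *	Output path for multicast and broadcast packets: loop a copy back to
 *	local listeners where required, drop multicasts with TTL 0, then send
 *	the original skb through the POST_ROUTING hook.
 */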
239 int ip_mc_output(struct sk_buff *skb)
240 {
241 	struct sock *sk = skb->sk;
242 	struct rtable *rt = skb_rtable(skb);
243 	struct net_device *dev = rt->u.dst.dev;
244 
245 	/*
246 	 *	If the indicated interface is up and running, send the packet.
247 	 */
248 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
249 
250 	skb->dev = dev;
251 	skb->protocol = htons(ETH_P_IP);
252 
253 	/*
254 	 *	Multicasts are looped back for other local users
255 	 */
256 
257 	if (rt->rt_flags&RTCF_MULTICAST) {
258 		if (sk_mc_loop(sk)
259 #ifdef CONFIG_IP_MROUTE
260 		/* Small optimization: do not loop back non-local frames
261 		   that came back after forwarding; they will be dropped
262 		   by ip_mr_input in any case.
263 		   Note that local frames are looped back to be delivered
264 		   to local recipients.
265 
266 		   This check is duplicated in ip_mr_input at the moment.
267 		 */
268 		    &&
269 		    ((rt->rt_flags & RTCF_LOCAL) ||
270 		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
271 #endif
272 		   ) {
273 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
274 			if (newskb)
275 				NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
276 					NULL, newskb->dev,
277 					ip_dev_loopback_xmit);
278 		}
279 
280 		/* Multicasts with ttl 0 must not go beyond the host */
281 
282 		if (ip_hdr(skb)->ttl == 0) {
283 			kfree_skb(skb);
284 			return 0;
285 		}
286 	}
287 
288 	if (rt->rt_flags&RTCF_BROADCAST) {
289 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
290 		if (newskb)
291 			NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
292 				newskb->dev, ip_dev_loopback_xmit);
293 	}
294 
295 	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
296 			    ip_finish_output,
297 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
298 }
299 
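/*
 *	Standard output path for locally generated unicast packets: update the
 *	output statistics, set the outgoing device and protocol, then run the
 *	POST_ROUTING hook before ip_finish_output().
 */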
300 int ip_output(struct sk_buff *skb)
301 {
302 	struct net_device *dev = skb_dst(skb)->dev;
303 
304 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
305 
306 	skb->dev = dev;
307 	skb->protocol = htons(ETH_P_IP);
308 
309 	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
310 			    ip_finish_output,
311 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
312 }
313 
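/*
 *	Transmit an skb on a connected socket (e.g. TCP): route the packet if
 *	it has not been routed already, build the IP header from the socket
 *	and route state, and send it via ip_local_out().  Returns
 *	-EHOSTUNREACH if no route can be found.
 */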
314 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
315 {
316 	struct sock *sk = skb->sk;
317 	struct inet_sock *inet = inet_sk(sk);
318 	struct ip_options *opt = inet->opt;
319 	struct rtable *rt;
320 	struct iphdr *iph;
321 
322 	/* Skip all of this if the packet is already routed,
323 	 * e.g. by something like SCTP.
324 	 */
325 	rt = skb_rtable(skb);
326 	if (rt != NULL)
327 		goto packet_routed;
328 
329 	/* Make sure we can route this packet. */
330 	rt = (struct rtable *)__sk_dst_check(sk, 0);
331 	if (rt == NULL) {
332 		__be32 daddr;
333 
334 		/* Use correct destination address if we have options. */
335 		daddr = inet->inet_daddr;
336 		if(opt && opt->srr)
337 			daddr = opt->faddr;
338 
339 		{
340 			struct flowi fl = { .oif = sk->sk_bound_dev_if,
341 					    .mark = sk->sk_mark,
342 					    .nl_u = { .ip4_u =
343 						      { .daddr = daddr,
344 							.saddr = inet->inet_saddr,
345 							.tos = RT_CONN_FLAGS(sk) } },
346 					    .proto = sk->sk_protocol,
347 					    .flags = inet_sk_flowi_flags(sk),
348 					    .uli_u = { .ports =
349 						       { .sport = inet->inet_sport,
350 							 .dport = inet->inet_dport } } };
351 
352 			/* If this fails, the transport layer's retransmit mechanism
353 			 * will keep trying until a route appears or the connection
354 			 * times itself out.
355 			 */
356 			security_sk_classify_flow(sk, &fl);
357 			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
358 				goto no_route;
359 		}
360 		sk_setup_caps(sk, &rt->u.dst);
361 	}
362 	skb_dst_set(skb, dst_clone(&rt->u.dst));
363 
364 packet_routed:
365 	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
366 		goto no_route;
367 
368 	/* OK, we know where to send it, allocate and build IP header. */
369 	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
370 	skb_reset_network_header(skb);
371 	iph = ip_hdr(skb);
372 	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
373 	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
374 		iph->frag_off = htons(IP_DF);
375 	else
376 		iph->frag_off = 0;
377 	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
378 	iph->protocol = sk->sk_protocol;
379 	iph->saddr    = rt->rt_src;
380 	iph->daddr    = rt->rt_dst;
381 	/* The transport layer has already set the transport header itself. */
382 
383 	if (opt && opt->optlen) {
384 		iph->ihl += opt->optlen >> 2;
385 		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
386 	}
387 
388 	ip_select_ident_more(iph, &rt->u.dst, sk,
389 			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
390 
391 	skb->priority = sk->sk_priority;
392 	skb->mark = sk->sk_mark;
393 
394 	return ip_local_out(skb);
395 
396 no_route:
397 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
398 	kfree_skb(skb);
399 	return -EHOSTUNREACH;
400 }
401 
402 
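/* Propagate per-packet metadata (type, priority, dst, device, marks and
 * netfilter/scheduling state) from the original skb to a fragment.
 */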
403 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
404 {
405 	to->pkt_type = from->pkt_type;
406 	to->priority = from->priority;
407 	to->protocol = from->protocol;
408 	skb_dst_drop(to);
409 	skb_dst_set(to, dst_clone(skb_dst(from)));
410 	to->dev = from->dev;
411 	to->mark = from->mark;
412 
413 	/* Copy the flags to each fragment. */
414 	IPCB(to)->flags = IPCB(from)->flags;
415 
416 #ifdef CONFIG_NET_SCHED
417 	to->tc_index = from->tc_index;
418 #endif
419 	nf_copy(to, from);
420 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
421     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
422 	to->nf_trace = from->nf_trace;
423 #endif
424 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
425 	to->ipvs_property = from->ipvs_property;
426 #endif
427 	skb_copy_secmark(to, from);
428 }
429 
430 /*
431  *	This IP datagram is too large to be sent in one piece.  Break it up into
432  *	smaller pieces (each the size of the IP header plus a block of the
433  *	original payload) that will still fit into a single device frame,
434  *	and queue such a frame for sending.
435  */
436 
437 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
438 {
439 	struct iphdr *iph;
440 	int raw = 0;
441 	int ptr;
442 	struct net_device *dev;
443 	struct sk_buff *skb2;
444 	unsigned int mtu, hlen, left, len, ll_rs, pad;
445 	int offset;
446 	__be16 not_last_frag;
447 	struct rtable *rt = skb_rtable(skb);
448 	int err = 0;
449 
450 	dev = rt->u.dst.dev;
451 
452 	/*
453 	 *	Point into the IP datagram header.
454 	 */
455 
456 	iph = ip_hdr(skb);
457 
458 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
459 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
460 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
461 			  htonl(ip_skb_dst_mtu(skb)));
462 		kfree_skb(skb);
463 		return -EMSGSIZE;
464 	}
465 
466 	/*
467 	 *	Setup starting values.
468 	 */
469 
470 	hlen = iph->ihl * 4;
471 	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
472 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
473 
474 	/* When frag_list is given, use it. First, check its validity:
475 	 * some transformers could create a wrong frag_list or break an
476 	 * existing one; that is not prohibited. In this case fall back to copying.
477 	 *
478 	 * LATER: this step can be merged into the real generation of fragments;
479 	 * we can switch to copying when we see the first bad fragment.
480 	 */
481 	if (skb_has_frags(skb)) {
482 		struct sk_buff *frag;
483 		int first_len = skb_pagelen(skb);
484 		int truesizes = 0;
485 
486 		if (first_len - hlen > mtu ||
487 		    ((first_len - hlen) & 7) ||
488 		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
489 		    skb_cloned(skb))
490 			goto slow_path;
491 
492 		skb_walk_frags(skb, frag) {
493 			/* Correct geometry. */
494 			if (frag->len > mtu ||
495 			    ((frag->len & 7) && frag->next) ||
496 			    skb_headroom(frag) < hlen)
497 			    goto slow_path;
498 
499 			/* Partially cloned skb? */
500 			if (skb_shared(frag))
501 				goto slow_path;
502 
503 			BUG_ON(frag->sk);
504 			if (skb->sk) {
505 				frag->sk = skb->sk;
506 				frag->destructor = sock_wfree;
507 			}
508 			truesizes += frag->truesize;
509 		}
510 
511 		/* Everything is OK. Generate! */
512 
513 		err = 0;
514 		offset = 0;
515 		frag = skb_shinfo(skb)->frag_list;
516 		skb_frag_list_init(skb);
517 		skb->data_len = first_len - skb_headlen(skb);
518 		skb->truesize -= truesizes;
519 		skb->len = first_len;
520 		iph->tot_len = htons(first_len);
521 		iph->frag_off = htons(IP_MF);
522 		ip_send_check(iph);
523 
524 		for (;;) {
525 			/* Prepare the header of the next frame,
526 			 * before the previous one goes down. */
527 			if (frag) {
528 				frag->ip_summed = CHECKSUM_NONE;
529 				skb_reset_transport_header(frag);
530 				__skb_push(frag, hlen);
531 				skb_reset_network_header(frag);
532 				memcpy(skb_network_header(frag), iph, hlen);
533 				iph = ip_hdr(frag);
534 				iph->tot_len = htons(frag->len);
535 				ip_copy_metadata(frag, skb);
536 				if (offset == 0)
537 					ip_options_fragment(frag);
538 				offset += skb->len - hlen;
539 				iph->frag_off = htons(offset>>3);
540 				if (frag->next != NULL)
541 					iph->frag_off |= htons(IP_MF);
542 				/* Ready, complete checksum */
543 				ip_send_check(iph);
544 			}
545 
546 			err = output(skb);
547 
548 			if (!err)
549 				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
550 			if (err || !frag)
551 				break;
552 
553 			skb = frag;
554 			frag = skb->next;
555 			skb->next = NULL;
556 		}
557 
558 		if (err == 0) {
559 			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
560 			return 0;
561 		}
562 
563 		while (frag) {
564 			skb = frag->next;
565 			kfree_skb(frag);
566 			frag = skb;
567 		}
568 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
569 		return err;
570 	}
571 
572 slow_path:
573 	left = skb->len - hlen;		/* Space per frame */
574 	ptr = raw + hlen;		/* Where to start from */
575 
576 	/* for bridged IP traffic encapsulated inside e.g. a VLAN header,
577 	 * we need to make room for the encapsulating header
578 	 */
579 	pad = nf_bridge_pad(skb);
580 	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
581 	mtu -= pad;
582 
583 	/*
584 	 *	Fragment the datagram.
585 	 */
586 
587 	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
588 	not_last_frag = iph->frag_off & htons(IP_MF);
589 
590 	/*
591 	 *	Keep copying data until we run out.
592 	 */
593 
594 	while (left > 0) {
595 		len = left;
596 		/* IF: it doesn't fit, use 'mtu' - the data space left */
597 		if (len > mtu)
598 			len = mtu;
599 		/* IF: we are not sending up to and including the packet end
600 		   then align the next start on an eight byte boundary */
601 		if (len < left)	{
602 			len &= ~7;
603 		}
604 		/*
605 		 *	Allocate buffer.
606 		 */
607 
608 		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
609 			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
610 			err = -ENOMEM;
611 			goto fail;
612 		}
613 
614 		/*
615 		 *	Set up data on packet
616 		 */
617 
618 		ip_copy_metadata(skb2, skb);
619 		skb_reserve(skb2, ll_rs);
620 		skb_put(skb2, len + hlen);
621 		skb_reset_network_header(skb2);
622 		skb2->transport_header = skb2->network_header + hlen;
623 
624 		/*
625 		 *	Charge the memory for the fragment to any owner
626 		 *	it might possess
627 		 */
628 
629 		if (skb->sk)
630 			skb_set_owner_w(skb2, skb->sk);
631 
632 		/*
633 		 *	Copy the packet header into the new buffer.
634 		 */
635 
636 		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
637 
638 		/*
639 		 *	Copy a block of the IP datagram.
640 		 */
641 		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
642 			BUG();
643 		left -= len;
644 
645 		/*
646 		 *	Fill in the new header fields.
647 		 */
648 		iph = ip_hdr(skb2);
649 		iph->frag_off = htons((offset >> 3));
650 
651 		/* ANK: dirty, but effective trick. Upgrade options only if
652 		 * the segment to be fragmented was THE FIRST (otherwise,
653 		 * options are already fixed) and make it ONCE
654 		 * on the initial skb, so that all the following fragments
655 		 * will inherit fixed options.
656 		 */
657 		if (offset == 0)
658 			ip_options_fragment(skb);
659 
660 		/*
661 		 *	Added AC : If we are fragmenting a fragment that's not the
662 		 *		   last fragment then keep the MF bit set on each fragment
663 		 */
664 		if (left > 0 || not_last_frag)
665 			iph->frag_off |= htons(IP_MF);
666 		ptr += len;
667 		offset += len;
668 
669 		/*
670 		 *	Put this fragment into the sending queue.
671 		 */
672 		iph->tot_len = htons(len + hlen);
673 
674 		ip_send_check(iph);
675 
676 		err = output(skb2);
677 		if (err)
678 			goto fail;
679 
680 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
681 	}
682 	kfree_skb(skb);
683 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
684 	return err;
685 
686 fail:
687 	kfree_skb(skb);
688 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
689 	return err;
690 }
691 
692 EXPORT_SYMBOL(ip_fragment);
693 
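/*
 *	getfrag() helper for user data held in an iovec: copy 'len' bytes into
 *	the skb at 'to', computing the checksum on the fly unless the hardware
 *	will checksum the packet (CHECKSUM_PARTIAL).
 */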
694 int
695 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
696 {
697 	struct iovec *iov = from;
698 
699 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
700 		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
701 			return -EFAULT;
702 	} else {
703 		__wsum csum = 0;
704 		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
705 			return -EFAULT;
706 		skb->csum = csum_block_add(skb->csum, csum, odd);
707 	}
708 	return 0;
709 }
710 
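/* Checksum 'copy' bytes of a (possibly highmem) page starting at 'offset'. */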
711 static inline __wsum
712 csum_page(struct page *page, int offset, int copy)
713 {
714 	char *kaddr;
715 	__wsum csum;
716 	kaddr = kmap(page);
717 	csum = csum_partial(kaddr + offset, copy, 0);
718 	kunmap(page);
719 	return csum;
720 }
721 
722 static inline int ip_ufo_append_data(struct sock *sk,
723 			int getfrag(void *from, char *to, int offset, int len,
724 			       int odd, struct sk_buff *skb),
725 			void *from, int length, int hh_len, int fragheaderlen,
726 			int transhdrlen, int mtu, unsigned int flags)
727 {
728 	struct sk_buff *skb;
729 	int err;
730 
731 	/* The network device supports UDP fragmentation offload, so
732 	 * create one single skb containing the complete UDP
733 	 * datagram.
734 	 */
735 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
736 		skb = sock_alloc_send_skb(sk,
737 			hh_len + fragheaderlen + transhdrlen + 20,
738 			(flags & MSG_DONTWAIT), &err);
739 
740 		if (skb == NULL)
741 			return err;
742 
743 		/* reserve space for Hardware header */
744 		skb_reserve(skb, hh_len);
745 
746 		/* create space for UDP/IP header */
747 		skb_put(skb, fragheaderlen + transhdrlen);
748 
749 		/* initialize network header pointer */
750 		skb_reset_network_header(skb);
751 
752 		/* initialize protocol header pointer */
753 		skb->transport_header = skb->network_header + fragheaderlen;
754 
755 		skb->ip_summed = CHECKSUM_PARTIAL;
756 		skb->csum = 0;
757 		sk->sk_sndmsg_off = 0;
758 
759 		/* specify the length of each IP datagram fragment */
760 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
761 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
762 		__skb_queue_tail(&sk->sk_write_queue, skb);
763 	}
764 
765 	return skb_append_datato_frags(sk, skb, getfrag, from,
766 				       (length - transhdrlen));
767 }
768 
769 /*
770  *	ip_append_data() and ip_append_page() can make one large IP datagram
771  *	from many pieces of data. Each piece will be held on the socket
772  *	until ip_push_pending_frames() is called. Each piece can be a page
773  *	or non-page data.
774  *
775  *	Not only UDP but other transport protocols - e.g. raw sockets - can
776  *	potentially use this interface.
777  *
778  *	LATER: length must be adjusted by pad at tail, when it is required.
779  */
780 int ip_append_data(struct sock *sk,
781 		   int getfrag(void *from, char *to, int offset, int len,
782 			       int odd, struct sk_buff *skb),
783 		   void *from, int length, int transhdrlen,
784 		   struct ipcm_cookie *ipc, struct rtable **rtp,
785 		   unsigned int flags)
786 {
787 	struct inet_sock *inet = inet_sk(sk);
788 	struct sk_buff *skb;
789 
790 	struct ip_options *opt = NULL;
791 	int hh_len;
792 	int exthdrlen;
793 	int mtu;
794 	int copy;
795 	int err;
796 	int offset = 0;
797 	unsigned int maxfraglen, fragheaderlen;
798 	int csummode = CHECKSUM_NONE;
799 	struct rtable *rt;
800 
801 	if (flags&MSG_PROBE)
802 		return 0;
803 
804 	if (skb_queue_empty(&sk->sk_write_queue)) {
805 		/*
806 		 * setup for corking.
807 		 */
808 		opt = ipc->opt;
809 		if (opt) {
810 			if (inet->cork.opt == NULL) {
811 				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
812 				if (unlikely(inet->cork.opt == NULL))
813 					return -ENOBUFS;
814 			}
815 			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
816 			inet->cork.flags |= IPCORK_OPT;
817 			inet->cork.addr = ipc->addr;
818 		}
819 		rt = *rtp;
820 		if (unlikely(!rt))
821 			return -EFAULT;
822 		/*
823 		 * We steal the reference to this route; the caller should not release it
824 		 */
825 		*rtp = NULL;
826 		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
827 					    rt->u.dst.dev->mtu :
828 					    dst_mtu(rt->u.dst.path);
829 		inet->cork.dst = &rt->u.dst;
830 		inet->cork.length = 0;
831 		sk->sk_sndmsg_page = NULL;
832 		sk->sk_sndmsg_off = 0;
833 		if ((exthdrlen = rt->u.dst.header_len) != 0) {
834 			length += exthdrlen;
835 			transhdrlen += exthdrlen;
836 		}
837 	} else {
838 		rt = (struct rtable *)inet->cork.dst;
839 		if (inet->cork.flags & IPCORK_OPT)
840 			opt = inet->cork.opt;
841 
842 		transhdrlen = 0;
843 		exthdrlen = 0;
844 		mtu = inet->cork.fragsize;
845 	}
846 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
847 
848 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
849 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
850 
851 	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
852 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
853 			       mtu-exthdrlen);
854 		return -EMSGSIZE;
855 	}
856 
857 	/*
858 	 * transhdrlen > 0 means that this is the first fragment and we wish
859 	 * it not to be fragmented later.
860 	 */
861 	if (transhdrlen &&
862 	    length + fragheaderlen <= mtu &&
863 	    rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
864 	    !exthdrlen)
865 		csummode = CHECKSUM_PARTIAL;
866 
867 	inet->cork.length += length;
868 	if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) &&
869 	    (sk->sk_protocol == IPPROTO_UDP) &&
870 	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
871 		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
872 					 fragheaderlen, transhdrlen, mtu,
873 					 flags);
874 		if (err)
875 			goto error;
876 		return 0;
877 	}
878 
879 	/* So, what's going on in the loop below?
880 	 *
881 	 * We use the calculated fragment length to generate a chain of skbs;
882 	 * each segment is an IP fragment ready for sending to the network
883 	 * once the appropriate IP header has been added.
884 	 */
885 
886 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
887 		goto alloc_new_skb;
888 
889 	while (length > 0) {
890 		/* Check if the remaining data fits into current packet. */
891 		copy = mtu - skb->len;
892 		if (copy < length)
893 			copy = maxfraglen - skb->len;
894 		if (copy <= 0) {
895 			char *data;
896 			unsigned int datalen;
897 			unsigned int fraglen;
898 			unsigned int fraggap;
899 			unsigned int alloclen;
900 			struct sk_buff *skb_prev;
901 alloc_new_skb:
902 			skb_prev = skb;
903 			if (skb_prev)
904 				fraggap = skb_prev->len - maxfraglen;
905 			else
906 				fraggap = 0;
907 
908 			/*
909 			 * If remaining data exceeds the mtu,
910 			 * we know we need more fragment(s).
911 			 */
912 			datalen = length + fraggap;
913 			if (datalen > mtu - fragheaderlen)
914 				datalen = maxfraglen - fragheaderlen;
915 			fraglen = datalen + fragheaderlen;
916 
917 			if ((flags & MSG_MORE) &&
918 			    !(rt->u.dst.dev->features&NETIF_F_SG))
919 				alloclen = mtu;
920 			else
921 				alloclen = datalen + fragheaderlen;
922 
923 			/* The last fragment gets additional space at tail.
924 			 * Note, with MSG_MORE we overallocate on fragments,
925 			 * because we have no idea which fragment will be
926 			 * the last.
927 			 */
928 			if (datalen == length + fraggap)
929 				alloclen += rt->u.dst.trailer_len;
930 
931 			if (transhdrlen) {
932 				skb = sock_alloc_send_skb(sk,
933 						alloclen + hh_len + 15,
934 						(flags & MSG_DONTWAIT), &err);
935 			} else {
936 				skb = NULL;
937 				if (atomic_read(&sk->sk_wmem_alloc) <=
938 				    2 * sk->sk_sndbuf)
939 					skb = sock_wmalloc(sk,
940 							   alloclen + hh_len + 15, 1,
941 							   sk->sk_allocation);
942 				if (unlikely(skb == NULL))
943 					err = -ENOBUFS;
944 				else
945 					/* only the initial fragment is
946 					   time stamped */
947 					ipc->shtx.flags = 0;
948 			}
949 			if (skb == NULL)
950 				goto error;
951 
952 			/*
953 			 *	Fill in the control structures
954 			 */
955 			skb->ip_summed = csummode;
956 			skb->csum = 0;
957 			skb_reserve(skb, hh_len);
958 			*skb_tx(skb) = ipc->shtx;
959 
960 			/*
961 			 *	Find where to start putting bytes.
962 			 */
963 			data = skb_put(skb, fraglen);
964 			skb_set_network_header(skb, exthdrlen);
965 			skb->transport_header = (skb->network_header +
966 						 fragheaderlen);
967 			data += fragheaderlen;
968 
969 			if (fraggap) {
970 				skb->csum = skb_copy_and_csum_bits(
971 					skb_prev, maxfraglen,
972 					data + transhdrlen, fraggap, 0);
973 				skb_prev->csum = csum_sub(skb_prev->csum,
974 							  skb->csum);
975 				data += fraggap;
976 				pskb_trim_unique(skb_prev, maxfraglen);
977 			}
978 
979 			copy = datalen - transhdrlen - fraggap;
980 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
981 				err = -EFAULT;
982 				kfree_skb(skb);
983 				goto error;
984 			}
985 
986 			offset += copy;
987 			length -= datalen - fraggap;
988 			transhdrlen = 0;
989 			exthdrlen = 0;
990 			csummode = CHECKSUM_NONE;
991 
992 			/*
993 			 * Put the packet on the pending queue.
994 			 */
995 			__skb_queue_tail(&sk->sk_write_queue, skb);
996 			continue;
997 		}
998 
999 		if (copy > length)
1000 			copy = length;
1001 
1002 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1003 			unsigned int off;
1004 
1005 			off = skb->len;
1006 			if (getfrag(from, skb_put(skb, copy),
1007 					offset, copy, off, skb) < 0) {
1008 				__skb_trim(skb, off);
1009 				err = -EFAULT;
1010 				goto error;
1011 			}
1012 		} else {
1013 			int i = skb_shinfo(skb)->nr_frags;
1014 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1015 			struct page *page = sk->sk_sndmsg_page;
1016 			int off = sk->sk_sndmsg_off;
1017 			unsigned int left;
1018 
1019 			if (page && (left = PAGE_SIZE - off) > 0) {
1020 				if (copy >= left)
1021 					copy = left;
1022 				if (page != frag->page) {
1023 					if (i == MAX_SKB_FRAGS) {
1024 						err = -EMSGSIZE;
1025 						goto error;
1026 					}
1027 					get_page(page);
1028 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1029 					frag = &skb_shinfo(skb)->frags[i];
1030 				}
1031 			} else if (i < MAX_SKB_FRAGS) {
1032 				if (copy > PAGE_SIZE)
1033 					copy = PAGE_SIZE;
1034 				page = alloc_pages(sk->sk_allocation, 0);
1035 				if (page == NULL)  {
1036 					err = -ENOMEM;
1037 					goto error;
1038 				}
1039 				sk->sk_sndmsg_page = page;
1040 				sk->sk_sndmsg_off = 0;
1041 
1042 				skb_fill_page_desc(skb, i, page, 0, 0);
1043 				frag = &skb_shinfo(skb)->frags[i];
1044 			} else {
1045 				err = -EMSGSIZE;
1046 				goto error;
1047 			}
1048 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1049 				err = -EFAULT;
1050 				goto error;
1051 			}
1052 			sk->sk_sndmsg_off += copy;
1053 			frag->size += copy;
1054 			skb->len += copy;
1055 			skb->data_len += copy;
1056 			skb->truesize += copy;
1057 			atomic_add(copy, &sk->sk_wmem_alloc);
1058 		}
1059 		offset += copy;
1060 		length -= copy;
1061 	}
1062 
1063 	return 0;
1064 
1065 error:
1066 	inet->cork.length -= length;
1067 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1068 	return err;
1069 }
1070 
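/*
 *	Append a page fragment to the last skb on the pending queue set up by
 *	a previous ip_append_data() call; used for sendpage()/sendfile() style
 *	transmission.  The outgoing device must support scatter-gather.
 */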
1071 ssize_t	ip_append_page(struct sock *sk, struct page *page,
1072 		       int offset, size_t size, int flags)
1073 {
1074 	struct inet_sock *inet = inet_sk(sk);
1075 	struct sk_buff *skb;
1076 	struct rtable *rt;
1077 	struct ip_options *opt = NULL;
1078 	int hh_len;
1079 	int mtu;
1080 	int len;
1081 	int err;
1082 	unsigned int maxfraglen, fragheaderlen, fraggap;
1083 
1084 	if (inet->hdrincl)
1085 		return -EPERM;
1086 
1087 	if (flags&MSG_PROBE)
1088 		return 0;
1089 
1090 	if (skb_queue_empty(&sk->sk_write_queue))
1091 		return -EINVAL;
1092 
1093 	rt = (struct rtable *)inet->cork.dst;
1094 	if (inet->cork.flags & IPCORK_OPT)
1095 		opt = inet->cork.opt;
1096 
1097 	if (!(rt->u.dst.dev->features&NETIF_F_SG))
1098 		return -EOPNOTSUPP;
1099 
1100 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1101 	mtu = inet->cork.fragsize;
1102 
1103 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1104 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1105 
1106 	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1107 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
1108 		return -EMSGSIZE;
1109 	}
1110 
1111 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1112 		return -EINVAL;
1113 
1114 	inet->cork.length += size;
1115 	if ((sk->sk_protocol == IPPROTO_UDP) &&
1116 	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
1117 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1118 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1119 	}
1120 
1121 
1122 	while (size > 0) {
1123 		int i;
1124 
1125 		if (skb_is_gso(skb))
1126 			len = size;
1127 		else {
1128 
1129 			/* Check if the remaining data fits into current packet. */
1130 			len = mtu - skb->len;
1131 			if (len < size)
1132 				len = maxfraglen - skb->len;
1133 		}
1134 		if (len <= 0) {
1135 			struct sk_buff *skb_prev;
1136 			int alloclen;
1137 
1138 			skb_prev = skb;
1139 			fraggap = skb_prev->len - maxfraglen;
1140 
1141 			alloclen = fragheaderlen + hh_len + fraggap + 15;
1142 			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1143 			if (unlikely(!skb)) {
1144 				err = -ENOBUFS;
1145 				goto error;
1146 			}
1147 
1148 			/*
1149 			 *	Fill in the control structures
1150 			 */
1151 			skb->ip_summed = CHECKSUM_NONE;
1152 			skb->csum = 0;
1153 			skb_reserve(skb, hh_len);
1154 
1155 			/*
1156 			 *	Find where to start putting bytes.
1157 			 */
1158 			skb_put(skb, fragheaderlen + fraggap);
1159 			skb_reset_network_header(skb);
1160 			skb->transport_header = (skb->network_header +
1161 						 fragheaderlen);
1162 			if (fraggap) {
1163 				skb->csum = skb_copy_and_csum_bits(skb_prev,
1164 								   maxfraglen,
1165 						    skb_transport_header(skb),
1166 								   fraggap, 0);
1167 				skb_prev->csum = csum_sub(skb_prev->csum,
1168 							  skb->csum);
1169 				pskb_trim_unique(skb_prev, maxfraglen);
1170 			}
1171 
1172 			/*
1173 			 * Put the packet on the pending queue.
1174 			 */
1175 			__skb_queue_tail(&sk->sk_write_queue, skb);
1176 			continue;
1177 		}
1178 
1179 		i = skb_shinfo(skb)->nr_frags;
1180 		if (len > size)
1181 			len = size;
1182 		if (skb_can_coalesce(skb, i, page, offset)) {
1183 			skb_shinfo(skb)->frags[i-1].size += len;
1184 		} else if (i < MAX_SKB_FRAGS) {
1185 			get_page(page);
1186 			skb_fill_page_desc(skb, i, page, offset, len);
1187 		} else {
1188 			err = -EMSGSIZE;
1189 			goto error;
1190 		}
1191 
1192 		if (skb->ip_summed == CHECKSUM_NONE) {
1193 			__wsum csum;
1194 			csum = csum_page(page, offset, len);
1195 			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1196 		}
1197 
1198 		skb->len += len;
1199 		skb->data_len += len;
1200 		skb->truesize += len;
1201 		atomic_add(len, &sk->sk_wmem_alloc);
1202 		offset += len;
1203 		size -= len;
1204 	}
1205 	return 0;
1206 
1207 error:
1208 	inet->cork.length -= size;
1209 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1210 	return err;
1211 }
1212 
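/* Release the resources held while the socket was corked: the copied
 * options and the cached route.
 */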
1213 static void ip_cork_release(struct inet_sock *inet)
1214 {
1215 	inet->cork.flags &= ~IPCORK_OPT;
1216 	kfree(inet->cork.opt);
1217 	inet->cork.opt = NULL;
1218 	dst_release(inet->cork.dst);
1219 	inet->cork.dst = NULL;
1220 }
1221 
1222 /*
1223  *	Combine all pending IP fragments on the socket into one IP datagram
1224  *	and push them out.
1225  */
1226 int ip_push_pending_frames(struct sock *sk)
1227 {
1228 	struct sk_buff *skb, *tmp_skb;
1229 	struct sk_buff **tail_skb;
1230 	struct inet_sock *inet = inet_sk(sk);
1231 	struct net *net = sock_net(sk);
1232 	struct ip_options *opt = NULL;
1233 	struct rtable *rt = (struct rtable *)inet->cork.dst;
1234 	struct iphdr *iph;
1235 	__be16 df = 0;
1236 	__u8 ttl;
1237 	int err = 0;
1238 
1239 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1240 		goto out;
1241 	tail_skb = &(skb_shinfo(skb)->frag_list);
1242 
1243 	/* move skb->data up to the ip header, past any ext header */
1244 	if (skb->data < skb_network_header(skb))
1245 		__skb_pull(skb, skb_network_offset(skb));
1246 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1247 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1248 		*tail_skb = tmp_skb;
1249 		tail_skb = &(tmp_skb->next);
1250 		skb->len += tmp_skb->len;
1251 		skb->data_len += tmp_skb->len;
1252 		skb->truesize += tmp_skb->truesize;
1253 		tmp_skb->destructor = NULL;
1254 		tmp_skb->sk = NULL;
1255 	}
1256 
1257 	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1258 	 * the frame generated here to be fragmented. No matter how transforms
1259 	 * change the size of the packet, it will come out.
1260 	 */
1261 	if (inet->pmtudisc < IP_PMTUDISC_DO)
1262 		skb->local_df = 1;
1263 
1264 	/* DF bit is set when we want to see DF on outgoing frames.
1265 	 * If local_df is set too, we still allow this frame to be
1266 	 * fragmented locally. */
1267 	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1268 	    (skb->len <= dst_mtu(&rt->u.dst) &&
1269 	     ip_dont_fragment(sk, &rt->u.dst)))
1270 		df = htons(IP_DF);
1271 
1272 	if (inet->cork.flags & IPCORK_OPT)
1273 		opt = inet->cork.opt;
1274 
1275 	if (rt->rt_type == RTN_MULTICAST)
1276 		ttl = inet->mc_ttl;
1277 	else
1278 		ttl = ip_select_ttl(inet, &rt->u.dst);
1279 
1280 	iph = (struct iphdr *)skb->data;
1281 	iph->version = 4;
1282 	iph->ihl = 5;
1283 	if (opt) {
1284 		iph->ihl += opt->optlen>>2;
1285 		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1286 	}
1287 	iph->tos = inet->tos;
1288 	iph->frag_off = df;
1289 	ip_select_ident(iph, &rt->u.dst, sk);
1290 	iph->ttl = ttl;
1291 	iph->protocol = sk->sk_protocol;
1292 	iph->saddr = rt->rt_src;
1293 	iph->daddr = rt->rt_dst;
1294 
1295 	skb->priority = sk->sk_priority;
1296 	skb->mark = sk->sk_mark;
1297 	/*
1298 	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1299 	 * on dst refcount
1300 	 */
1301 	inet->cork.dst = NULL;
1302 	skb_dst_set(skb, &rt->u.dst);
1303 
1304 	if (iph->protocol == IPPROTO_ICMP)
1305 		icmp_out_count(net, ((struct icmphdr *)
1306 			skb_transport_header(skb))->type);
1307 
1308 	/* Netfilter gets the whole, not yet fragmented skb. */
1309 	err = ip_local_out(skb);
1310 	if (err) {
1311 		if (err > 0)
1312 			err = net_xmit_errno(err);
1313 		if (err)
1314 			goto error;
1315 	}
1316 
1317 out:
1318 	ip_cork_release(inet);
1319 	return err;
1320 
1321 error:
1322 	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1323 	goto out;
1324 }
1325 
1326 /*
1327  *	Throw away all pending data on the socket.
1328  */
1329 void ip_flush_pending_frames(struct sock *sk)
1330 {
1331 	struct sk_buff *skb;
1332 
1333 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1334 		kfree_skb(skb);
1335 
1336 	ip_cork_release(inet_sk(sk));
1337 }
1338 
1339 
1340 /*
1341  *	Fetch data from kernel space and fill in checksum if needed.
1342  */
1343 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1344 			      int len, int odd, struct sk_buff *skb)
1345 {
1346 	__wsum csum;
1347 
1348 	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1349 	skb->csum = csum_block_add(skb->csum, csum, odd);
1350 	return 0;
1351 }
1352 
1353 /*
1354  *	Generic function to send a packet in reply to another packet.
1355  *	So far it is only used to send TCP resets; ICMP should use this function too.
1356  *
1357  *	Should run single threaded per socket because it uses the sock
1358  *     	structure to pass arguments.
1359  */
1360 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1361 		   unsigned int len)
1362 {
1363 	struct inet_sock *inet = inet_sk(sk);
1364 	struct {
1365 		struct ip_options	opt;
1366 		char			data[40];
1367 	} replyopts;
1368 	struct ipcm_cookie ipc;
1369 	__be32 daddr;
1370 	struct rtable *rt = skb_rtable(skb);
1371 
1372 	if (ip_options_echo(&replyopts.opt, skb))
1373 		return;
1374 
1375 	daddr = ipc.addr = rt->rt_src;
1376 	ipc.opt = NULL;
1377 	ipc.shtx.flags = 0;
1378 
1379 	if (replyopts.opt.optlen) {
1380 		ipc.opt = &replyopts.opt;
1381 
1382 		if (ipc.opt->srr)
1383 			daddr = replyopts.opt.faddr;
1384 	}
1385 
1386 	{
1387 		struct flowi fl = { .oif = arg->bound_dev_if,
1388 				    .nl_u = { .ip4_u =
1389 					      { .daddr = daddr,
1390 						.saddr = rt->rt_spec_dst,
1391 						.tos = RT_TOS(ip_hdr(skb)->tos) } },
1392 				    /* Not quite clean, but right. */
1393 				    .uli_u = { .ports =
1394 					       { .sport = tcp_hdr(skb)->dest,
1395 						 .dport = tcp_hdr(skb)->source } },
1396 				    .proto = sk->sk_protocol,
1397 				    .flags = ip_reply_arg_flowi_flags(arg) };
1398 		security_skb_classify_flow(skb, &fl);
1399 		if (ip_route_output_key(sock_net(sk), &rt, &fl))
1400 			return;
1401 	}
1402 
1403 	/* And let IP do all the hard work.
1404 
1405 	   This chunk is not reentrant, hence the spinlock.
1406 	   Note that it relies on the fact that this function is called
1407 	   with BHs locally disabled and that sk cannot already be spinlocked.
1408 	 */
1409 	bh_lock_sock(sk);
1410 	inet->tos = ip_hdr(skb)->tos;
1411 	sk->sk_priority = skb->priority;
1412 	sk->sk_protocol = ip_hdr(skb)->protocol;
1413 	sk->sk_bound_dev_if = arg->bound_dev_if;
1414 	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1415 		       &ipc, &rt, MSG_DONTWAIT);
1416 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1417 		if (arg->csumoffset >= 0)
1418 			*((__sum16 *)skb_transport_header(skb) +
1419 			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1420 								arg->csum));
1421 		skb->ip_summed = CHECKSUM_NONE;
1422 		ip_push_pending_frames(sk);
1423 	}
1424 
1425 	bh_unlock_sock(sk);
1426 
1427 	ip_rt_put(rt);
1428 }
1429 
1430 void __init ip_init(void)
1431 {
1432 	ip_rt_init();
1433 	inet_initpeers();
1434 
1435 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1436 	igmp_mc_proc_init();
1437 #endif
1438 }
1439 
1440 EXPORT_SYMBOL(ip_generic_getfrag);
1441 EXPORT_SYMBOL(ip_queue_xmit);
1442 EXPORT_SYMBOL(ip_send_check);
1443