xref: /linux/net/ipv4/ip_output.c (revision b43ab901d671e3e3cad425ea5e9a3c74e266dcdd)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		The Internet Protocol (IP) output module.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Donald Becker, <becker@super.org>
11  *		Alan Cox, <Alan.Cox@linux.org>
12  *		Richard Underwood
13  *		Stefan Becker, <stefanb@yello.ping.de>
14  *		Jorge Cwik, <jorge@laser.satlink.net>
15  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16  *		Hirokazu Takahashi, <taka@valinux.co.jp>
17  *
18  *	See ip_input.c for original log
19  *
20  *	Fixes:
21  *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22  *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23  *		Bradford Johnson:	Fix faulty handling of some frames when
24  *					no route is found.
25  *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26  *					(in case if packet not accepted by
27  *					output firewall rules)
28  *		Mike McLagan	:	Routing by source
29  *		Alexey Kuznetsov:	use new route cache
30  *		Andi Kleen:		Fix broken PMTU recovery and remove
31  *					some redundant tests.
32  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
33  *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34  *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35  *					for decreased register pressure on x86
36  *					and more readability.
37  *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38  *					silently drop skb instead of failing with -EPERM.
39  *		Detlev Wengorz	:	Copy protocol for fragments.
40  *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41  *					datagrams.
42  *		Hirokazu Takahashi:	sendfile() on UDP works now.
43  */
44 
45 #include <asm/uaccess.h>
46 #include <asm/system.h>
47 #include <linux/module.h>
48 #include <linux/types.h>
49 #include <linux/kernel.h>
50 #include <linux/mm.h>
51 #include <linux/string.h>
52 #include <linux/errno.h>
53 #include <linux/highmem.h>
54 #include <linux/slab.h>
55 
56 #include <linux/socket.h>
57 #include <linux/sockios.h>
58 #include <linux/in.h>
59 #include <linux/inet.h>
60 #include <linux/netdevice.h>
61 #include <linux/etherdevice.h>
62 #include <linux/proc_fs.h>
63 #include <linux/stat.h>
64 #include <linux/init.h>
65 
66 #include <net/snmp.h>
67 #include <net/ip.h>
68 #include <net/protocol.h>
69 #include <net/route.h>
70 #include <net/xfrm.h>
71 #include <linux/skbuff.h>
72 #include <net/sock.h>
73 #include <net/arp.h>
74 #include <net/icmp.h>
75 #include <net/checksum.h>
76 #include <net/inetpeer.h>
77 #include <linux/igmp.h>
78 #include <linux/netfilter_ipv4.h>
79 #include <linux/netfilter_bridge.h>
80 #include <linux/mroute.h>
81 #include <linux/netlink.h>
82 #include <linux/tcp.h>
83 
84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85 EXPORT_SYMBOL(sysctl_ip_default_ttl);
86 
87 /* Generate a checksum for an outgoing IP datagram. */
88 __inline__ void ip_send_check(struct iphdr *iph)
89 {
90 	iph->check = 0;
91 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92 }
93 EXPORT_SYMBOL(ip_send_check);
94 
95 int __ip_local_out(struct sk_buff *skb)
96 {
97 	struct iphdr *iph = ip_hdr(skb);
98 
99 	iph->tot_len = htons(skb->len);
100 	ip_send_check(iph);
101 	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102 		       skb_dst(skb)->dev, dst_output);
103 }
104 
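/* nf_hook() returns 1 when the packet may continue (no hook stole, dropped
 * or queued it); in that case the caller is expected to transmit the packet
 * itself, which ip_local_out() below does via dst_output().
 */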
105 int ip_local_out(struct sk_buff *skb)
106 {
107 	int err;
108 
109 	err = __ip_local_out(skb);
110 	if (likely(err == 1))
111 		err = dst_output(skb);
112 
113 	return err;
114 }
115 EXPORT_SYMBOL_GPL(ip_local_out);
116 
117 /* dev_loopback_xmit for use with netfilter. */
118 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119 {
120 	skb_reset_mac_header(newskb);
121 	__skb_pull(newskb, skb_network_offset(newskb));
122 	newskb->pkt_type = PACKET_LOOPBACK;
123 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
124 	WARN_ON(!skb_dst(newskb));
125 	skb_dst_force(newskb);
126 	netif_rx_ni(newskb);
127 	return 0;
128 }
129 
130 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
131 {
132 	int ttl = inet->uc_ttl;
133 
134 	if (ttl < 0)
135 		ttl = ip4_dst_hoplimit(dst);
136 	return ttl;
137 }
138 
139 /*
140  *		Add an ip header to a skbuff and send it out.
141  *
142  */
143 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
144 			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
145 {
146 	struct inet_sock *inet = inet_sk(sk);
147 	struct rtable *rt = skb_rtable(skb);
148 	struct iphdr *iph;
149 
150 	/* Build the IP header. */
151 	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
152 	skb_reset_network_header(skb);
153 	iph = ip_hdr(skb);
154 	iph->version  = 4;
155 	iph->ihl      = 5;
156 	iph->tos      = inet->tos;
157 	if (ip_dont_fragment(sk, &rt->dst))
158 		iph->frag_off = htons(IP_DF);
159 	else
160 		iph->frag_off = 0;
161 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
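	/* With a source route option the wire destination is the first hop
	 * (opt->opt.faddr); the final destination is carried inside the
	 * option and is filled in by ip_options_build() below.
	 */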
162 	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
163 	iph->saddr    = saddr;
164 	iph->protocol = sk->sk_protocol;
165 	ip_select_ident(iph, &rt->dst, sk);
166 
167 	if (opt && opt->opt.optlen) {
168 		iph->ihl += opt->opt.optlen>>2;
169 		ip_options_build(skb, &opt->opt, daddr, rt, 0);
170 	}
171 
172 	skb->priority = sk->sk_priority;
173 	skb->mark = sk->sk_mark;
174 
175 	/* Send it out. */
176 	return ip_local_out(skb);
177 }
178 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
179 
180 static inline int ip_finish_output2(struct sk_buff *skb)
181 {
182 	struct dst_entry *dst = skb_dst(skb);
183 	struct rtable *rt = (struct rtable *)dst;
184 	struct net_device *dev = dst->dev;
185 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
186 	struct neighbour *neigh;
187 
188 	if (rt->rt_type == RTN_MULTICAST) {
189 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
190 	} else if (rt->rt_type == RTN_BROADCAST)
191 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
192 
193 	/* Be paranoid, rather than too clever. */
194 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
195 		struct sk_buff *skb2;
196 
197 		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
198 		if (skb2 == NULL) {
199 			kfree_skb(skb);
200 			return -ENOMEM;
201 		}
202 		if (skb->sk)
203 			skb_set_owner_w(skb2, skb->sk);
204 		kfree_skb(skb);
205 		skb = skb2;
206 	}
207 
208 	rcu_read_lock();
209 	neigh = dst_get_neighbour_noref(dst);
210 	if (neigh) {
211 		int res = neigh_output(neigh, skb);
212 
213 		rcu_read_unlock();
214 		return res;
215 	}
216 	rcu_read_unlock();
217 
218 	if (net_ratelimit())
219 		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
220 	kfree_skb(skb);
221 	return -EINVAL;
222 }
223 
224 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
225 {
226 	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
227 
228 	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
229 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
230 }
231 
232 static int ip_finish_output(struct sk_buff *skb)
233 {
234 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
235 	/* Policy lookup after SNAT yielded a new policy */
236 	if (skb_dst(skb)->xfrm != NULL) {
237 		IPCB(skb)->flags |= IPSKB_REROUTED;
238 		return dst_output(skb);
239 	}
240 #endif
241 	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
242 		return ip_fragment(skb, ip_finish_output2);
243 	else
244 		return ip_finish_output2(skb);
245 }
246 
247 int ip_mc_output(struct sk_buff *skb)
248 {
249 	struct sock *sk = skb->sk;
250 	struct rtable *rt = skb_rtable(skb);
251 	struct net_device *dev = rt->dst.dev;
252 
253 	/*
254 	 *	If the indicated interface is up and running, send the packet.
255 	 */
256 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
257 
258 	skb->dev = dev;
259 	skb->protocol = htons(ETH_P_IP);
260 
261 	/*
262 	 *	Multicasts are looped back for other local users
263 	 */
264 
265 	if (rt->rt_flags&RTCF_MULTICAST) {
266 		if (sk_mc_loop(sk)
267 #ifdef CONFIG_IP_MROUTE
268 		/* Small optimization: do not loop back non-local frames
269 		   that were returned after forwarding; they will be dropped
270 		   by ip_mr_input in any case.
271 		   Note that local frames are looped back to be delivered
272 		   to local recipients.
273 
274 		   This check is duplicated in ip_mr_input at the moment.
275 		 */
276 		    &&
277 		    ((rt->rt_flags & RTCF_LOCAL) ||
278 		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
279 #endif
280 		   ) {
281 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
282 			if (newskb)
283 				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
284 					newskb, NULL, newskb->dev,
285 					ip_dev_loopback_xmit);
286 		}
287 
288 		/* Multicasts with ttl 0 must not go beyond the host */
289 
290 		if (ip_hdr(skb)->ttl == 0) {
291 			kfree_skb(skb);
292 			return 0;
293 		}
294 	}
295 
296 	if (rt->rt_flags&RTCF_BROADCAST) {
297 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
298 		if (newskb)
299 			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
300 				NULL, newskb->dev, ip_dev_loopback_xmit);
301 	}
302 
303 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
304 			    skb->dev, ip_finish_output,
305 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
306 }
307 
308 int ip_output(struct sk_buff *skb)
309 {
310 	struct net_device *dev = skb_dst(skb)->dev;
311 
312 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
313 
314 	skb->dev = dev;
315 	skb->protocol = htons(ETH_P_IP);
316 
317 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
318 			    ip_finish_output,
319 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
320 }
321 
322 /*
323  * copy saddr and daddr, possibly using 64bit load/stores
324  * Equivalent to :
325  *   iph->saddr = fl4->saddr;
326  *   iph->daddr = fl4->daddr;
327  */
328 static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
329 {
330 	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
331 		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
332 	memcpy(&iph->saddr, &fl4->saddr,
333 	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
334 }
335 
336 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
337 {
338 	struct sock *sk = skb->sk;
339 	struct inet_sock *inet = inet_sk(sk);
340 	struct ip_options_rcu *inet_opt;
341 	struct flowi4 *fl4;
342 	struct rtable *rt;
343 	struct iphdr *iph;
344 	int res;
345 
346 	/* Skip all of this if the packet is already routed,
347 	 * e.g. by something like SCTP.
348 	 */
349 	rcu_read_lock();
350 	inet_opt = rcu_dereference(inet->inet_opt);
351 	fl4 = &fl->u.ip4;
352 	rt = skb_rtable(skb);
353 	if (rt != NULL)
354 		goto packet_routed;
355 
356 	/* Make sure we can route this packet. */
357 	rt = (struct rtable *)__sk_dst_check(sk, 0);
358 	if (rt == NULL) {
359 		__be32 daddr;
360 
361 		/* Use correct destination address if we have options. */
362 		daddr = inet->inet_daddr;
363 		if (inet_opt && inet_opt->opt.srr)
364 			daddr = inet_opt->opt.faddr;
365 
366 		/* If this fails, the retransmit mechanism of the transport
367 		 * layer will keep trying until the route appears or the
368 		 * connection times itself out.
369 		 */
370 		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
371 					   daddr, inet->inet_saddr,
372 					   inet->inet_dport,
373 					   inet->inet_sport,
374 					   sk->sk_protocol,
375 					   RT_CONN_FLAGS(sk),
376 					   sk->sk_bound_dev_if);
377 		if (IS_ERR(rt))
378 			goto no_route;
379 		sk_setup_caps(sk, &rt->dst);
380 	}
381 	skb_dst_set_noref(skb, &rt->dst);
382 
383 packet_routed:
384 	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
385 		goto no_route;
386 
387 	/* OK, we know where to send it, allocate and build IP header. */
388 	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
389 	skb_reset_network_header(skb);
390 	iph = ip_hdr(skb);
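	/* Build the first 16 bits of the header in a single store:
	 * version 4 in the top nibble, an ihl of 5 (20 bytes, options are
	 * added below if present) and the TOS byte.
	 */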
391 	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
392 	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
393 		iph->frag_off = htons(IP_DF);
394 	else
395 		iph->frag_off = 0;
396 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
397 	iph->protocol = sk->sk_protocol;
398 	ip_copy_addrs(iph, fl4);
399 
400 	/* The transport layer has already set up the transport header itself. */
401 
402 	if (inet_opt && inet_opt->opt.optlen) {
403 		iph->ihl += inet_opt->opt.optlen >> 2;
404 		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
405 	}
406 
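	/* For GSO packets the segmentation code will need one IP ID per
	 * resulting segment, so reserve gso_segs - 1 additional IDs here.
	 */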
407 	ip_select_ident_more(iph, &rt->dst, sk,
408 			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
409 
410 	skb->priority = sk->sk_priority;
411 	skb->mark = sk->sk_mark;
412 
413 	res = ip_local_out(skb);
414 	rcu_read_unlock();
415 	return res;
416 
417 no_route:
418 	rcu_read_unlock();
419 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
420 	kfree_skb(skb);
421 	return -EHOSTUNREACH;
422 }
423 EXPORT_SYMBOL(ip_queue_xmit);
424 
425 
426 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
427 {
428 	to->pkt_type = from->pkt_type;
429 	to->priority = from->priority;
430 	to->protocol = from->protocol;
431 	skb_dst_drop(to);
432 	skb_dst_copy(to, from);
433 	to->dev = from->dev;
434 	to->mark = from->mark;
435 
436 	/* Copy the flags to each fragment. */
437 	IPCB(to)->flags = IPCB(from)->flags;
438 
439 #ifdef CONFIG_NET_SCHED
440 	to->tc_index = from->tc_index;
441 #endif
442 	nf_copy(to, from);
443 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
444     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
445 	to->nf_trace = from->nf_trace;
446 #endif
447 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
448 	to->ipvs_property = from->ipvs_property;
449 #endif
450 	skb_copy_secmark(to, from);
451 }
452 
453 /*
454  *	This IP datagram is too large to be sent in one piece.  Break it up
455  *	into smaller pieces (each of size equal to the IP header plus a
456  *	block of the original IP data) that will still fit in a single
457  *	device frame, and queue each such frame for sending.
458  */
459 
460 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
461 {
462 	struct iphdr *iph;
463 	int ptr;
464 	struct net_device *dev;
465 	struct sk_buff *skb2;
466 	unsigned int mtu, hlen, left, len, ll_rs;
467 	int offset;
468 	__be16 not_last_frag;
469 	struct rtable *rt = skb_rtable(skb);
470 	int err = 0;
471 
472 	dev = rt->dst.dev;
473 
474 	/*
475 	 *	Point into the IP datagram header.
476 	 */
477 
478 	iph = ip_hdr(skb);
479 
480 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
481 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
482 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
483 			  htonl(ip_skb_dst_mtu(skb)));
484 		kfree_skb(skb);
485 		return -EMSGSIZE;
486 	}
487 
488 	/*
489 	 *	Setup starting values.
490 	 */
491 
492 	hlen = iph->ihl * 4;
493 	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
494 #ifdef CONFIG_BRIDGE_NETFILTER
495 	if (skb->nf_bridge)
496 		mtu -= nf_bridge_mtu_reduction(skb);
497 #endif
498 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
499 
500 	/* When a frag_list is given, use it.  First, check its validity:
501 	 * some transformers could create a wrong frag_list or break an
502 	 * existing one; this is not prohibited.  In that case fall back to copying.
503 	 *
504 	 * LATER: this step can be merged into the real generation of fragments;
505 	 * we can switch to copying when we see the first bad fragment.
506 	 */
507 	if (skb_has_frag_list(skb)) {
508 		struct sk_buff *frag, *frag2;
509 		int first_len = skb_pagelen(skb);
510 
511 		if (first_len - hlen > mtu ||
512 		    ((first_len - hlen) & 7) ||
513 		    ip_is_fragment(iph) ||
514 		    skb_cloned(skb))
515 			goto slow_path;
516 
517 		skb_walk_frags(skb, frag) {
518 			/* Correct geometry. */
519 			if (frag->len > mtu ||
520 			    ((frag->len & 7) && frag->next) ||
521 			    skb_headroom(frag) < hlen)
522 				goto slow_path_clean;
523 
524 			/* Partially cloned skb? */
525 			if (skb_shared(frag))
526 				goto slow_path_clean;
527 
528 			BUG_ON(frag->sk);
529 			if (skb->sk) {
530 				frag->sk = skb->sk;
531 				frag->destructor = sock_wfree;
532 			}
533 			skb->truesize -= frag->truesize;
534 		}
535 
536 		/* Everything is OK. Generate! */
537 
538 		err = 0;
539 		offset = 0;
540 		frag = skb_shinfo(skb)->frag_list;
541 		skb_frag_list_init(skb);
542 		skb->data_len = first_len - skb_headlen(skb);
543 		skb->len = first_len;
544 		iph->tot_len = htons(first_len);
545 		iph->frag_off = htons(IP_MF);
546 		ip_send_check(iph);
547 
548 		for (;;) {
549 			/* Prepare the header of the next frame
550 			 * before the previous one goes down. */
551 			if (frag) {
552 				frag->ip_summed = CHECKSUM_NONE;
553 				skb_reset_transport_header(frag);
554 				__skb_push(frag, hlen);
555 				skb_reset_network_header(frag);
556 				memcpy(skb_network_header(frag), iph, hlen);
557 				iph = ip_hdr(frag);
558 				iph->tot_len = htons(frag->len);
559 				ip_copy_metadata(frag, skb);
560 				if (offset == 0)
561 					ip_options_fragment(frag);
562 				offset += skb->len - hlen;
563 				iph->frag_off = htons(offset>>3);
564 				if (frag->next != NULL)
565 					iph->frag_off |= htons(IP_MF);
566 				/* Ready, complete checksum */
567 				ip_send_check(iph);
568 			}
569 
570 			err = output(skb);
571 
572 			if (!err)
573 				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
574 			if (err || !frag)
575 				break;
576 
577 			skb = frag;
578 			frag = skb->next;
579 			skb->next = NULL;
580 		}
581 
582 		if (err == 0) {
583 			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
584 			return 0;
585 		}
586 
587 		while (frag) {
588 			skb = frag->next;
589 			kfree_skb(frag);
590 			frag = skb;
591 		}
592 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
593 		return err;
594 
595 slow_path_clean:
596 		skb_walk_frags(skb, frag2) {
597 			if (frag2 == frag)
598 				break;
599 			frag2->sk = NULL;
600 			frag2->destructor = NULL;
601 			skb->truesize += frag2->truesize;
602 		}
603 	}
604 
605 slow_path:
606 	left = skb->len - hlen;		/* Space per frame */
607 	ptr = hlen;		/* Where to start from */
608 
609 	/* For bridged IP traffic encapsulated inside e.g. a VLAN header,
610 	 * we need to make room for the encapsulating header.
611 	 */
612 	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
613 
614 	/*
615 	 *	Fragment the datagram.
616 	 */
617 
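	/* The fragment offset field counts in units of 8 bytes; IP_OFFSET
	 * masks off the flag bits (DF/MF) before converting to bytes.
	 */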
618 	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
619 	not_last_frag = iph->frag_off & htons(IP_MF);
620 
621 	/*
622 	 *	Keep copying data until we run out.
623 	 */
624 
625 	while (left > 0) {
626 		len = left;
627 		/* IF: it doesn't fit, use 'mtu' - the data space left */
628 		if (len > mtu)
629 			len = mtu;
630 		/* IF: we are not sending up to and including the packet end,
631 		   then align the next start on an eight-byte boundary */
632 		if (len < left)	{
633 			len &= ~7;
634 		}
635 		/*
636 		 *	Allocate buffer.
637 		 */
638 
639 		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
640 			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
641 			err = -ENOMEM;
642 			goto fail;
643 		}
644 
645 		/*
646 		 *	Set up data on packet
647 		 */
648 
649 		ip_copy_metadata(skb2, skb);
650 		skb_reserve(skb2, ll_rs);
651 		skb_put(skb2, len + hlen);
652 		skb_reset_network_header(skb2);
653 		skb2->transport_header = skb2->network_header + hlen;
654 
655 		/*
656 		 *	Charge the memory for the fragment to any owner
657 		 *	it might possess
658 		 */
659 
660 		if (skb->sk)
661 			skb_set_owner_w(skb2, skb->sk);
662 
663 		/*
664 		 *	Copy the packet header into the new buffer.
665 		 */
666 
667 		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
668 
669 		/*
670 		 *	Copy a block of the IP datagram.
671 		 */
672 		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
673 			BUG();
674 		left -= len;
675 
676 		/*
677 		 *	Fill in the new header fields.
678 		 */
679 		iph = ip_hdr(skb2);
680 		iph->frag_off = htons((offset >> 3));
681 
682 		/* ANK: dirty, but effective trick. Upgrade options only if
683 		 * the segment to be fragmented was THE FIRST (otherwise,
684 		 * options are already fixed) and make it ONCE
685 		 * on the initial skb, so that all the following fragments
686 		 * will inherit fixed options.
687 		 */
688 		if (offset == 0)
689 			ip_options_fragment(skb);
690 
691 		/*
692 		 *	Added AC : If we are fragmenting a fragment that's not
693 		 *		   the last fragment, keep the MF bit set on each piece.
694 		 */
695 		if (left > 0 || not_last_frag)
696 			iph->frag_off |= htons(IP_MF);
697 		ptr += len;
698 		offset += len;
699 
700 		/*
701 		 *	Put this fragment into the sending queue.
702 		 */
703 		iph->tot_len = htons(len + hlen);
704 
705 		ip_send_check(iph);
706 
707 		err = output(skb2);
708 		if (err)
709 			goto fail;
710 
711 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
712 	}
713 	kfree_skb(skb);
714 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
715 	return err;
716 
717 fail:
718 	kfree_skb(skb);
719 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
720 	return err;
721 }
722 EXPORT_SYMBOL(ip_fragment);
723 
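/* Standard getfrag callback for ip_append_data(): copy len bytes of user
 * data from the iovec at the given offset into the skb at 'to'.  When the
 * device cannot checksum the packet (ip_summed != CHECKSUM_PARTIAL), the
 * checksum is computed while copying and folded into skb->csum at block
 * offset 'odd'.
 */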
724 int
725 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
726 {
727 	struct iovec *iov = from;
728 
729 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
730 		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
731 			return -EFAULT;
732 	} else {
733 		__wsum csum = 0;
734 		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
735 			return -EFAULT;
736 		skb->csum = csum_block_add(skb->csum, csum, odd);
737 	}
738 	return 0;
739 }
740 EXPORT_SYMBOL(ip_generic_getfrag);
741 
742 static inline __wsum
743 csum_page(struct page *page, int offset, int copy)
744 {
745 	char *kaddr;
746 	__wsum csum;
747 	kaddr = kmap(page);
748 	csum = csum_partial(kaddr + offset, copy, 0);
749 	kunmap(page);
750 	return csum;
751 }
752 
753 static inline int ip_ufo_append_data(struct sock *sk,
754 			struct sk_buff_head *queue,
755 			int getfrag(void *from, char *to, int offset, int len,
756 			       int odd, struct sk_buff *skb),
757 			void *from, int length, int hh_len, int fragheaderlen,
758 			int transhdrlen, int maxfraglen, unsigned int flags)
759 {
760 	struct sk_buff *skb;
761 	int err;
762 
763 	/* The network device supports UDP fragmentation offload, so
764 	 * create one single skb packet containing the complete
765 	 * UDP datagram.
766 	 */
767 	if ((skb = skb_peek_tail(queue)) == NULL) {
768 		skb = sock_alloc_send_skb(sk,
769 			hh_len + fragheaderlen + transhdrlen + 20,
770 			(flags & MSG_DONTWAIT), &err);
771 
772 		if (skb == NULL)
773 			return err;
774 
775 		/* reserve space for Hardware header */
776 		skb_reserve(skb, hh_len);
777 
778 		/* create space for UDP/IP header */
779 		skb_put(skb, fragheaderlen + transhdrlen);
780 
781 		/* initialize network header pointer */
782 		skb_reset_network_header(skb);
783 
784 		/* initialize protocol header pointer */
785 		skb->transport_header = skb->network_header + fragheaderlen;
786 
787 		skb->ip_summed = CHECKSUM_PARTIAL;
788 		skb->csum = 0;
789 
790 		/* specify the length of each IP datagram fragment */
791 		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
792 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
793 		__skb_queue_tail(queue, skb);
794 	}
795 
796 	return skb_append_datato_frags(sk, skb, getfrag, from,
797 				       (length - transhdrlen));
798 }
799 
800 static int __ip_append_data(struct sock *sk,
801 			    struct flowi4 *fl4,
802 			    struct sk_buff_head *queue,
803 			    struct inet_cork *cork,
804 			    int getfrag(void *from, char *to, int offset,
805 					int len, int odd, struct sk_buff *skb),
806 			    void *from, int length, int transhdrlen,
807 			    unsigned int flags)
808 {
809 	struct inet_sock *inet = inet_sk(sk);
810 	struct sk_buff *skb;
811 
812 	struct ip_options *opt = cork->opt;
813 	int hh_len;
814 	int exthdrlen;
815 	int mtu;
816 	int copy;
817 	int err;
818 	int offset = 0;
819 	unsigned int maxfraglen, fragheaderlen;
820 	int csummode = CHECKSUM_NONE;
821 	struct rtable *rt = (struct rtable *)cork->dst;
822 
823 	skb = skb_peek_tail(queue);
824 
825 	exthdrlen = !skb ? rt->dst.header_len : 0;
826 	mtu = cork->fragsize;
827 
828 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
829 
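	/* maxfraglen below is the largest on-wire fragment size whose data
	 * part is a multiple of 8 bytes, as required for all but the last
	 * IP fragment.  E.g. with an illustrative mtu of 1500 and a plain
	 * 20 byte header it stays 1500; with 12 bytes of IP options it
	 * drops to 1496 (1464 bytes of data plus a 32 byte header).
	 */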
830 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
831 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
832 
833 	if (cork->length + length > 0xFFFF - fragheaderlen) {
834 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
835 			       mtu-exthdrlen);
836 		return -EMSGSIZE;
837 	}
838 
839 	/*
840 	 * transhdrlen > 0 means that this is the first fragment and we wish
841 	 * it not to be fragmented later.
842 	 */
843 	if (transhdrlen &&
844 	    length + fragheaderlen <= mtu &&
845 	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
846 	    !exthdrlen)
847 		csummode = CHECKSUM_PARTIAL;
848 
849 	cork->length += length;
850 	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
851 	    (sk->sk_protocol == IPPROTO_UDP) &&
852 	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
853 		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
854 					 hh_len, fragheaderlen, transhdrlen,
855 					 maxfraglen, flags);
856 		if (err)
857 			goto error;
858 		return 0;
859 	}
860 
861 	/* So, what's going on in the loop below?
862 	 *
863 	 * We use the calculated fragment length to generate a chained skb;
864 	 * each of its segments is an IP fragment ready for sending to the
865 	 * network once an appropriate IP header has been added.
866 	 */
867 
868 	if (!skb)
869 		goto alloc_new_skb;
870 
871 	while (length > 0) {
872 		/* Check if the remaining data fits into current packet. */
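		/* If it does, fill the current skb up to the full mtu;
		 * otherwise only fill up to maxfraglen so that this
		 * fragment's data length stays a multiple of 8 bytes.
		 */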
873 		copy = mtu - skb->len;
874 		if (copy < length)
875 			copy = maxfraglen - skb->len;
876 		if (copy <= 0) {
877 			char *data;
878 			unsigned int datalen;
879 			unsigned int fraglen;
880 			unsigned int fraggap;
881 			unsigned int alloclen;
882 			struct sk_buff *skb_prev;
883 alloc_new_skb:
884 			skb_prev = skb;
885 			if (skb_prev)
886 				fraggap = skb_prev->len - maxfraglen;
887 			else
888 				fraggap = 0;
889 
890 			/*
891 			 * If remaining data exceeds the mtu,
892 			 * we know we need more fragment(s).
893 			 */
894 			datalen = length + fraggap;
895 			if (datalen > mtu - fragheaderlen)
896 				datalen = maxfraglen - fragheaderlen;
897 			fraglen = datalen + fragheaderlen;
898 
899 			if ((flags & MSG_MORE) &&
900 			    !(rt->dst.dev->features&NETIF_F_SG))
901 				alloclen = mtu;
902 			else
903 				alloclen = fraglen;
904 
905 			alloclen += exthdrlen;
906 
907 			/* The last fragment gets additional space at the tail.
908 			 * Note that with MSG_MORE we overallocate on fragments,
909 			 * because we have no idea which fragment will be
910 			 * the last.
911 			 */
912 			if (datalen == length + fraggap)
913 				alloclen += rt->dst.trailer_len;
914 
915 			if (transhdrlen) {
916 				skb = sock_alloc_send_skb(sk,
917 						alloclen + hh_len + 15,
918 						(flags & MSG_DONTWAIT), &err);
919 			} else {
920 				skb = NULL;
921 				if (atomic_read(&sk->sk_wmem_alloc) <=
922 				    2 * sk->sk_sndbuf)
923 					skb = sock_wmalloc(sk,
924 							   alloclen + hh_len + 15, 1,
925 							   sk->sk_allocation);
926 				if (unlikely(skb == NULL))
927 					err = -ENOBUFS;
928 				else
929 					/* only the initial fragment is
930 					   time stamped */
931 					cork->tx_flags = 0;
932 			}
933 			if (skb == NULL)
934 				goto error;
935 
936 			/*
937 			 *	Fill in the control structures
938 			 */
939 			skb->ip_summed = csummode;
940 			skb->csum = 0;
941 			skb_reserve(skb, hh_len);
942 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
943 
944 			/*
945 			 *	Find where to start putting bytes.
946 			 */
947 			data = skb_put(skb, fraglen + exthdrlen);
948 			skb_set_network_header(skb, exthdrlen);
949 			skb->transport_header = (skb->network_header +
950 						 fragheaderlen);
951 			data += fragheaderlen + exthdrlen;
952 
953 			if (fraggap) {
954 				skb->csum = skb_copy_and_csum_bits(
955 					skb_prev, maxfraglen,
956 					data + transhdrlen, fraggap, 0);
957 				skb_prev->csum = csum_sub(skb_prev->csum,
958 							  skb->csum);
959 				data += fraggap;
960 				pskb_trim_unique(skb_prev, maxfraglen);
961 			}
962 
963 			copy = datalen - transhdrlen - fraggap;
964 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
965 				err = -EFAULT;
966 				kfree_skb(skb);
967 				goto error;
968 			}
969 
970 			offset += copy;
971 			length -= datalen - fraggap;
972 			transhdrlen = 0;
973 			exthdrlen = 0;
974 			csummode = CHECKSUM_NONE;
975 
976 			/*
977 			 * Put the packet on the pending queue.
978 			 */
979 			__skb_queue_tail(queue, skb);
980 			continue;
981 		}
982 
983 		if (copy > length)
984 			copy = length;
985 
986 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
987 			unsigned int off;
988 
989 			off = skb->len;
990 			if (getfrag(from, skb_put(skb, copy),
991 					offset, copy, off, skb) < 0) {
992 				__skb_trim(skb, off);
993 				err = -EFAULT;
994 				goto error;
995 			}
996 		} else {
997 			int i = skb_shinfo(skb)->nr_frags;
998 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
999 			struct page *page = cork->page;
1000 			int off = cork->off;
1001 			unsigned int left;
1002 
1003 			if (page && (left = PAGE_SIZE - off) > 0) {
1004 				if (copy >= left)
1005 					copy = left;
1006 				if (page != skb_frag_page(frag)) {
1007 					if (i == MAX_SKB_FRAGS) {
1008 						err = -EMSGSIZE;
1009 						goto error;
1010 					}
1011 					skb_fill_page_desc(skb, i, page, off, 0);
1012 					skb_frag_ref(skb, i);
1013 					frag = &skb_shinfo(skb)->frags[i];
1014 				}
1015 			} else if (i < MAX_SKB_FRAGS) {
1016 				if (copy > PAGE_SIZE)
1017 					copy = PAGE_SIZE;
1018 				page = alloc_pages(sk->sk_allocation, 0);
1019 				if (page == NULL)  {
1020 					err = -ENOMEM;
1021 					goto error;
1022 				}
1023 				cork->page = page;
1024 				cork->off = 0;
1025 
1026 				skb_fill_page_desc(skb, i, page, 0, 0);
1027 				frag = &skb_shinfo(skb)->frags[i];
1028 			} else {
1029 				err = -EMSGSIZE;
1030 				goto error;
1031 			}
1032 			if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1033 				    offset, copy, skb->len, skb) < 0) {
1034 				err = -EFAULT;
1035 				goto error;
1036 			}
1037 			cork->off += copy;
1038 			skb_frag_size_add(frag, copy);
1039 			skb->len += copy;
1040 			skb->data_len += copy;
1041 			skb->truesize += copy;
1042 			atomic_add(copy, &sk->sk_wmem_alloc);
1043 		}
1044 		offset += copy;
1045 		length -= copy;
1046 	}
1047 
1048 	return 0;
1049 
1050 error:
1051 	cork->length -= length;
1052 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1053 	return err;
1054 }
1055 
1056 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1057 			 struct ipcm_cookie *ipc, struct rtable **rtp)
1058 {
1059 	struct inet_sock *inet = inet_sk(sk);
1060 	struct ip_options_rcu *opt;
1061 	struct rtable *rt;
1062 
1063 	/*
1064 	 * setup for corking.
1065 	 */
1066 	opt = ipc->opt;
1067 	if (opt) {
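		/* 40 bytes is the maximum amount of IP option space
		 * (60 byte maximum header minus the 20 byte fixed part).
		 */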
1068 		if (cork->opt == NULL) {
1069 			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1070 					    sk->sk_allocation);
1071 			if (unlikely(cork->opt == NULL))
1072 				return -ENOBUFS;
1073 		}
1074 		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1075 		cork->flags |= IPCORK_OPT;
1076 		cork->addr = ipc->addr;
1077 	}
1078 	rt = *rtp;
1079 	if (unlikely(!rt))
1080 		return -EFAULT;
1081 	/*
1082 	 * We steal the reference to this route; the caller should not release it.
1083 	 */
1084 	*rtp = NULL;
1085 	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1086 			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1087 	cork->dst = &rt->dst;
1088 	cork->length = 0;
1089 	cork->tx_flags = ipc->tx_flags;
1090 	cork->page = NULL;
1091 	cork->off = 0;
1092 
1093 	return 0;
1094 }
1095 
1096 /*
1097  *	ip_append_data() and ip_append_page() can make one large IP datagram
1098  *	from many pieces of data. Each piece will be held on the socket
1099  *	until ip_push_pending_frames() is called. Each piece can be a page
1100  *	or non-page data.
1101  *
1102  *	Transport protocols other than UDP - e.g. raw sockets - can
1103  *	potentially use this interface as well.
1104  *
1105  *	LATER: length must be adjusted by the pad at tail, when it is required.
1106  */
1107 int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1108 		   int getfrag(void *from, char *to, int offset, int len,
1109 			       int odd, struct sk_buff *skb),
1110 		   void *from, int length, int transhdrlen,
1111 		   struct ipcm_cookie *ipc, struct rtable **rtp,
1112 		   unsigned int flags)
1113 {
1114 	struct inet_sock *inet = inet_sk(sk);
1115 	int err;
1116 
1117 	if (flags&MSG_PROBE)
1118 		return 0;
1119 
1120 	if (skb_queue_empty(&sk->sk_write_queue)) {
1121 		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1122 		if (err)
1123 			return err;
1124 	} else {
1125 		transhdrlen = 0;
1126 	}
1127 
1128 	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1129 				from, length, transhdrlen, flags);
1130 }
1131 
1132 ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1133 		       int offset, size_t size, int flags)
1134 {
1135 	struct inet_sock *inet = inet_sk(sk);
1136 	struct sk_buff *skb;
1137 	struct rtable *rt;
1138 	struct ip_options *opt = NULL;
1139 	struct inet_cork *cork;
1140 	int hh_len;
1141 	int mtu;
1142 	int len;
1143 	int err;
1144 	unsigned int maxfraglen, fragheaderlen, fraggap;
1145 
1146 	if (inet->hdrincl)
1147 		return -EPERM;
1148 
1149 	if (flags&MSG_PROBE)
1150 		return 0;
1151 
1152 	if (skb_queue_empty(&sk->sk_write_queue))
1153 		return -EINVAL;
1154 
1155 	cork = &inet->cork.base;
1156 	rt = (struct rtable *)cork->dst;
1157 	if (cork->flags & IPCORK_OPT)
1158 		opt = cork->opt;
1159 
1160 	if (!(rt->dst.dev->features&NETIF_F_SG))
1161 		return -EOPNOTSUPP;
1162 
1163 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1164 	mtu = cork->fragsize;
1165 
1166 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1167 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1168 
1169 	if (cork->length + size > 0xFFFF - fragheaderlen) {
1170 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1171 		return -EMSGSIZE;
1172 	}
1173 
1174 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1175 		return -EINVAL;
1176 
1177 	cork->length += size;
1178 	if ((size + skb->len > mtu) &&
1179 	    (sk->sk_protocol == IPPROTO_UDP) &&
1180 	    (rt->dst.dev->features & NETIF_F_UFO)) {
1181 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1182 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1183 	}
1184 
1185 
1186 	while (size > 0) {
1187 		int i;
1188 
1189 		if (skb_is_gso(skb))
1190 			len = size;
1191 		else {
1192 
1193 			/* Check if the remaining data fits into current packet. */
1194 			len = mtu - skb->len;
1195 			if (len < size)
1196 				len = maxfraglen - skb->len;
1197 		}
1198 		if (len <= 0) {
1199 			struct sk_buff *skb_prev;
1200 			int alloclen;
1201 
1202 			skb_prev = skb;
1203 			fraggap = skb_prev->len - maxfraglen;
1204 
1205 			alloclen = fragheaderlen + hh_len + fraggap + 15;
1206 			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1207 			if (unlikely(!skb)) {
1208 				err = -ENOBUFS;
1209 				goto error;
1210 			}
1211 
1212 			/*
1213 			 *	Fill in the control structures
1214 			 */
1215 			skb->ip_summed = CHECKSUM_NONE;
1216 			skb->csum = 0;
1217 			skb_reserve(skb, hh_len);
1218 
1219 			/*
1220 			 *	Find where to start putting bytes.
1221 			 */
1222 			skb_put(skb, fragheaderlen + fraggap);
1223 			skb_reset_network_header(skb);
1224 			skb->transport_header = (skb->network_header +
1225 						 fragheaderlen);
1226 			if (fraggap) {
1227 				skb->csum = skb_copy_and_csum_bits(skb_prev,
1228 								   maxfraglen,
1229 						    skb_transport_header(skb),
1230 								   fraggap, 0);
1231 				skb_prev->csum = csum_sub(skb_prev->csum,
1232 							  skb->csum);
1233 				pskb_trim_unique(skb_prev, maxfraglen);
1234 			}
1235 
1236 			/*
1237 			 * Put the packet on the pending queue.
1238 			 */
1239 			__skb_queue_tail(&sk->sk_write_queue, skb);
1240 			continue;
1241 		}
1242 
1243 		i = skb_shinfo(skb)->nr_frags;
1244 		if (len > size)
1245 			len = size;
1246 		if (skb_can_coalesce(skb, i, page, offset)) {
1247 			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
1248 		} else if (i < MAX_SKB_FRAGS) {
1249 			get_page(page);
1250 			skb_fill_page_desc(skb, i, page, offset, len);
1251 		} else {
1252 			err = -EMSGSIZE;
1253 			goto error;
1254 		}
1255 
1256 		if (skb->ip_summed == CHECKSUM_NONE) {
1257 			__wsum csum;
1258 			csum = csum_page(page, offset, len);
1259 			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1260 		}
1261 
1262 		skb->len += len;
1263 		skb->data_len += len;
1264 		skb->truesize += len;
1265 		atomic_add(len, &sk->sk_wmem_alloc);
1266 		offset += len;
1267 		size -= len;
1268 	}
1269 	return 0;
1270 
1271 error:
1272 	cork->length -= size;
1273 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1274 	return err;
1275 }
1276 
1277 static void ip_cork_release(struct inet_cork *cork)
1278 {
1279 	cork->flags &= ~IPCORK_OPT;
1280 	kfree(cork->opt);
1281 	cork->opt = NULL;
1282 	dst_release(cork->dst);
1283 	cork->dst = NULL;
1284 }
1285 
1286 /*
1287  *	Combine all pending IP fragments on the socket into one IP datagram
1288  *	and push them out.
1289  */
1290 struct sk_buff *__ip_make_skb(struct sock *sk,
1291 			      struct flowi4 *fl4,
1292 			      struct sk_buff_head *queue,
1293 			      struct inet_cork *cork)
1294 {
1295 	struct sk_buff *skb, *tmp_skb;
1296 	struct sk_buff **tail_skb;
1297 	struct inet_sock *inet = inet_sk(sk);
1298 	struct net *net = sock_net(sk);
1299 	struct ip_options *opt = NULL;
1300 	struct rtable *rt = (struct rtable *)cork->dst;
1301 	struct iphdr *iph;
1302 	__be16 df = 0;
1303 	__u8 ttl;
1304 
1305 	if ((skb = __skb_dequeue(queue)) == NULL)
1306 		goto out;
1307 	tail_skb = &(skb_shinfo(skb)->frag_list);
1308 
1309 	/* move skb->data to ip header from ext header */
1310 	if (skb->data < skb_network_header(skb))
1311 		__skb_pull(skb, skb_network_offset(skb));
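	/* Chain the remaining queued skbs onto the first skb's frag_list;
	 * each of them is already a fragment-sized piece, so ip_fragment()
	 * can later split the datagram along these boundaries without
	 * copying.
	 */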
1312 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1313 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1314 		*tail_skb = tmp_skb;
1315 		tail_skb = &(tmp_skb->next);
1316 		skb->len += tmp_skb->len;
1317 		skb->data_len += tmp_skb->len;
1318 		skb->truesize += tmp_skb->truesize;
1319 		tmp_skb->destructor = NULL;
1320 		tmp_skb->sk = NULL;
1321 	}
1322 
1323 	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
1324 	 * allow the frame generated here to be fragmented.  No matter how
1325 	 * transforms change the size of the packet, it will come out.
1326 	 */
1327 	if (inet->pmtudisc < IP_PMTUDISC_DO)
1328 		skb->local_df = 1;
1329 
1330 	/* The DF bit is set when we want to see DF on outgoing frames.
1331 	 * If local_df is set too, we still allow this frame to be
1332 	 * fragmented locally. */
1333 	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1334 	    (skb->len <= dst_mtu(&rt->dst) &&
1335 	     ip_dont_fragment(sk, &rt->dst)))
1336 		df = htons(IP_DF);
1337 
1338 	if (cork->flags & IPCORK_OPT)
1339 		opt = cork->opt;
1340 
1341 	if (rt->rt_type == RTN_MULTICAST)
1342 		ttl = inet->mc_ttl;
1343 	else
1344 		ttl = ip_select_ttl(inet, &rt->dst);
1345 
1346 	iph = (struct iphdr *)skb->data;
1347 	iph->version = 4;
1348 	iph->ihl = 5;
1349 	iph->tos = inet->tos;
1350 	iph->frag_off = df;
1351 	ip_select_ident(iph, &rt->dst, sk);
1352 	iph->ttl = ttl;
1353 	iph->protocol = sk->sk_protocol;
1354 	ip_copy_addrs(iph, fl4);
1355 
1356 	if (opt) {
1357 		iph->ihl += opt->optlen>>2;
1358 		ip_options_build(skb, opt, cork->addr, rt, 0);
1359 	}
1360 
1361 	skb->priority = sk->sk_priority;
1362 	skb->mark = sk->sk_mark;
1363 	/*
1364 	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1365 	 * on dst refcount
1366 	 */
1367 	cork->dst = NULL;
1368 	skb_dst_set(skb, &rt->dst);
1369 
1370 	if (iph->protocol == IPPROTO_ICMP)
1371 		icmp_out_count(net, ((struct icmphdr *)
1372 			skb_transport_header(skb))->type);
1373 
1374 	ip_cork_release(cork);
1375 out:
1376 	return skb;
1377 }
1378 
1379 int ip_send_skb(struct sk_buff *skb)
1380 {
1381 	struct net *net = sock_net(skb->sk);
1382 	int err;
1383 
1384 	err = ip_local_out(skb);
1385 	if (err) {
1386 		if (err > 0)
1387 			err = net_xmit_errno(err);
1388 		if (err)
1389 			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1390 	}
1391 
1392 	return err;
1393 }
1394 
1395 int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1396 {
1397 	struct sk_buff *skb;
1398 
1399 	skb = ip_finish_skb(sk, fl4);
1400 	if (!skb)
1401 		return 0;
1402 
1403 	/* Netfilter gets the whole, not yet fragmented skb. */
1404 	return ip_send_skb(skb);
1405 }
1406 
1407 /*
1408  *	Throw away all pending data on the socket.
1409  */
1410 static void __ip_flush_pending_frames(struct sock *sk,
1411 				      struct sk_buff_head *queue,
1412 				      struct inet_cork *cork)
1413 {
1414 	struct sk_buff *skb;
1415 
1416 	while ((skb = __skb_dequeue_tail(queue)) != NULL)
1417 		kfree_skb(skb);
1418 
1419 	ip_cork_release(cork);
1420 }
1421 
1422 void ip_flush_pending_frames(struct sock *sk)
1423 {
1424 	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1425 }
1426 
1427 struct sk_buff *ip_make_skb(struct sock *sk,
1428 			    struct flowi4 *fl4,
1429 			    int getfrag(void *from, char *to, int offset,
1430 					int len, int odd, struct sk_buff *skb),
1431 			    void *from, int length, int transhdrlen,
1432 			    struct ipcm_cookie *ipc, struct rtable **rtp,
1433 			    unsigned int flags)
1434 {
1435 	struct inet_cork cork;
1436 	struct sk_buff_head queue;
1437 	int err;
1438 
1439 	if (flags & MSG_PROBE)
1440 		return NULL;
1441 
1442 	__skb_queue_head_init(&queue);
1443 
1444 	cork.flags = 0;
1445 	cork.addr = 0;
1446 	cork.opt = NULL;
1447 	err = ip_setup_cork(sk, &cork, ipc, rtp);
1448 	if (err)
1449 		return ERR_PTR(err);
1450 
1451 	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1452 			       from, length, transhdrlen, flags);
1453 	if (err) {
1454 		__ip_flush_pending_frames(sk, &queue, &cork);
1455 		return ERR_PTR(err);
1456 	}
1457 
1458 	return __ip_make_skb(sk, fl4, &queue, &cork);
1459 }
1460 
1461 /*
1462  *	Fetch data from kernel space and fill in checksum if needed.
1463  */
1464 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1465 			      int len, int odd, struct sk_buff *skb)
1466 {
1467 	__wsum csum;
1468 
1469 	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1470 	skb->csum = csum_block_add(skb->csum, csum, odd);
1471 	return 0;
1472 }
1473 
1474 /*
1475  *	Generic function to send a packet as a reply to another packet.
1476  *	Used to send TCP resets so far. ICMP should use this function too.
1477  *
1478  *	Should run single-threaded per socket because it uses the sock
1479  *	structure to pass arguments.
1480  */
1481 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1482 		   const struct ip_reply_arg *arg, unsigned int len)
1483 {
1484 	struct inet_sock *inet = inet_sk(sk);
1485 	struct ip_options_data replyopts;
1486 	struct ipcm_cookie ipc;
1487 	struct flowi4 fl4;
1488 	struct rtable *rt = skb_rtable(skb);
1489 
1490 	if (ip_options_echo(&replyopts.opt.opt, skb))
1491 		return;
1492 
1493 	ipc.addr = daddr;
1494 	ipc.opt = NULL;
1495 	ipc.tx_flags = 0;
1496 
1497 	if (replyopts.opt.opt.optlen) {
1498 		ipc.opt = &replyopts.opt;
1499 
1500 		if (replyopts.opt.opt.srr)
1501 			daddr = replyopts.opt.opt.faddr;
1502 	}
1503 
1504 	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1505 			   RT_TOS(arg->tos),
1506 			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
1507 			   ip_reply_arg_flowi_flags(arg),
1508 			   daddr, rt->rt_spec_dst,
1509 			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1510 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1511 	rt = ip_route_output_key(sock_net(sk), &fl4);
1512 	if (IS_ERR(rt))
1513 		return;
1514 
1515 	/* And let IP do all the hard work.
1516 
1517 	   This chunk is not reentrant, hence the spinlock.
1518 	   Note that it relies on the fact that this function is called
1519 	   with BHs disabled locally and that sk cannot already be spinlocked.
1520 	 */
1521 	bh_lock_sock(sk);
1522 	inet->tos = arg->tos;
1523 	sk->sk_priority = skb->priority;
1524 	sk->sk_protocol = ip_hdr(skb)->protocol;
1525 	sk->sk_bound_dev_if = arg->bound_dev_if;
1526 	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1527 		       &ipc, &rt, MSG_DONTWAIT);
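	/* If the caller asked for it, patch the transport checksum:
	 * arg->csumoffset is the offset of the checksum field within the
	 * transport header, counted in 16-bit words.
	 */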
1528 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1529 		if (arg->csumoffset >= 0)
1530 			*((__sum16 *)skb_transport_header(skb) +
1531 			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1532 								arg->csum));
1533 		skb->ip_summed = CHECKSUM_NONE;
1534 		ip_push_pending_frames(sk, &fl4);
1535 	}
1536 
1537 	bh_unlock_sock(sk);
1538 
1539 	ip_rt_put(rt);
1540 }
1541 
1542 void __init ip_init(void)
1543 {
1544 	ip_rt_init();
1545 	inet_initpeers();
1546 
1547 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1548 	igmp_mc_proc_init();
1549 #endif
1550 }
1551