xref: /linux/net/ipv4/ip_output.c (revision 7c43185138cf523b0810ffd2c9e18e2ecb356730)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		The Internet Protocol (IP) output module.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Donald Becker, <becker@super.org>
11  *		Alan Cox, <Alan.Cox@linux.org>
12  *		Richard Underwood
13  *		Stefan Becker, <stefanb@yello.ping.de>
14  *		Jorge Cwik, <jorge@laser.satlink.net>
15  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16  *		Hirokazu Takahashi, <taka@valinux.co.jp>
17  *
18  *	See ip_input.c for original log
19  *
20  *	Fixes:
21  *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22  *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23  *		Bradford Johnson:	Fix faulty handling of some frames when
24  *					no route is found.
25  *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26  *					(in case the packet is not accepted by
27  *					output firewall rules)
28  *		Mike McLagan	:	Routing by source
29  *		Alexey Kuznetsov:	use new route cache
30  *		Andi Kleen:		Fix broken PMTU recovery and remove
31  *					some redundant tests.
32  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
33  *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34  *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35  *					for decreased register pressure on x86
36  *					and more readability.
37  *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38  *					silently drop skb instead of failing with -EPERM.
39  *		Detlev Wengorz	:	Copy protocol for fragments.
40  *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41  *					datagrams.
42  *		Hirokazu Takahashi:	sendfile() on UDP works now.
43  */
44 
45 #include <asm/uaccess.h>
46 #include <asm/system.h>
47 #include <linux/module.h>
48 #include <linux/types.h>
49 #include <linux/kernel.h>
50 #include <linux/mm.h>
51 #include <linux/string.h>
52 #include <linux/errno.h>
53 #include <linux/highmem.h>
54 #include <linux/slab.h>
55 
56 #include <linux/socket.h>
57 #include <linux/sockios.h>
58 #include <linux/in.h>
59 #include <linux/inet.h>
60 #include <linux/netdevice.h>
61 #include <linux/etherdevice.h>
62 #include <linux/proc_fs.h>
63 #include <linux/stat.h>
64 #include <linux/init.h>
65 
66 #include <net/snmp.h>
67 #include <net/ip.h>
68 #include <net/protocol.h>
69 #include <net/route.h>
70 #include <net/xfrm.h>
71 #include <linux/skbuff.h>
72 #include <net/sock.h>
73 #include <net/arp.h>
74 #include <net/icmp.h>
75 #include <net/checksum.h>
76 #include <net/inetpeer.h>
77 #include <linux/igmp.h>
78 #include <linux/netfilter_ipv4.h>
79 #include <linux/netfilter_bridge.h>
80 #include <linux/mroute.h>
81 #include <linux/netlink.h>
82 #include <linux/tcp.h>
83 
84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85 EXPORT_SYMBOL(sysctl_ip_default_ttl);
86 
87 /* Generate a checksum for an outgoing IP datagram. */
88 __inline__ void ip_send_check(struct iphdr *iph)
89 {
90 	iph->check = 0;
91 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92 }
93 EXPORT_SYMBOL(ip_send_check);
94 
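/*
 * Finish a locally generated packet: fill in the total length, compute the
 * header checksum and run the skb through the NF_INET_LOCAL_OUT hook;
 * ip_local_out() then hands it to dst_output() when the hook returns 1
 * (packet accepted and not queued or stolen).
 */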
95 int __ip_local_out(struct sk_buff *skb)
96 {
97 	struct iphdr *iph = ip_hdr(skb);
98 
99 	iph->tot_len = htons(skb->len);
100 	ip_send_check(iph);
101 	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102 		       skb_dst(skb)->dev, dst_output);
103 }
104 
105 int ip_local_out(struct sk_buff *skb)
106 {
107 	int err;
108 
109 	err = __ip_local_out(skb);
110 	if (likely(err == 1))
111 		err = dst_output(skb);
112 
113 	return err;
114 }
115 EXPORT_SYMBOL_GPL(ip_local_out);
116 
117 /* dev_loopback_xmit for use with netfilter. */
118 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119 {
120 	skb_reset_mac_header(newskb);
121 	__skb_pull(newskb, skb_network_offset(newskb));
122 	newskb->pkt_type = PACKET_LOOPBACK;
123 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
124 	WARN_ON(!skb_dst(newskb));
125 	netif_rx_ni(newskb);
126 	return 0;
127 }
128 
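/*
 * TTL for unicast packets: the socket's IP_TTL setting when configured,
 * otherwise the default hop limit of the route.
 */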
129 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130 {
131 	int ttl = inet->uc_ttl;
132 
133 	if (ttl < 0)
134 		ttl = ip4_dst_hoplimit(dst);
135 	return ttl;
136 }
137 
138 /*
139  *		Add an ip header to a skbuff and send it out.
140  *
141  */
142 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143 			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
144 {
145 	struct inet_sock *inet = inet_sk(sk);
146 	struct rtable *rt = skb_rtable(skb);
147 	struct iphdr *iph;
148 
149 	/* Build the IP header. */
150 	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
151 	skb_reset_network_header(skb);
152 	iph = ip_hdr(skb);
153 	iph->version  = 4;
154 	iph->ihl      = 5;
155 	iph->tos      = inet->tos;
156 	if (ip_dont_fragment(sk, &rt->dst))
157 		iph->frag_off = htons(IP_DF);
158 	else
159 		iph->frag_off = 0;
160 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
161 	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
162 	iph->saddr    = saddr;
163 	iph->protocol = sk->sk_protocol;
164 	ip_select_ident(iph, &rt->dst, sk);
165 
166 	if (opt && opt->opt.optlen) {
167 		iph->ihl += opt->opt.optlen>>2;
168 		ip_options_build(skb, &opt->opt, daddr, rt, 0);
169 	}
170 
171 	skb->priority = sk->sk_priority;
172 	skb->mark = sk->sk_mark;
173 
174 	/* Send it out. */
175 	return ip_local_out(skb);
176 }
177 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178 
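/*
 * Last step of the output path: account multicast/broadcast output, make
 * sure there is enough headroom for the link-layer header (reallocating the
 * skb if necessary) and hand the packet to the neighbour layer for
 * transmission.
 */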
179 static inline int ip_finish_output2(struct sk_buff *skb)
180 {
181 	struct dst_entry *dst = skb_dst(skb);
182 	struct rtable *rt = (struct rtable *)dst;
183 	struct net_device *dev = dst->dev;
184 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 	struct neighbour *neigh;
186 
187 	if (rt->rt_type == RTN_MULTICAST) {
188 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
189 	} else if (rt->rt_type == RTN_BROADCAST)
190 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
191 
192 	/* Be paranoid, rather than too clever. */
193 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
194 		struct sk_buff *skb2;
195 
196 		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
197 		if (skb2 == NULL) {
198 			kfree_skb(skb);
199 			return -ENOMEM;
200 		}
201 		if (skb->sk)
202 			skb_set_owner_w(skb2, skb->sk);
203 		kfree_skb(skb);
204 		skb = skb2;
205 	}
206 
207 	rcu_read_lock();
208 	neigh = dst_get_neighbour(dst);
209 	if (neigh) {
210 		int res = neigh_output(neigh, skb);
211 
212 		rcu_read_unlock();
213 		return res;
214 	}
215 	rcu_read_unlock();
216 
217 	if (net_ratelimit())
218 		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
219 	kfree_skb(skb);
220 	return -EINVAL;
221 }
222 
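/*
 * MTU to use for this skb: the device MTU when the sending socket does its
 * own MTU probing (IP_PMTUDISC_PROBE), otherwise the path MTU of the route.
 */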
223 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
224 {
225 	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
226 
227 	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
228 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
229 }
230 
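/*
 * Called from the POST_ROUTING hook: re-route the packet if an xfrm policy
 * was attached after SNAT, fragment it when it exceeds the destination MTU
 * and is not GSO, otherwise pass it straight to ip_finish_output2().
 */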
231 static int ip_finish_output(struct sk_buff *skb)
232 {
233 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
234 	/* Policy lookup after SNAT yielded a new policy */
235 	if (skb_dst(skb)->xfrm != NULL) {
236 		IPCB(skb)->flags |= IPSKB_REROUTED;
237 		return dst_output(skb);
238 	}
239 #endif
240 	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
241 		return ip_fragment(skb, ip_finish_output2);
242 	else
243 		return ip_finish_output2(skb);
244 }
245 
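/*
 * Output path for packets using a multicast or broadcast route: loop a
 * clone back to local listeners when required, never let TTL 0 multicasts
 * leave the host and send the original through the POST_ROUTING hook.
 */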
246 int ip_mc_output(struct sk_buff *skb)
247 {
248 	struct sock *sk = skb->sk;
249 	struct rtable *rt = skb_rtable(skb);
250 	struct net_device *dev = rt->dst.dev;
251 
252 	/*
253 	 *	If the indicated interface is up and running, send the packet.
254 	 */
255 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
256 
257 	skb->dev = dev;
258 	skb->protocol = htons(ETH_P_IP);
259 
260 	/*
261 	 *	Multicasts are looped back for other local users
262 	 */
263 
264 	if (rt->rt_flags&RTCF_MULTICAST) {
265 		if (sk_mc_loop(sk)
266 #ifdef CONFIG_IP_MROUTE
267 		/* Small optimization: do not loop back non-local frames
268 		   that were returned after forwarding; they will be dropped
269 		   by ip_mr_input in any case.
270 		   Note that local frames are looped back so that they are
271 		   delivered to local recipients.
272 
273 		   This check is duplicated in ip_mr_input at the moment.
274 		 */
275 		    &&
276 		    ((rt->rt_flags & RTCF_LOCAL) ||
277 		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
278 #endif
279 		   ) {
280 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
281 			if (newskb)
282 				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
283 					newskb, NULL, newskb->dev,
284 					ip_dev_loopback_xmit);
285 		}
286 
287 		/* Multicasts with ttl 0 must not go beyond the host */
288 
289 		if (ip_hdr(skb)->ttl == 0) {
290 			kfree_skb(skb);
291 			return 0;
292 		}
293 	}
294 
295 	if (rt->rt_flags&RTCF_BROADCAST) {
296 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
297 		if (newskb)
298 			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
299 				NULL, newskb->dev, ip_dev_loopback_xmit);
300 	}
301 
302 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
303 			    skb->dev, ip_finish_output,
304 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
305 }
306 
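/*
 * Standard dst->output routine for unicast packets: update output stats,
 * set the outgoing device and protocol and pass the skb through the
 * POST_ROUTING hook (bypassed for already re-routed packets) into
 * ip_finish_output().
 */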
307 int ip_output(struct sk_buff *skb)
308 {
309 	struct net_device *dev = skb_dst(skb)->dev;
310 
311 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
312 
313 	skb->dev = dev;
314 	skb->protocol = htons(ETH_P_IP);
315 
316 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
317 			    ip_finish_output,
318 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
319 }
320 
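/*
 * Transmit a packet for a connected socket (e.g. TCP): look up or
 * re-validate the cached route, build the IP header including any IP
 * options and the DF bit, pick an IP ID and hand the skb to ip_local_out().
 */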
321 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
322 {
323 	struct sock *sk = skb->sk;
324 	struct inet_sock *inet = inet_sk(sk);
325 	struct ip_options_rcu *inet_opt;
326 	struct flowi4 *fl4;
327 	struct rtable *rt;
328 	struct iphdr *iph;
329 	int res;
330 
331 	/* Skip all of this if the packet is already routed,
332 	 * e.g. by something like SCTP.
333 	 */
334 	rcu_read_lock();
335 	inet_opt = rcu_dereference(inet->inet_opt);
336 	fl4 = &fl->u.ip4;
337 	rt = skb_rtable(skb);
338 	if (rt != NULL)
339 		goto packet_routed;
340 
341 	/* Make sure we can route this packet. */
342 	rt = (struct rtable *)__sk_dst_check(sk, 0);
343 	if (rt == NULL) {
344 		__be32 daddr;
345 
346 		/* Use correct destination address if we have options. */
347 		daddr = inet->inet_daddr;
348 		if (inet_opt && inet_opt->opt.srr)
349 			daddr = inet_opt->opt.faddr;
350 
351 		/* If this fails, the retransmit mechanism of the transport layer
352 		 * will keep trying until the route appears or the connection
353 		 * times itself out.
354 		 */
355 		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
356 					   daddr, inet->inet_saddr,
357 					   inet->inet_dport,
358 					   inet->inet_sport,
359 					   sk->sk_protocol,
360 					   RT_CONN_FLAGS(sk),
361 					   sk->sk_bound_dev_if);
362 		if (IS_ERR(rt))
363 			goto no_route;
364 		sk_setup_caps(sk, &rt->dst);
365 	}
366 	skb_dst_set_noref(skb, &rt->dst);
367 
368 packet_routed:
369 	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
370 		goto no_route;
371 
372 	/* OK, we know where to send it, allocate and build IP header. */
373 	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
374 	skb_reset_network_header(skb);
375 	iph = ip_hdr(skb);
376 	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
377 	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
378 		iph->frag_off = htons(IP_DF);
379 	else
380 		iph->frag_off = 0;
381 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
382 	iph->protocol = sk->sk_protocol;
383 	iph->saddr    = fl4->saddr;
384 	iph->daddr    = fl4->daddr;
385 	/* The transport layer has set skb->h.foo itself. */
386 
387 	if (inet_opt && inet_opt->opt.optlen) {
388 		iph->ihl += inet_opt->opt.optlen >> 2;
389 		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
390 	}
391 
392 	ip_select_ident_more(iph, &rt->dst, sk,
393 			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
394 
395 	skb->priority = sk->sk_priority;
396 	skb->mark = sk->sk_mark;
397 
398 	res = ip_local_out(skb);
399 	rcu_read_unlock();
400 	return res;
401 
402 no_route:
403 	rcu_read_unlock();
404 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
405 	kfree_skb(skb);
406 	return -EHOSTUNREACH;
407 }
408 EXPORT_SYMBOL(ip_queue_xmit);
409 
410 
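/*
 * Copy per-packet metadata (type, priority, dst, mark, netfilter/IPVS and
 * security state) from the original skb to a newly created fragment.
 */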
411 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
412 {
413 	to->pkt_type = from->pkt_type;
414 	to->priority = from->priority;
415 	to->protocol = from->protocol;
416 	skb_dst_drop(to);
417 	skb_dst_copy(to, from);
418 	to->dev = from->dev;
419 	to->mark = from->mark;
420 
421 	/* Copy the flags to each fragment. */
422 	IPCB(to)->flags = IPCB(from)->flags;
423 
424 #ifdef CONFIG_NET_SCHED
425 	to->tc_index = from->tc_index;
426 #endif
427 	nf_copy(to, from);
428 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
429     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
430 	to->nf_trace = from->nf_trace;
431 #endif
432 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
433 	to->ipvs_property = from->ipvs_property;
434 #endif
435 	skb_copy_secmark(to, from);
436 }
437 
438 /*
439  *	This IP datagram is too large to be sent in one piece.  Break it up into
440  *	smaller pieces (each of a size equal to the IP header plus
441  *	a block of the original datagram's data) that will still fit in a
442  *	single device frame, and queue each such frame for sending.
443  */
444 
445 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
446 {
447 	struct iphdr *iph;
448 	int ptr;
449 	struct net_device *dev;
450 	struct sk_buff *skb2;
451 	unsigned int mtu, hlen, left, len, ll_rs;
452 	int offset;
453 	__be16 not_last_frag;
454 	struct rtable *rt = skb_rtable(skb);
455 	int err = 0;
456 
457 	dev = rt->dst.dev;
458 
459 	/*
460 	 *	Point into the IP datagram header.
461 	 */
462 
463 	iph = ip_hdr(skb);
464 
465 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
466 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
467 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
468 			  htonl(ip_skb_dst_mtu(skb)));
469 		kfree_skb(skb);
470 		return -EMSGSIZE;
471 	}
472 
473 	/*
474 	 *	Setup starting values.
475 	 */
476 
477 	hlen = iph->ihl * 4;
478 	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
479 #ifdef CONFIG_BRIDGE_NETFILTER
480 	if (skb->nf_bridge)
481 		mtu -= nf_bridge_mtu_reduction(skb);
482 #endif
483 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
484 
485 	/* When frag_list is given, use it. First, check its validity:
486 	 * some transformers could create a wrong frag_list or break an
487 	 * existing one; this is not prohibited. In that case fall back to copying.
488 	 *
489 	 * LATER: this step can be merged into the real generation of fragments;
490 	 * we can switch to copying when we see the first bad fragment.
491 	 */
492 	if (skb_has_frag_list(skb)) {
493 		struct sk_buff *frag, *frag2;
494 		int first_len = skb_pagelen(skb);
495 
496 		if (first_len - hlen > mtu ||
497 		    ((first_len - hlen) & 7) ||
498 		    ip_is_fragment(iph) ||
499 		    skb_cloned(skb))
500 			goto slow_path;
501 
502 		skb_walk_frags(skb, frag) {
503 			/* Correct geometry. */
504 			if (frag->len > mtu ||
505 			    ((frag->len & 7) && frag->next) ||
506 			    skb_headroom(frag) < hlen)
507 				goto slow_path_clean;
508 
509 			/* Partially cloned skb? */
510 			if (skb_shared(frag))
511 				goto slow_path_clean;
512 
513 			BUG_ON(frag->sk);
514 			if (skb->sk) {
515 				frag->sk = skb->sk;
516 				frag->destructor = sock_wfree;
517 			}
518 			skb->truesize -= frag->truesize;
519 		}
520 
521 		/* Everything is OK. Generate! */
522 
523 		err = 0;
524 		offset = 0;
525 		frag = skb_shinfo(skb)->frag_list;
526 		skb_frag_list_init(skb);
527 		skb->data_len = first_len - skb_headlen(skb);
528 		skb->len = first_len;
529 		iph->tot_len = htons(first_len);
530 		iph->frag_off = htons(IP_MF);
531 		ip_send_check(iph);
532 
533 		for (;;) {
534 			/* Prepare the header of the next frame
535 			 * before the previous one goes down. */
536 			if (frag) {
537 				frag->ip_summed = CHECKSUM_NONE;
538 				skb_reset_transport_header(frag);
539 				__skb_push(frag, hlen);
540 				skb_reset_network_header(frag);
541 				memcpy(skb_network_header(frag), iph, hlen);
542 				iph = ip_hdr(frag);
543 				iph->tot_len = htons(frag->len);
544 				ip_copy_metadata(frag, skb);
545 				if (offset == 0)
546 					ip_options_fragment(frag);
547 				offset += skb->len - hlen;
548 				iph->frag_off = htons(offset>>3);
549 				if (frag->next != NULL)
550 					iph->frag_off |= htons(IP_MF);
551 				/* Ready, complete checksum */
552 				ip_send_check(iph);
553 			}
554 
555 			err = output(skb);
556 
557 			if (!err)
558 				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
559 			if (err || !frag)
560 				break;
561 
562 			skb = frag;
563 			frag = skb->next;
564 			skb->next = NULL;
565 		}
566 
567 		if (err == 0) {
568 			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
569 			return 0;
570 		}
571 
572 		while (frag) {
573 			skb = frag->next;
574 			kfree_skb(frag);
575 			frag = skb;
576 		}
577 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
578 		return err;
579 
580 slow_path_clean:
581 		skb_walk_frags(skb, frag2) {
582 			if (frag2 == frag)
583 				break;
584 			frag2->sk = NULL;
585 			frag2->destructor = NULL;
586 			skb->truesize += frag2->truesize;
587 		}
588 	}
589 
590 slow_path:
591 	left = skb->len - hlen;		/* Space per frame */
592 	ptr = hlen;		/* Where to start from */
593 
594 	/* For bridged IP traffic encapsulated inside e.g. a VLAN header,
595 	 * we need to make room for the encapsulating header.
596 	 */
597 	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
598 
599 	/*
600 	 *	Fragment the datagram.
601 	 */
602 
603 	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
604 	not_last_frag = iph->frag_off & htons(IP_MF);
605 
606 	/*
607 	 *	Keep copying data until we run out.
608 	 */
609 
610 	while (left > 0) {
611 		len = left;
612 		/* IF: it doesn't fit, use 'mtu' - the data space left */
613 		if (len > mtu)
614 			len = mtu;
615 		/* IF: we are not sending up to and including the packet end,
616 		   then align the next fragment start on an eight-byte boundary */
617 		if (len < left)	{
618 			len &= ~7;
619 		}
620 		/*
621 		 *	Allocate buffer.
622 		 */
623 
624 		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
625 			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
626 			err = -ENOMEM;
627 			goto fail;
628 		}
629 
630 		/*
631 		 *	Set up data on packet
632 		 */
633 
634 		ip_copy_metadata(skb2, skb);
635 		skb_reserve(skb2, ll_rs);
636 		skb_put(skb2, len + hlen);
637 		skb_reset_network_header(skb2);
638 		skb2->transport_header = skb2->network_header + hlen;
639 
640 		/*
641 		 *	Charge the memory for the fragment to any owner
642 		 *	it might possess
643 		 */
644 
645 		if (skb->sk)
646 			skb_set_owner_w(skb2, skb->sk);
647 
648 		/*
649 		 *	Copy the packet header into the new buffer.
650 		 */
651 
652 		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
653 
654 		/*
655 		 *	Copy a block of the IP datagram.
656 		 */
657 		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
658 			BUG();
659 		left -= len;
660 
661 		/*
662 		 *	Fill in the new header fields.
663 		 */
664 		iph = ip_hdr(skb2);
665 		iph->frag_off = htons((offset >> 3));
666 
667 		/* ANK: dirty, but effective trick. Upgrade options only if
668 		 * the segment to be fragmented was THE FIRST (otherwise,
669 		 * options are already fixed) and do it ONCE
670 		 * on the initial skb, so that all the following fragments
671 		 * will inherit fixed options.
672 		 */
673 		if (offset == 0)
674 			ip_options_fragment(skb);
675 
676 		/*
677 		 *	Added AC : If we are fragmenting a fragment that's not the
678 		 *		   last fragment then keep the MF bit set on each fragment
679 		 */
680 		if (left > 0 || not_last_frag)
681 			iph->frag_off |= htons(IP_MF);
682 		ptr += len;
683 		offset += len;
684 
685 		/*
686 		 *	Put this fragment into the sending queue.
687 		 */
688 		iph->tot_len = htons(len + hlen);
689 
690 		ip_send_check(iph);
691 
692 		err = output(skb2);
693 		if (err)
694 			goto fail;
695 
696 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
697 	}
698 	kfree_skb(skb);
699 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
700 	return err;
701 
702 fail:
703 	kfree_skb(skb);
704 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
705 	return err;
706 }
707 EXPORT_SYMBOL(ip_fragment);
708 
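/*
 * getfrag() helper for data coming from user space: copy from the iovec
 * into the skb and, unless the hardware will checksum the packet
 * (CHECKSUM_PARTIAL), fold a software checksum into skb->csum on the way.
 */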
709 int
710 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
711 {
712 	struct iovec *iov = from;
713 
714 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
715 		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
716 			return -EFAULT;
717 	} else {
718 		__wsum csum = 0;
719 		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
720 			return -EFAULT;
721 		skb->csum = csum_block_add(skb->csum, csum, odd);
722 	}
723 	return 0;
724 }
725 EXPORT_SYMBOL(ip_generic_getfrag);
726 
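/*
 * Checksum 'copy' bytes of a (possibly highmem) page starting at 'offset'.
 */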
727 static inline __wsum
728 csum_page(struct page *page, int offset, int copy)
729 {
730 	char *kaddr;
731 	__wsum csum;
732 	kaddr = kmap(page);
733 	csum = csum_partial(kaddr + offset, copy, 0);
734 	kunmap(page);
735 	return csum;
736 }
737 
738 static inline int ip_ufo_append_data(struct sock *sk,
739 			struct sk_buff_head *queue,
740 			int getfrag(void *from, char *to, int offset, int len,
741 			       int odd, struct sk_buff *skb),
742 			void *from, int length, int hh_len, int fragheaderlen,
743 			int transhdrlen, int maxfraglen, unsigned int flags)
744 {
745 	struct sk_buff *skb;
746 	int err;
747 
748 	/* The network device supports UDP fragmentation offload, so
749 	 * create one single skb containing the complete UDP
750 	 * datagram.
751 	 */
752 	if ((skb = skb_peek_tail(queue)) == NULL) {
753 		skb = sock_alloc_send_skb(sk,
754 			hh_len + fragheaderlen + transhdrlen + 20,
755 			(flags & MSG_DONTWAIT), &err);
756 
757 		if (skb == NULL)
758 			return err;
759 
760 		/* reserve space for Hardware header */
761 		skb_reserve(skb, hh_len);
762 
763 		/* create space for UDP/IP header */
764 		skb_put(skb, fragheaderlen + transhdrlen);
765 
766 		/* initialize network header pointer */
767 		skb_reset_network_header(skb);
768 
769 		/* initialize protocol header pointer */
770 		skb->transport_header = skb->network_header + fragheaderlen;
771 
772 		skb->ip_summed = CHECKSUM_PARTIAL;
773 		skb->csum = 0;
774 
775 		/* specify the length of each IP datagram fragment */
776 		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
777 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
778 		__skb_queue_tail(queue, skb);
779 	}
780 
781 	return skb_append_datato_frags(sk, skb, getfrag, from,
782 				       (length - transhdrlen));
783 }
784 
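/*
 * Core of ip_append_data(): fetch 'length' bytes via getfrag() and append
 * them to the given queue, growing or chaining skbs so that each queued skb
 * becomes one IP fragment of at most the cork's fragment size; UFO-capable
 * devices get a single large GSO skb instead.
 */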
785 static int __ip_append_data(struct sock *sk,
786 			    struct flowi4 *fl4,
787 			    struct sk_buff_head *queue,
788 			    struct inet_cork *cork,
789 			    int getfrag(void *from, char *to, int offset,
790 					int len, int odd, struct sk_buff *skb),
791 			    void *from, int length, int transhdrlen,
792 			    unsigned int flags)
793 {
794 	struct inet_sock *inet = inet_sk(sk);
795 	struct sk_buff *skb;
796 
797 	struct ip_options *opt = cork->opt;
798 	int hh_len;
799 	int exthdrlen;
800 	int mtu;
801 	int copy;
802 	int err;
803 	int offset = 0;
804 	unsigned int maxfraglen, fragheaderlen;
805 	int csummode = CHECKSUM_NONE;
806 	struct rtable *rt = (struct rtable *)cork->dst;
807 
808 	skb = skb_peek_tail(queue);
809 
810 	exthdrlen = !skb ? rt->dst.header_len : 0;
811 	mtu = cork->fragsize;
812 
813 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
814 
815 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
816 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
817 
818 	if (cork->length + length > 0xFFFF - fragheaderlen) {
819 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
820 			       mtu-exthdrlen);
821 		return -EMSGSIZE;
822 	}
823 
824 	/*
825 	 * transhdrlen > 0 means that this is the first fragment and we wish
826 	 * it not to be fragmented in the future.
827 	 */
828 	if (transhdrlen &&
829 	    length + fragheaderlen <= mtu &&
830 	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
831 	    !exthdrlen)
832 		csummode = CHECKSUM_PARTIAL;
833 
834 	cork->length += length;
835 	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
836 	    (sk->sk_protocol == IPPROTO_UDP) &&
837 	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
838 		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
839 					 hh_len, fragheaderlen, transhdrlen,
840 					 maxfraglen, flags);
841 		if (err)
842 			goto error;
843 		return 0;
844 	}
845 
846 	/* So, what's going on in the loop below?
847 	 *
848 	 * We use the calculated fragment length to generate a chain of skbs;
849 	 * each segment is an IP fragment ready to be sent to the network once
850 	 * the appropriate IP header has been added.
851 	 */
852 
853 	if (!skb)
854 		goto alloc_new_skb;
855 
856 	while (length > 0) {
857 		/* Check if the remaining data fits into current packet. */
858 		copy = mtu - skb->len;
859 		if (copy < length)
860 			copy = maxfraglen - skb->len;
861 		if (copy <= 0) {
862 			char *data;
863 			unsigned int datalen;
864 			unsigned int fraglen;
865 			unsigned int fraggap;
866 			unsigned int alloclen;
867 			struct sk_buff *skb_prev;
868 alloc_new_skb:
869 			skb_prev = skb;
870 			if (skb_prev)
871 				fraggap = skb_prev->len - maxfraglen;
872 			else
873 				fraggap = 0;
874 
875 			/*
876 			 * If remaining data exceeds the mtu,
877 			 * we know we need more fragment(s).
878 			 */
879 			datalen = length + fraggap;
880 			if (datalen > mtu - fragheaderlen)
881 				datalen = maxfraglen - fragheaderlen;
882 			fraglen = datalen + fragheaderlen;
883 
884 			if ((flags & MSG_MORE) &&
885 			    !(rt->dst.dev->features&NETIF_F_SG))
886 				alloclen = mtu;
887 			else
888 				alloclen = fraglen;
889 
890 			alloclen += exthdrlen;
891 
892 			/* The last fragment gets additional space at tail.
893 			 * Note that with MSG_MORE we overallocate on fragments,
894 			 * because we have no idea which fragment will be
895 			 * the last.
896 			 */
897 			if (datalen == length + fraggap)
898 				alloclen += rt->dst.trailer_len;
899 
900 			if (transhdrlen) {
901 				skb = sock_alloc_send_skb(sk,
902 						alloclen + hh_len + 15,
903 						(flags & MSG_DONTWAIT), &err);
904 			} else {
905 				skb = NULL;
906 				if (atomic_read(&sk->sk_wmem_alloc) <=
907 				    2 * sk->sk_sndbuf)
908 					skb = sock_wmalloc(sk,
909 							   alloclen + hh_len + 15, 1,
910 							   sk->sk_allocation);
911 				if (unlikely(skb == NULL))
912 					err = -ENOBUFS;
913 				else
914 					/* only the initial fragment is
915 					   time stamped */
916 					cork->tx_flags = 0;
917 			}
918 			if (skb == NULL)
919 				goto error;
920 
921 			/*
922 			 *	Fill in the control structures
923 			 */
924 			skb->ip_summed = csummode;
925 			skb->csum = 0;
926 			skb_reserve(skb, hh_len);
927 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
928 
929 			/*
930 			 *	Find where to start putting bytes.
931 			 */
932 			data = skb_put(skb, fraglen + exthdrlen);
933 			skb_set_network_header(skb, exthdrlen);
934 			skb->transport_header = (skb->network_header +
935 						 fragheaderlen);
936 			data += fragheaderlen + exthdrlen;
937 
938 			if (fraggap) {
939 				skb->csum = skb_copy_and_csum_bits(
940 					skb_prev, maxfraglen,
941 					data + transhdrlen, fraggap, 0);
942 				skb_prev->csum = csum_sub(skb_prev->csum,
943 							  skb->csum);
944 				data += fraggap;
945 				pskb_trim_unique(skb_prev, maxfraglen);
946 			}
947 
948 			copy = datalen - transhdrlen - fraggap;
949 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
950 				err = -EFAULT;
951 				kfree_skb(skb);
952 				goto error;
953 			}
954 
955 			offset += copy;
956 			length -= datalen - fraggap;
957 			transhdrlen = 0;
958 			exthdrlen = 0;
959 			csummode = CHECKSUM_NONE;
960 
961 			/*
962 			 * Put the packet on the pending queue.
963 			 */
964 			__skb_queue_tail(queue, skb);
965 			continue;
966 		}
967 
968 		if (copy > length)
969 			copy = length;
970 
971 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
972 			unsigned int off;
973 
974 			off = skb->len;
975 			if (getfrag(from, skb_put(skb, copy),
976 					offset, copy, off, skb) < 0) {
977 				__skb_trim(skb, off);
978 				err = -EFAULT;
979 				goto error;
980 			}
981 		} else {
982 			int i = skb_shinfo(skb)->nr_frags;
983 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
984 			struct page *page = cork->page;
985 			int off = cork->off;
986 			unsigned int left;
987 
988 			if (page && (left = PAGE_SIZE - off) > 0) {
989 				if (copy >= left)
990 					copy = left;
991 				if (page != frag->page) {
992 					if (i == MAX_SKB_FRAGS) {
993 						err = -EMSGSIZE;
994 						goto error;
995 					}
996 					get_page(page);
997 					skb_fill_page_desc(skb, i, page, off, 0);
998 					frag = &skb_shinfo(skb)->frags[i];
999 				}
1000 			} else if (i < MAX_SKB_FRAGS) {
1001 				if (copy > PAGE_SIZE)
1002 					copy = PAGE_SIZE;
1003 				page = alloc_pages(sk->sk_allocation, 0);
1004 				if (page == NULL)  {
1005 					err = -ENOMEM;
1006 					goto error;
1007 				}
1008 				cork->page = page;
1009 				cork->off = 0;
1010 
1011 				skb_fill_page_desc(skb, i, page, 0, 0);
1012 				frag = &skb_shinfo(skb)->frags[i];
1013 			} else {
1014 				err = -EMSGSIZE;
1015 				goto error;
1016 			}
1017 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1018 				err = -EFAULT;
1019 				goto error;
1020 			}
1021 			cork->off += copy;
1022 			frag->size += copy;
1023 			skb->len += copy;
1024 			skb->data_len += copy;
1025 			skb->truesize += copy;
1026 			atomic_add(copy, &sk->sk_wmem_alloc);
1027 		}
1028 		offset += copy;
1029 		length -= copy;
1030 	}
1031 
1032 	return 0;
1033 
1034 error:
1035 	cork->length -= length;
1036 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1037 	return err;
1038 }
1039 
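/*
 * Initialise the per-datagram cork state: copy the caller's IP options,
 * steal the route reference from *rtp and record the fragment size to use
 * for this corked datagram.
 */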
1040 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1041 			 struct ipcm_cookie *ipc, struct rtable **rtp)
1042 {
1043 	struct inet_sock *inet = inet_sk(sk);
1044 	struct ip_options_rcu *opt;
1045 	struct rtable *rt;
1046 
1047 	/*
1048 	 * setup for corking.
1049 	 */
1050 	opt = ipc->opt;
1051 	if (opt) {
1052 		if (cork->opt == NULL) {
1053 			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1054 					    sk->sk_allocation);
1055 			if (unlikely(cork->opt == NULL))
1056 				return -ENOBUFS;
1057 		}
1058 		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1059 		cork->flags |= IPCORK_OPT;
1060 		cork->addr = ipc->addr;
1061 	}
1062 	rt = *rtp;
1063 	if (unlikely(!rt))
1064 		return -EFAULT;
1065 	/*
1066 	 * We steal the reference to this route; the caller must not release it.
1067 	 */
1068 	*rtp = NULL;
1069 	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1070 			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1071 	cork->dst = &rt->dst;
1072 	cork->length = 0;
1073 	cork->tx_flags = ipc->tx_flags;
1074 	cork->page = NULL;
1075 	cork->off = 0;
1076 
1077 	return 0;
1078 }
1079 
1080 /*
1081  *	ip_append_data() and ip_append_page() can make one large IP datagram
1082  *	from many pieces of data. Each piece will be held on the socket
1083  *	until ip_push_pending_frames() is called. Each piece can be a page
1084  *	or non-page data.
1085  *
1086  *	Besides UDP, other transport protocols - e.g. raw sockets - can
1087  *	potentially use this interface.
1088  *
1089  *	LATER: length must be adjusted by pad at tail, when it is required.
1090  */
1091 int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1092 		   int getfrag(void *from, char *to, int offset, int len,
1093 			       int odd, struct sk_buff *skb),
1094 		   void *from, int length, int transhdrlen,
1095 		   struct ipcm_cookie *ipc, struct rtable **rtp,
1096 		   unsigned int flags)
1097 {
1098 	struct inet_sock *inet = inet_sk(sk);
1099 	int err;
1100 
1101 	if (flags&MSG_PROBE)
1102 		return 0;
1103 
1104 	if (skb_queue_empty(&sk->sk_write_queue)) {
1105 		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1106 		if (err)
1107 			return err;
1108 	} else {
1109 		transhdrlen = 0;
1110 	}
1111 
1112 	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1113 				from, length, transhdrlen, flags);
1114 }
1115 
1116 ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1117 		       int offset, size_t size, int flags)
1118 {
1119 	struct inet_sock *inet = inet_sk(sk);
1120 	struct sk_buff *skb;
1121 	struct rtable *rt;
1122 	struct ip_options *opt = NULL;
1123 	struct inet_cork *cork;
1124 	int hh_len;
1125 	int mtu;
1126 	int len;
1127 	int err;
1128 	unsigned int maxfraglen, fragheaderlen, fraggap;
1129 
1130 	if (inet->hdrincl)
1131 		return -EPERM;
1132 
1133 	if (flags&MSG_PROBE)
1134 		return 0;
1135 
1136 	if (skb_queue_empty(&sk->sk_write_queue))
1137 		return -EINVAL;
1138 
1139 	cork = &inet->cork.base;
1140 	rt = (struct rtable *)cork->dst;
1141 	if (cork->flags & IPCORK_OPT)
1142 		opt = cork->opt;
1143 
1144 	if (!(rt->dst.dev->features&NETIF_F_SG))
1145 		return -EOPNOTSUPP;
1146 
1147 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1148 	mtu = cork->fragsize;
1149 
1150 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1151 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1152 
1153 	if (cork->length + size > 0xFFFF - fragheaderlen) {
1154 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1155 		return -EMSGSIZE;
1156 	}
1157 
1158 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1159 		return -EINVAL;
1160 
1161 	cork->length += size;
1162 	if ((size + skb->len > mtu) &&
1163 	    (sk->sk_protocol == IPPROTO_UDP) &&
1164 	    (rt->dst.dev->features & NETIF_F_UFO)) {
1165 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1166 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1167 	}
1168 
1169 
1170 	while (size > 0) {
1171 		int i;
1172 
1173 		if (skb_is_gso(skb))
1174 			len = size;
1175 		else {
1176 
1177 			/* Check if the remaining data fits into current packet. */
1178 			len = mtu - skb->len;
1179 			if (len < size)
1180 				len = maxfraglen - skb->len;
1181 		}
1182 		if (len <= 0) {
1183 			struct sk_buff *skb_prev;
1184 			int alloclen;
1185 
1186 			skb_prev = skb;
1187 			fraggap = skb_prev->len - maxfraglen;
1188 
1189 			alloclen = fragheaderlen + hh_len + fraggap + 15;
1190 			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1191 			if (unlikely(!skb)) {
1192 				err = -ENOBUFS;
1193 				goto error;
1194 			}
1195 
1196 			/*
1197 			 *	Fill in the control structures
1198 			 */
1199 			skb->ip_summed = CHECKSUM_NONE;
1200 			skb->csum = 0;
1201 			skb_reserve(skb, hh_len);
1202 
1203 			/*
1204 			 *	Find where to start putting bytes.
1205 			 */
1206 			skb_put(skb, fragheaderlen + fraggap);
1207 			skb_reset_network_header(skb);
1208 			skb->transport_header = (skb->network_header +
1209 						 fragheaderlen);
1210 			if (fraggap) {
1211 				skb->csum = skb_copy_and_csum_bits(skb_prev,
1212 								   maxfraglen,
1213 						    skb_transport_header(skb),
1214 								   fraggap, 0);
1215 				skb_prev->csum = csum_sub(skb_prev->csum,
1216 							  skb->csum);
1217 				pskb_trim_unique(skb_prev, maxfraglen);
1218 			}
1219 
1220 			/*
1221 			 * Put the packet on the pending queue.
1222 			 */
1223 			__skb_queue_tail(&sk->sk_write_queue, skb);
1224 			continue;
1225 		}
1226 
1227 		i = skb_shinfo(skb)->nr_frags;
1228 		if (len > size)
1229 			len = size;
1230 		if (skb_can_coalesce(skb, i, page, offset)) {
1231 			skb_shinfo(skb)->frags[i-1].size += len;
1232 		} else if (i < MAX_SKB_FRAGS) {
1233 			get_page(page);
1234 			skb_fill_page_desc(skb, i, page, offset, len);
1235 		} else {
1236 			err = -EMSGSIZE;
1237 			goto error;
1238 		}
1239 
1240 		if (skb->ip_summed == CHECKSUM_NONE) {
1241 			__wsum csum;
1242 			csum = csum_page(page, offset, len);
1243 			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1244 		}
1245 
1246 		skb->len += len;
1247 		skb->data_len += len;
1248 		skb->truesize += len;
1249 		atomic_add(len, &sk->sk_wmem_alloc);
1250 		offset += len;
1251 		size -= len;
1252 	}
1253 	return 0;
1254 
1255 error:
1256 	cork->length -= size;
1257 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1258 	return err;
1259 }
1260 
1261 static void ip_cork_release(struct inet_cork *cork)
1262 {
1263 	cork->flags &= ~IPCORK_OPT;
1264 	kfree(cork->opt);
1265 	cork->opt = NULL;
1266 	dst_release(cork->dst);
1267 	cork->dst = NULL;
1268 }
1269 
1270 /*
1271  *	Combine all pending IP fragments on the socket into one IP datagram
1272  *	and push them out.
1273  */
1274 struct sk_buff *__ip_make_skb(struct sock *sk,
1275 			      struct flowi4 *fl4,
1276 			      struct sk_buff_head *queue,
1277 			      struct inet_cork *cork)
1278 {
1279 	struct sk_buff *skb, *tmp_skb;
1280 	struct sk_buff **tail_skb;
1281 	struct inet_sock *inet = inet_sk(sk);
1282 	struct net *net = sock_net(sk);
1283 	struct ip_options *opt = NULL;
1284 	struct rtable *rt = (struct rtable *)cork->dst;
1285 	struct iphdr *iph;
1286 	__be16 df = 0;
1287 	__u8 ttl;
1288 
1289 	if ((skb = __skb_dequeue(queue)) == NULL)
1290 		goto out;
1291 	tail_skb = &(skb_shinfo(skb)->frag_list);
1292 
1293 	/* move skb->data to ip header from ext header */
1294 	if (skb->data < skb_network_header(skb))
1295 		__skb_pull(skb, skb_network_offset(skb));
1296 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1297 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1298 		*tail_skb = tmp_skb;
1299 		tail_skb = &(tmp_skb->next);
1300 		skb->len += tmp_skb->len;
1301 		skb->data_len += tmp_skb->len;
1302 		skb->truesize += tmp_skb->truesize;
1303 		tmp_skb->destructor = NULL;
1304 		tmp_skb->sk = NULL;
1305 	}
1306 
1307 	/* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we
1308 	 * allow the frame generated here to be fragmented. No matter how
1309 	 * transforms change the size of the packet, it will come out.
1310 	 */
1311 	if (inet->pmtudisc < IP_PMTUDISC_DO)
1312 		skb->local_df = 1;
1313 
1314 	/* DF bit is set when we want to see DF on outgoing frames.
1315 	 * If local_df is set too, we still allow this frame to be fragmented
1316 	 * locally. */
1317 	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1318 	    (skb->len <= dst_mtu(&rt->dst) &&
1319 	     ip_dont_fragment(sk, &rt->dst)))
1320 		df = htons(IP_DF);
1321 
1322 	if (cork->flags & IPCORK_OPT)
1323 		opt = cork->opt;
1324 
1325 	if (rt->rt_type == RTN_MULTICAST)
1326 		ttl = inet->mc_ttl;
1327 	else
1328 		ttl = ip_select_ttl(inet, &rt->dst);
1329 
1330 	iph = (struct iphdr *)skb->data;
1331 	iph->version = 4;
1332 	iph->ihl = 5;
1333 	iph->tos = inet->tos;
1334 	iph->frag_off = df;
1335 	ip_select_ident(iph, &rt->dst, sk);
1336 	iph->ttl = ttl;
1337 	iph->protocol = sk->sk_protocol;
1338 	iph->saddr = fl4->saddr;
1339 	iph->daddr = fl4->daddr;
1340 
1341 	if (opt) {
1342 		iph->ihl += opt->optlen>>2;
1343 		ip_options_build(skb, opt, cork->addr, rt, 0);
1344 	}
1345 
1346 	skb->priority = sk->sk_priority;
1347 	skb->mark = sk->sk_mark;
1348 	/*
1349 	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1350 	 * on dst refcount
1351 	 */
1352 	cork->dst = NULL;
1353 	skb_dst_set(skb, &rt->dst);
1354 
1355 	if (iph->protocol == IPPROTO_ICMP)
1356 		icmp_out_count(net, ((struct icmphdr *)
1357 			skb_transport_header(skb))->type);
1358 
1359 	ip_cork_release(cork);
1360 out:
1361 	return skb;
1362 }
1363 
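/*
 * Send a datagram assembled by __ip_make_skb(), translating positive
 * queueing return codes and accounting discarded packets.
 */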
1364 int ip_send_skb(struct sk_buff *skb)
1365 {
1366 	struct net *net = sock_net(skb->sk);
1367 	int err;
1368 
1369 	err = ip_local_out(skb);
1370 	if (err) {
1371 		if (err > 0)
1372 			err = net_xmit_errno(err);
1373 		if (err)
1374 			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1375 	}
1376 
1377 	return err;
1378 }
1379 
1380 int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1381 {
1382 	struct sk_buff *skb;
1383 
1384 	skb = ip_finish_skb(sk, fl4);
1385 	if (!skb)
1386 		return 0;
1387 
1388 	/* Netfilter gets the whole, not yet fragmented skb. */
1389 	return ip_send_skb(skb);
1390 }
1391 
1392 /*
1393  *	Throw away all pending data on the socket.
1394  */
1395 static void __ip_flush_pending_frames(struct sock *sk,
1396 				      struct sk_buff_head *queue,
1397 				      struct inet_cork *cork)
1398 {
1399 	struct sk_buff *skb;
1400 
1401 	while ((skb = __skb_dequeue_tail(queue)) != NULL)
1402 		kfree_skb(skb);
1403 
1404 	ip_cork_release(cork);
1405 }
1406 
1407 void ip_flush_pending_frames(struct sock *sk)
1408 {
1409 	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1410 }
1411 
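/*
 * Single-shot variant of ip_append_data()/ip_push_pending_frames(): cork on
 * a private queue, append the data and return the finished skb without
 * touching sk->sk_write_queue.
 */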
1412 struct sk_buff *ip_make_skb(struct sock *sk,
1413 			    struct flowi4 *fl4,
1414 			    int getfrag(void *from, char *to, int offset,
1415 					int len, int odd, struct sk_buff *skb),
1416 			    void *from, int length, int transhdrlen,
1417 			    struct ipcm_cookie *ipc, struct rtable **rtp,
1418 			    unsigned int flags)
1419 {
1420 	struct inet_cork cork;
1421 	struct sk_buff_head queue;
1422 	int err;
1423 
1424 	if (flags & MSG_PROBE)
1425 		return NULL;
1426 
1427 	__skb_queue_head_init(&queue);
1428 
1429 	cork.flags = 0;
1430 	cork.addr = 0;
1431 	cork.opt = NULL;
1432 	err = ip_setup_cork(sk, &cork, ipc, rtp);
1433 	if (err)
1434 		return ERR_PTR(err);
1435 
1436 	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1437 			       from, length, transhdrlen, flags);
1438 	if (err) {
1439 		__ip_flush_pending_frames(sk, &queue, &cork);
1440 		return ERR_PTR(err);
1441 	}
1442 
1443 	return __ip_make_skb(sk, fl4, &queue, &cork);
1444 }
1445 
1446 /*
1447  *	Fetch data from kernel space and fill in checksum if needed.
1448  */
1449 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1450 			      int len, int odd, struct sk_buff *skb)
1451 {
1452 	__wsum csum;
1453 
1454 	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1455 	skb->csum = csum_block_add(skb->csum, csum, odd);
1456 	return 0;
1457 }
1458 
1459 /*
1460  *	Generic function to send a packet as a reply to another packet.
1461  *	Used to send TCP resets so far. ICMP should use this function too.
1462  *
1463  *	Should run single-threaded per socket because it uses the sock
1464  *	structure to pass arguments.
1465  */
1466 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1467 		   struct ip_reply_arg *arg, unsigned int len)
1468 {
1469 	struct inet_sock *inet = inet_sk(sk);
1470 	struct ip_options_data replyopts;
1471 	struct ipcm_cookie ipc;
1472 	struct flowi4 fl4;
1473 	struct rtable *rt = skb_rtable(skb);
1474 
1475 	if (ip_options_echo(&replyopts.opt.opt, skb))
1476 		return;
1477 
1478 	ipc.addr = daddr;
1479 	ipc.opt = NULL;
1480 	ipc.tx_flags = 0;
1481 
1482 	if (replyopts.opt.opt.optlen) {
1483 		ipc.opt = &replyopts.opt;
1484 
1485 		if (replyopts.opt.opt.srr)
1486 			daddr = replyopts.opt.opt.faddr;
1487 	}
1488 
1489 	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1490 			   RT_TOS(ip_hdr(skb)->tos),
1491 			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
1492 			   ip_reply_arg_flowi_flags(arg),
1493 			   daddr, rt->rt_spec_dst,
1494 			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1495 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1496 	rt = ip_route_output_key(sock_net(sk), &fl4);
1497 	if (IS_ERR(rt))
1498 		return;
1499 
1500 	/* And let IP do all the hard work.
1501 
1502 	   This chunk is not reentrant, hence the spinlock.
1503 	   Note that it relies on the fact that this function is called
1504 	   with BHs locally disabled and that sk cannot already be spinlocked.
1505 	 */
1506 	bh_lock_sock(sk);
1507 	inet->tos = ip_hdr(skb)->tos;
1508 	sk->sk_priority = skb->priority;
1509 	sk->sk_protocol = ip_hdr(skb)->protocol;
1510 	sk->sk_bound_dev_if = arg->bound_dev_if;
1511 	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1512 		       &ipc, &rt, MSG_DONTWAIT);
1513 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1514 		if (arg->csumoffset >= 0)
1515 			*((__sum16 *)skb_transport_header(skb) +
1516 			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1517 								arg->csum));
1518 		skb->ip_summed = CHECKSUM_NONE;
1519 		ip_push_pending_frames(sk, &fl4);
1520 	}
1521 
1522 	bh_unlock_sock(sk);
1523 
1524 	ip_rt_put(rt);
1525 }
1526 
1527 void __init ip_init(void)
1528 {
1529 	ip_rt_init();
1530 	inet_initpeers();
1531 
1532 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1533 	igmp_mc_proc_init();
1534 #endif
1535 }
1536