xref: /linux/net/ipv4/tcp.c (revision 5e8d780d745c1619aba81fe7166c5a4b5cad2b84)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *		Florian La Roche, <flla@stud.uni-sb.de>
15  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *		Matthew Dillon, <dillon@apollo.west.oic.com>
19  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *		Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *		Alan Cox	:	Numerous verify_area() calls
24  *		Alan Cox	:	Set the ACK bit on a reset
25  *		Alan Cox	:	Stopped it crashing if it closed while
26  *					sk->inuse=1 and was trying to connect
27  *					(tcp_err()).
28  *		Alan Cox	:	All icmp error handling was broken
29  *					pointers passed where wrong and the
30  *					socket was looked up backwards. Nobody
31  *					tested any icmp error code obviously.
32  *		Alan Cox	:	tcp_err() now handled properly. It
33  *					wakes people on errors. poll
34  *					behaves and the icmp error race
35  *					has gone by moving it into sock.c
36  *		Alan Cox	:	tcp_send_reset() fixed to work for
37  *					everything not just packets for
38  *					unknown sockets.
39  *		Alan Cox	:	tcp option processing.
40  *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41  *					syn rule wrong]
42  *		Herp Rosmanith  :	More reset fixes
43  *		Alan Cox	:	No longer acks invalid rst frames.
44  *					Acking any kind of RST is right out.
45  *		Alan Cox	:	Sets an ignore me flag on an rst
46  *					receive otherwise odd bits of prattle
47  *					escape still
48  *		Alan Cox	:	Fixed another acking RST frame bug.
49  *					Should stop LAN workplace lockups.
50  *		Alan Cox	: 	Some tidyups using the new skb list
51  *					facilities
52  *		Alan Cox	:	sk->keepopen now seems to work
53  *		Alan Cox	:	Pulls options out correctly on accepts
54  *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55  *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56  *					bit to skb ops.
57  *		Alan Cox	:	Tidied tcp_data to avoid a potential
58  *					nasty.
59  *		Alan Cox	:	Added some better commenting, as the
60  *					tcp is hard to follow
61  *		Alan Cox	:	Removed incorrect check for 20 * psh
62  *	Michael O'Reilly	:	ack < copied bug fix.
63  *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64  *		Alan Cox	:	FIN with no memory -> CRASH
65  *		Alan Cox	:	Added socket option proto entries.
66  *					Also added awareness of them to accept.
67  *		Alan Cox	:	Added TCP options (SOL_TCP)
68  *		Alan Cox	:	Switched wakeup calls to callbacks,
69  *					so the kernel can layer network
70  *					sockets.
71  *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72  *		Alan Cox	:	Handle FIN (more) properly (we hope).
73  *		Alan Cox	:	RST frames sent on unsynchronised
74  *					state ack error.
75  *		Alan Cox	:	Put in missing check for SYN bit.
76  *		Alan Cox	:	Added tcp_select_window() aka NET2E
77  *					window non shrink trick.
78  *		Alan Cox	:	Added a couple of small NET2E timer
79  *					fixes
80  *		Charles Hedrick :	TCP fixes
81  *		Toomas Tamm	:	TCP window fixes
82  *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83  *		Charles Hedrick	:	Rewrote most of it to actually work
84  *		Linus		:	Rewrote tcp_read() and URG handling
85  *					completely
86  *		Gerhard Koerting:	Fixed some missing timer handling
87  *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88  *		Gerhard Koerting:	PC/TCP workarounds
89  *		Adam Caldwell	:	Assorted timer/timing errors
90  *		Matthew Dillon	:	Fixed another RST bug
91  *		Alan Cox	:	Move to kernel side addressing changes.
92  *		Alan Cox	:	Beginning work on TCP fastpathing
93  *					(not yet usable)
94  *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95  *		Alan Cox	:	TCP fast path debugging
96  *		Alan Cox	:	Window clamping
97  *		Michael Riepe	:	Bug in tcp_check()
98  *		Matt Dillon	:	More TCP improvements and RST bug fixes
99  *		Matt Dillon	:	Yet more small nasties removed from the
100  *					TCP code (Be very nice to this man if
101  *					tcp finally works 100%) 8)
102  *		Alan Cox	:	BSD accept semantics.
103  *		Alan Cox	:	Reset on closedown bug.
104  *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105  *		Michael Pall	:	Handle poll() after URG properly in
106  *					all cases.
107  *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108  *					(multi URG PUSH broke rlogin).
109  *		Michael Pall	:	Fix the multi URG PUSH problem in
110  *					tcp_readable(), poll() after URG
111  *					works now.
112  *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113  *					BSD api.
114  *		Alan Cox	:	Changed the semantics of sk->socket to
115  *					fix a race and a signal problem with
116  *					accept() and async I/O.
117  *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118  *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119  *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120  *					clients/servers which listen in on
121  *					fixed ports.
122  *		Alan Cox	:	Cleaned the above up and shrank it to
123  *					a sensible code size.
124  *		Alan Cox	:	Self connect lockup fix.
125  *		Alan Cox	:	No connect to multicast.
126  *		Ross Biro	:	Close unaccepted children on master
127  *					socket close.
128  *		Alan Cox	:	Reset tracing code.
129  *		Alan Cox	:	Spurious resets on shutdown.
130  *		Alan Cox	:	Giant 15 minute/60 second timer error
131  *		Alan Cox	:	Small whoops in polling before an
132  *					accept.
133  *		Alan Cox	:	Kept the state trace facility since
134  *					it's handy for debugging.
135  *		Alan Cox	:	More reset handler fixes.
136  *		Alan Cox	:	Started rewriting the code based on
137  *					the RFC's for other useful protocol
138  *					references see: Comer, KA9Q NOS, and
139  *					for a reference on the difference
140  *					between specifications and how BSD
141  *					works see the 4.4lite source.
142  *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143  *					close.
144  *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145  *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146  *		Alan Cox	:	Reimplemented timers as per the RFC
147  *					and using multiple timers for sanity.
148  *		Alan Cox	:	Small bug fixes, and a lot of new
149  *					comments.
150  *		Alan Cox	:	Fixed dual reader crash by locking
151  *					the buffers (much like datagram.c)
152  *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153  *					now gets fed up of retrying without
154  *					(even a no space) answer.
155  *		Alan Cox	:	Extracted closing code better
156  *		Alan Cox	:	Fixed the closing state machine to
157  *					resemble the RFC.
158  *		Alan Cox	:	More 'per spec' fixes.
159  *		Jorge Cwik	:	Even faster checksumming.
160  *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161  *					only frames. At least one pc tcp stack
162  *					generates them.
163  *		Alan Cox	:	Cache last socket.
164  *		Alan Cox	:	Per route irtt.
165  *		Matt Day	:	poll()->select() match BSD precisely on error
166  *		Alan Cox	:	New buffers
167  *		Marc Tamsky	:	Various sk->prot->retransmits and
168  *					sk->retransmits misupdating fixed.
169  *					Fixed tcp_write_timeout: stuck close,
170  *					and TCP syn retries gets used now.
171  *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172  *					ack if state is TCP_CLOSED.
173  *		Alan Cox	:	Look up device on a retransmit - routes may
174  *					change. Doesn't yet cope with MSS shrink right
175  *					but it's a start!
176  *		Marc Tamsky	:	Closing in closing fixes.
177  *		Mike Shaver	:	RFC1122 verifications.
178  *		Alan Cox	:	rcv_saddr errors.
179  *		Alan Cox	:	Block double connect().
180  *		Alan Cox	:	Small hooks for enSKIP.
181  *		Alexey Kuznetsov:	Path MTU discovery.
182  *		Alan Cox	:	Support soft errors.
183  *		Alan Cox	:	Fix MTU discovery pathological case
184  *					when the remote claims no mtu!
185  *		Marc Tamsky	:	TCP_CLOSE fix.
186  *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187  *					window but wrong (fixes NT lpd problems)
188  *		Pedro Roque	:	Better TCP window handling, delayed ack.
189  *		Joerg Reuter	:	No modification of locked buffers in
190  *					tcp_do_retransmit()
191  *		Eric Schenk	:	Changed receiver side silly window
192  *					avoidance algorithm to BSD style
193  *					algorithm. This doubles throughput
194  *					against machines running Solaris,
195  *					and seems to result in general
196  *					improvement.
197  *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198  *	Willy Konynenberg	:	Transparent proxying support.
199  *	Mike McLagan		:	Routing by source
200  *		Keith Owens	:	Do proper merging with partial SKB's in
201  *					tcp_do_sendmsg to avoid burstiness.
202  *		Eric Schenk	:	Fix fast close down bug with
203  *					shutdown() followed by close().
204  *		Andi Kleen 	:	Make poll agree with SIGIO
205  *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206  *					lingertime == 0 (RFC 793 ABORT Call)
207  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208  *					csum_and_copy_from_user() if possible.
209  *
210  *		This program is free software; you can redistribute it and/or
211  *		modify it under the terms of the GNU General Public License
212  *		as published by the Free Software Foundation; either version
213  *		2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *	TCP_SYN_SENT		sent a connection request, waiting for ack
218  *
219  *	TCP_SYN_RECV		received a connection request, sent ack,
220  *				waiting for final ack in three-way handshake.
221  *
222  *	TCP_ESTABLISHED		connection established
223  *
224  *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225  *				transmission of remaining buffered data
226  *
227  *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228  *				to shutdown
229  *
230  *	TCP_CLOSING		both sides have shutdown but we still have
231  *				data we have to finish sending
232  *
233  *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234  *				closed, can only be entered from FIN_WAIT2
235  *				or CLOSING.  Required because the other end
236  *				may not have gotten our last ACK causing it
237  *				to retransmit the data packet (which we ignore)
238  *
239  *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240  *				us to finish writing our data and to shutdown
241  *				(we have to close() to move on to LAST_ACK)
242  *
243  *	TCP_LAST_ACK		our side has shutdown after remote has
244  *				shutdown.  There may still be data in our
245  *				buffer that we have to finish sending
246  *
247  *	TCP_CLOSE		socket is finished
248  */
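/*
 * Worked example of the states above, assuming our side performs the active
 * close of an established connection:
 *
 *	TCP_ESTABLISHED -> TCP_FIN_WAIT1	(we send our FIN)
 *	TCP_FIN_WAIT1   -> TCP_FIN_WAIT2	(our FIN is acked)
 *	TCP_FIN_WAIT2   -> TCP_TIME_WAIT	(remote FIN arrives and is acked)
 *	TCP_TIME_WAIT   -> TCP_CLOSE		(time-wait timeout expires)
 *
 * while the passive side walks ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE.
 */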
249 
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260 #include <linux/cache.h>
261 #include <linux/err.h>
262 
263 #include <net/icmp.h>
264 #include <net/tcp.h>
265 #include <net/xfrm.h>
266 #include <net/ip.h>
267 #include <net/netdma.h>
268 
269 #include <asm/uaccess.h>
270 #include <asm/ioctls.h>
271 
272 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
273 
274 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
275 
276 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
277 
278 EXPORT_SYMBOL_GPL(tcp_orphan_count);
279 
280 int sysctl_tcp_mem[3] __read_mostly;
281 int sysctl_tcp_wmem[3] __read_mostly;
282 int sysctl_tcp_rmem[3] __read_mostly;
283 
284 EXPORT_SYMBOL(sysctl_tcp_mem);
285 EXPORT_SYMBOL(sysctl_tcp_rmem);
286 EXPORT_SYMBOL(sysctl_tcp_wmem);
287 
288 atomic_t tcp_memory_allocated;	/* Current allocated memory. */
289 atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
290 
291 EXPORT_SYMBOL(tcp_memory_allocated);
292 EXPORT_SYMBOL(tcp_sockets_allocated);
293 
294 /*
295  * Pressure flag: try to collapse.
296  * Technical note: it is used by multiple contexts non-atomically.
297  * All of sk_stream_mem_schedule() is of this nature: accounting
298  * is strict, actions are advisory and have some latency.
299  */
300 int tcp_memory_pressure;
301 
302 EXPORT_SYMBOL(tcp_memory_pressure);
303 
304 void tcp_enter_memory_pressure(void)
305 {
306 	if (!tcp_memory_pressure) {
307 		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
308 		tcp_memory_pressure = 1;
309 	}
310 }
311 
312 EXPORT_SYMBOL(tcp_enter_memory_pressure);
313 
314 /*
315  *	Wait for a TCP event.
316  *
317  *	Note that we don't need to lock the socket, as the upper poll layers
318  *	take care of normal races (between the test and the event) and we don't
319  *	go look at any of the socket buffers directly.
320  */
321 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
322 {
323 	unsigned int mask;
324 	struct sock *sk = sock->sk;
325 	struct tcp_sock *tp = tcp_sk(sk);
326 
327 	poll_wait(file, sk->sk_sleep, wait);
328 	if (sk->sk_state == TCP_LISTEN)
329 		return inet_csk_listen_poll(sk);
330 
331 	/* The socket is not locked. We are protected from async events
332 	   by the poll logic, and correct handling of state changes
333 	   made by other threads is impossible in any case.
334 	 */
335 
336 	mask = 0;
337 	if (sk->sk_err)
338 		mask = POLLERR;
339 
340 	/*
341 	 * POLLHUP is certainly not done right. But poll() doesn't
342 	 * have a notion of HUP in just one direction, and for a
343 	 * socket the read side is more interesting.
344 	 *
345 	 * Some poll() documentation says that POLLHUP is incompatible
346 	 * with the POLLOUT/POLLWRNORM flags, so somebody should check
347 	 * all of this. But be careful: it tends to be safer to return too many
348 	 * bits than too few, and you can easily break real applications
349 	 * if you don't tell them that something has hung up!
350 	 *
351 	 * Check-me.
352 	 *
353 	 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
354 	 * our fs/select.c). It means that after we have received EOF,
355 	 * poll always returns immediately, making it impossible to poll() for
356 	 * write() in state CLOSE_WAIT. One solution is evident --- set POLLHUP
357 	 * if and only if shutdown has been made in both directions.
358 	 * Actually, it is interesting to look at how Solaris and DUX
359 	 * solve this dilemma. I would prefer that, if POLLHUP were maskable,
360 	 * we could set it on SND_SHUTDOWN. BTW the examples given
361 	 * in Stevens' books assume exactly this behaviour, which explains
362 	 * why POLLHUP is incompatible with POLLOUT.	--ANK
363 	 *
364 	 * NOTE. A check for TCP_CLOSE is added. The goal is to prevent
365 	 * blocking on a fresh not-connected or disconnected socket. --ANK
366 	 */
367 	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
368 		mask |= POLLHUP;
369 	if (sk->sk_shutdown & RCV_SHUTDOWN)
370 		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
371 
372 	/* Connected? */
373 	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
374 		/* Potential race condition. If the read of tp below is
375 		 * reordered above the sk->sk_state check, we can be illegally
376 		 * awakened in SYN_* states. */
377 		if ((tp->rcv_nxt != tp->copied_seq) &&
378 		    (tp->urg_seq != tp->copied_seq ||
379 		     tp->rcv_nxt != tp->copied_seq + 1 ||
380 		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
381 			mask |= POLLIN | POLLRDNORM;
382 
383 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
384 			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
385 				mask |= POLLOUT | POLLWRNORM;
386 			} else {  /* send SIGIO later */
387 				set_bit(SOCK_ASYNC_NOSPACE,
388 					&sk->sk_socket->flags);
389 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
390 
391 				/* Race breaker. If space is freed after
392 				 * wspace test but before the flags are set,
393 				 * IO signal will be lost.
394 				 */
395 				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
396 					mask |= POLLOUT | POLLWRNORM;
397 			}
398 		}
399 
400 		if (tp->urg_data & TCP_URG_VALID)
401 			mask |= POLLPRI;
402 	}
403 	return mask;
404 }
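/*
 * A minimal usage sketch, assuming a hypothetical connected descriptor
 * "sock_fd" and hypothetical handlers: the mask computed above is what a
 * userspace caller sees in revents from poll(2).
 *
 *	struct pollfd pfd = {
 *		.fd	= sock_fd,
 *		.events	= POLLIN | POLLOUT | POLLRDHUP,
 *	};
 *
 *	if (poll(&pfd, 1, 1000) > 0) {
 *		if (pfd.revents & (POLLERR | POLLHUP))
 *			handle_error_or_hangup(sock_fd);
 *		else if (pfd.revents & POLLIN)
 *			handle_readable(sock_fd);
 *		else if (pfd.revents & POLLOUT)
 *			handle_writable(sock_fd);
 *	}
 */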
405 
406 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
407 {
408 	struct tcp_sock *tp = tcp_sk(sk);
409 	int answ;
410 
411 	switch (cmd) {
412 	case SIOCINQ:
413 		if (sk->sk_state == TCP_LISTEN)
414 			return -EINVAL;
415 
416 		lock_sock(sk);
417 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
418 			answ = 0;
419 		else if (sock_flag(sk, SOCK_URGINLINE) ||
420 			 !tp->urg_data ||
421 			 before(tp->urg_seq, tp->copied_seq) ||
422 			 !before(tp->urg_seq, tp->rcv_nxt)) {
423 			answ = tp->rcv_nxt - tp->copied_seq;
424 
425 			/* Subtract 1, if FIN is in queue. */
426 			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
427 				answ -=
428 		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
429 		} else
430 			answ = tp->urg_seq - tp->copied_seq;
431 		release_sock(sk);
432 		break;
433 	case SIOCATMARK:
434 		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
435 		break;
436 	case SIOCOUTQ:
437 		if (sk->sk_state == TCP_LISTEN)
438 			return -EINVAL;
439 
440 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
441 			answ = 0;
442 		else
443 			answ = tp->write_seq - tp->snd_una;
444 		break;
445 	default:
446 		return -ENOIOCTLCMD;
447 	};
448 
449 	return put_user(answ, (int __user *)arg);
450 }
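/*
 * A minimal usage sketch, assuming a hypothetical connected descriptor
 * "sock_fd": SIOCINQ and SIOCOUTQ are issued from userspace via ioctl(2)
 * and map to the cases above.
 *
 *	int unread, unacked;
 *
 *	ioctl(sock_fd, SIOCINQ, &unread);	received bytes not yet read
 *	ioctl(sock_fd, SIOCOUTQ, &unacked);	queued bytes not yet acked
 */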
451 
452 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
453 {
454 	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
455 	tp->pushed_seq = tp->write_seq;
456 }
457 
458 static inline int forced_push(struct tcp_sock *tp)
459 {
460 	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
461 }
462 
463 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
464 			      struct sk_buff *skb)
465 {
466 	skb->csum = 0;
467 	TCP_SKB_CB(skb)->seq = tp->write_seq;
468 	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
469 	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
470 	TCP_SKB_CB(skb)->sacked = 0;
471 	skb_header_release(skb);
472 	__skb_queue_tail(&sk->sk_write_queue, skb);
473 	sk_charge_skb(sk, skb);
474 	if (!sk->sk_send_head)
475 		sk->sk_send_head = skb;
476 	if (tp->nonagle & TCP_NAGLE_PUSH)
477 		tp->nonagle &= ~TCP_NAGLE_PUSH;
478 }
479 
480 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
481 				struct sk_buff *skb)
482 {
483 	if (flags & MSG_OOB) {
484 		tp->urg_mode = 1;
485 		tp->snd_up = tp->write_seq;
486 		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
487 	}
488 }
489 
490 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
491 			    int mss_now, int nonagle)
492 {
493 	if (sk->sk_send_head) {
494 		struct sk_buff *skb = sk->sk_write_queue.prev;
495 		if (!(flags & MSG_MORE) || forced_push(tp))
496 			tcp_mark_push(tp, skb);
497 		tcp_mark_urg(tp, flags, skb);
498 		__tcp_push_pending_frames(sk, tp, mss_now,
499 					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
500 	}
501 }
502 
503 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
504 			 size_t psize, int flags)
505 {
506 	struct tcp_sock *tp = tcp_sk(sk);
507 	int mss_now, size_goal;
508 	int err;
509 	ssize_t copied;
510 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
511 
512 	/* Wait for a connection to finish. */
513 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
514 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
515 			goto out_err;
516 
517 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
518 
519 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
520 	size_goal = tp->xmit_size_goal;
521 	copied = 0;
522 
523 	err = -EPIPE;
524 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
525 		goto do_error;
526 
527 	while (psize > 0) {
528 		struct sk_buff *skb = sk->sk_write_queue.prev;
529 		struct page *page = pages[poffset / PAGE_SIZE];
530 		int copy, i, can_coalesce;
531 		int offset = poffset % PAGE_SIZE;
532 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
533 
534 		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
535 new_segment:
536 			if (!sk_stream_memory_free(sk))
537 				goto wait_for_sndbuf;
538 
539 			skb = sk_stream_alloc_pskb(sk, 0, 0,
540 						   sk->sk_allocation);
541 			if (!skb)
542 				goto wait_for_memory;
543 
544 			skb_entail(sk, tp, skb);
545 			copy = size_goal;
546 		}
547 
548 		if (copy > size)
549 			copy = size;
550 
551 		i = skb_shinfo(skb)->nr_frags;
552 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
553 		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
554 			tcp_mark_push(tp, skb);
555 			goto new_segment;
556 		}
557 		if (!sk_stream_wmem_schedule(sk, copy))
558 			goto wait_for_memory;
559 
560 		if (can_coalesce) {
561 			skb_shinfo(skb)->frags[i - 1].size += copy;
562 		} else {
563 			get_page(page);
564 			skb_fill_page_desc(skb, i, page, offset, copy);
565 		}
566 
567 		skb->len += copy;
568 		skb->data_len += copy;
569 		skb->truesize += copy;
570 		sk->sk_wmem_queued += copy;
571 		sk->sk_forward_alloc -= copy;
572 		skb->ip_summed = CHECKSUM_HW;
573 		tp->write_seq += copy;
574 		TCP_SKB_CB(skb)->end_seq += copy;
575 		skb_shinfo(skb)->gso_segs = 0;
576 
577 		if (!copied)
578 			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
579 
580 		copied += copy;
581 		poffset += copy;
582 		if (!(psize -= copy))
583 			goto out;
584 
585 		if (skb->len < mss_now || (flags & MSG_OOB))
586 			continue;
587 
588 		if (forced_push(tp)) {
589 			tcp_mark_push(tp, skb);
590 			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
591 		} else if (skb == sk->sk_send_head)
592 			tcp_push_one(sk, mss_now);
593 		continue;
594 
595 wait_for_sndbuf:
596 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
597 wait_for_memory:
598 		if (copied)
599 			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
600 
601 		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
602 			goto do_error;
603 
604 		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
605 		size_goal = tp->xmit_size_goal;
606 	}
607 
608 out:
609 	if (copied)
610 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
611 	return copied;
612 
613 do_error:
614 	if (copied)
615 		goto out;
616 out_err:
617 	return sk_stream_error(sk, flags, err);
618 }
619 
620 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
621 		     size_t size, int flags)
622 {
623 	ssize_t res;
624 	struct sock *sk = sock->sk;
625 
626 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
627 	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
628 		return sock_no_sendpage(sock, page, offset, size, flags);
629 
630 	lock_sock(sk);
631 	TCP_CHECK_TIMER(sk);
632 	res = do_tcp_sendpages(sk, &page, offset, size, flags);
633 	TCP_CHECK_TIMER(sk);
634 	release_sock(sk);
635 	return res;
636 }
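/*
 * A usage sketch, assuming hypothetical descriptors "sock_fd" and "file_fd"
 * and a known "file_size": a sendfile(2) over a TCP socket reaches this
 * sendpage path when the route supports SG and checksum offload, and falls
 * back to sock_no_sendpage() otherwise.
 *
 *	off_t off = 0;
 *	ssize_t sent = sendfile(sock_fd, file_fd, &off, file_size);
 */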
637 
638 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
639 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
640 
641 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
642 {
643 	int tmp = tp->mss_cache;
644 
645 	if (sk->sk_route_caps & NETIF_F_SG) {
646 		if (sk->sk_route_caps & NETIF_F_TSO)
647 			tmp = 0;
648 		else {
649 			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
650 
651 			if (tmp >= pgbreak &&
652 			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
653 				tmp = pgbreak;
654 		}
655 	}
656 
657 	return tmp;
658 }
659 
660 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
661 		size_t size)
662 {
663 	struct iovec *iov;
664 	struct tcp_sock *tp = tcp_sk(sk);
665 	struct sk_buff *skb;
666 	int iovlen, flags;
667 	int mss_now, size_goal;
668 	int err, copied;
669 	long timeo;
670 
671 	lock_sock(sk);
672 	TCP_CHECK_TIMER(sk);
673 
674 	flags = msg->msg_flags;
675 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
676 
677 	/* Wait for a connection to finish. */
678 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
679 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
680 			goto out_err;
681 
682 	/* This should be in poll */
683 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
684 
685 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
686 	size_goal = tp->xmit_size_goal;
687 
688 	/* Ok commence sending. */
689 	iovlen = msg->msg_iovlen;
690 	iov = msg->msg_iov;
691 	copied = 0;
692 
693 	err = -EPIPE;
694 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
695 		goto do_error;
696 
697 	while (--iovlen >= 0) {
698 		int seglen = iov->iov_len;
699 		unsigned char __user *from = iov->iov_base;
700 
701 		iov++;
702 
703 		while (seglen > 0) {
704 			int copy;
705 
706 			skb = sk->sk_write_queue.prev;
707 
708 			if (!sk->sk_send_head ||
709 			    (copy = size_goal - skb->len) <= 0) {
710 
711 new_segment:
712 				/* Allocate a new segment. If the interface is SG,
713 				 * allocate an skb that fits in a single page.
714 				 */
715 				if (!sk_stream_memory_free(sk))
716 					goto wait_for_sndbuf;
717 
718 				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
719 							   0, sk->sk_allocation);
720 				if (!skb)
721 					goto wait_for_memory;
722 
723 				/*
724 				 * Check whether we can use HW checksum.
725 				 */
726 				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
727 					skb->ip_summed = CHECKSUM_HW;
728 
729 				skb_entail(sk, tp, skb);
730 				copy = size_goal;
731 			}
732 
733 			/* Try to append data to the end of skb. */
734 			if (copy > seglen)
735 				copy = seglen;
736 
737 			/* Where to copy to? */
738 			if (skb_tailroom(skb) > 0) {
739 				/* We have some space in skb head. Superb! */
740 				if (copy > skb_tailroom(skb))
741 					copy = skb_tailroom(skb);
742 				if ((err = skb_add_data(skb, from, copy)) != 0)
743 					goto do_fault;
744 			} else {
745 				int merge = 0;
746 				int i = skb_shinfo(skb)->nr_frags;
747 				struct page *page = TCP_PAGE(sk);
748 				int off = TCP_OFF(sk);
749 
750 				if (skb_can_coalesce(skb, i, page, off) &&
751 				    off != PAGE_SIZE) {
752 					/* We can extend the last page
753 					 * fragment. */
754 					merge = 1;
755 				} else if (i == MAX_SKB_FRAGS ||
756 					   (!i &&
757 					   !(sk->sk_route_caps & NETIF_F_SG))) {
758 					/* Need to add new fragment and cannot
759 					 * do this because interface is non-SG,
760 					 * or because all the page slots are
761 					 * busy. */
762 					tcp_mark_push(tp, skb);
763 					goto new_segment;
764 				} else if (page) {
765 					if (off == PAGE_SIZE) {
766 						put_page(page);
767 						TCP_PAGE(sk) = page = NULL;
768 						off = 0;
769 					}
770 				} else
771 					off = 0;
772 
773 				if (copy > PAGE_SIZE - off)
774 					copy = PAGE_SIZE - off;
775 
776 				if (!sk_stream_wmem_schedule(sk, copy))
777 					goto wait_for_memory;
778 
779 				if (!page) {
780 					/* Allocate new cache page. */
781 					if (!(page = sk_stream_alloc_page(sk)))
782 						goto wait_for_memory;
783 				}
784 
785 				/* Time to copy data. We are close to
786 				 * the end! */
787 				err = skb_copy_to_page(sk, from, skb, page,
788 						       off, copy);
789 				if (err) {
790 					/* If this page was new, give it to the
791 					 * socket so it does not get leaked.
792 					 */
793 					if (!TCP_PAGE(sk)) {
794 						TCP_PAGE(sk) = page;
795 						TCP_OFF(sk) = 0;
796 					}
797 					goto do_error;
798 				}
799 
800 				/* Update the skb. */
801 				if (merge) {
802 					skb_shinfo(skb)->frags[i - 1].size +=
803 									copy;
804 				} else {
805 					skb_fill_page_desc(skb, i, page, off, copy);
806 					if (TCP_PAGE(sk)) {
807 						get_page(page);
808 					} else if (off + copy < PAGE_SIZE) {
809 						get_page(page);
810 						TCP_PAGE(sk) = page;
811 					}
812 				}
813 
814 				TCP_OFF(sk) = off + copy;
815 			}
816 
817 			if (!copied)
818 				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
819 
820 			tp->write_seq += copy;
821 			TCP_SKB_CB(skb)->end_seq += copy;
822 			skb_shinfo(skb)->gso_segs = 0;
823 
824 			from += copy;
825 			copied += copy;
826 			if ((seglen -= copy) == 0 && iovlen == 0)
827 				goto out;
828 
829 			if (skb->len < mss_now || (flags & MSG_OOB))
830 				continue;
831 
832 			if (forced_push(tp)) {
833 				tcp_mark_push(tp, skb);
834 				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
835 			} else if (skb == sk->sk_send_head)
836 				tcp_push_one(sk, mss_now);
837 			continue;
838 
839 wait_for_sndbuf:
840 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
841 wait_for_memory:
842 			if (copied)
843 				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
844 
845 			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
846 				goto do_error;
847 
848 			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
849 			size_goal = tp->xmit_size_goal;
850 		}
851 	}
852 
853 out:
854 	if (copied)
855 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
856 	TCP_CHECK_TIMER(sk);
857 	release_sock(sk);
858 	return copied;
859 
860 do_fault:
861 	if (!skb->len) {
862 		if (sk->sk_send_head == skb)
863 			sk->sk_send_head = NULL;
864 		__skb_unlink(skb, &sk->sk_write_queue);
865 		sk_stream_free_skb(sk, skb);
866 	}
867 
868 do_error:
869 	if (copied)
870 		goto out;
871 out_err:
872 	err = sk_stream_error(sk, flags, err);
873 	TCP_CHECK_TIMER(sk);
874 	release_sock(sk);
875 	return err;
876 }
877 
878 /*
879  *	Handle reading urgent data. BSD has very simple semantics for
880  *	this, no blocking and very strange errors 8)
881  */
882 
883 static int tcp_recv_urg(struct sock *sk, long timeo,
884 			struct msghdr *msg, int len, int flags,
885 			int *addr_len)
886 {
887 	struct tcp_sock *tp = tcp_sk(sk);
888 
889 	/* No URG data to read. */
890 	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
891 	    tp->urg_data == TCP_URG_READ)
892 		return -EINVAL;	/* Yes this is right ! */
893 
894 	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
895 		return -ENOTCONN;
896 
897 	if (tp->urg_data & TCP_URG_VALID) {
898 		int err = 0;
899 		char c = tp->urg_data;
900 
901 		if (!(flags & MSG_PEEK))
902 			tp->urg_data = TCP_URG_READ;
903 
904 		/* Read urgent data. */
905 		msg->msg_flags |= MSG_OOB;
906 
907 		if (len > 0) {
908 			if (!(flags & MSG_TRUNC))
909 				err = memcpy_toiovec(msg->msg_iov, &c, 1);
910 			len = 1;
911 		} else
912 			msg->msg_flags |= MSG_TRUNC;
913 
914 		return err ? -EFAULT : len;
915 	}
916 
917 	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
918 		return 0;
919 
920 	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
921 	 * the available implementations agree in this case:
922 	 * this call should never block, independent of the
923 	 * blocking state of the socket.
924 	 * Mike <pall@rz.uni-karlsruhe.de>
925 	 */
926 	return -EAGAIN;
927 }
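/*
 * A minimal usage sketch, assuming a hypothetical connected descriptor
 * "sock_fd" without SO_OOBINLINE set: the single urgent byte is fetched out
 * of band and, per the BSD semantics noted above, the call never blocks.
 *
 *	char oob;
 *	ssize_t n = recv(sock_fd, &oob, 1, MSG_OOB);
 *
 *	n == 1 means the urgent byte was read; n < 0 with errno EAGAIN means
 *	no urgent data is pending right now.
 */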
928 
929 /* Clean up the receive buffer for full frames taken by the user,
930  * then send an ACK if necessary.  COPIED is the number of bytes
931  * tcp_recvmsg has given to the user so far; it speeds up the
932  * calculation of whether or not we must ACK for the sake of
933  * a window update.
934  */
935 void tcp_cleanup_rbuf(struct sock *sk, int copied)
936 {
937 	struct tcp_sock *tp = tcp_sk(sk);
938 	int time_to_ack = 0;
939 
940 #if TCP_DEBUG
941 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
942 
943 	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
944 #endif
945 
946 	if (inet_csk_ack_scheduled(sk)) {
947 		const struct inet_connection_sock *icsk = inet_csk(sk);
948 		   /* Delayed ACKs frequently hit locked sockets during bulk
949 		    * receive. */
950 		if (icsk->icsk_ack.blocked ||
951 		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
952 		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
953 		    /*
954 		     * If this read emptied the read buffer, we send an ACK if
955 		     * the connection is not bidirectional, the user drained the
956 		     * receive buffer, and there was a small segment
957 		     * in the queue.
958 		     */
959 		    (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
960 		     !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
961 			time_to_ack = 1;
962 	}
963 
964 	/* We send an ACK if we can now advertise a non-zero window
965 	 * which has been raised "significantly".
966 	 *
967 	 * Even if the window is raised up to infinity, do not send a window-open ACK
968 	 * in states where we will not receive more data. It is useless.
969 	 */
970 	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
971 		__u32 rcv_window_now = tcp_receive_window(tp);
972 
973 		/* Optimize, __tcp_select_window() is not cheap. */
974 		if (2*rcv_window_now <= tp->window_clamp) {
975 			__u32 new_window = __tcp_select_window(sk);
976 
977 			/* Send an ACK now if this read freed lots of space
978 			 * in our buffer. new_window is, of course, the new window.
979 			 * We can advertise it now if it is not less than the current one.
980 			 * "Lots" means "at least twice" here.
981 			 */
982 			if (new_window && new_window >= 2 * rcv_window_now)
983 				time_to_ack = 1;
984 		}
985 	}
986 	if (time_to_ack)
987 		tcp_send_ack(sk);
988 }
989 
990 static void tcp_prequeue_process(struct sock *sk)
991 {
992 	struct sk_buff *skb;
993 	struct tcp_sock *tp = tcp_sk(sk);
994 
995 	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
996 
997 	/* The RX process wants to run with BHs disabled, though it is not
998 	 * necessary */
999 	local_bh_disable();
1000 	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1001 		sk->sk_backlog_rcv(sk, skb);
1002 	local_bh_enable();
1003 
1004 	/* Clear memory counter. */
1005 	tp->ucopy.memory = 0;
1006 }
1007 
1008 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1009 {
1010 	struct sk_buff *skb;
1011 	u32 offset;
1012 
1013 	skb_queue_walk(&sk->sk_receive_queue, skb) {
1014 		offset = seq - TCP_SKB_CB(skb)->seq;
1015 		if (skb->h.th->syn)
1016 			offset--;
1017 		if (offset < skb->len || skb->h.th->fin) {
1018 			*off = offset;
1019 			return skb;
1020 		}
1021 	}
1022 	return NULL;
1023 }
1024 
1025 /*
1026  * This routine provides an alternative to tcp_recvmsg() for routines
1027  * that would like to handle copying from skbuffs directly in 'sendfile'
1028  * fashion.
1029  * Note:
1030  *	- It is assumed that the socket was locked by the caller.
1031  *	- The routine does not block.
1032  *	- At present, there is no support for reading OOB data
1033  *	  or for 'peeking' the socket using this routine
1034  *	  (although both would be easy to implement).
1035  */
1036 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1037 		  sk_read_actor_t recv_actor)
1038 {
1039 	struct sk_buff *skb;
1040 	struct tcp_sock *tp = tcp_sk(sk);
1041 	u32 seq = tp->copied_seq;
1042 	u32 offset;
1043 	int copied = 0;
1044 
1045 	if (sk->sk_state == TCP_LISTEN)
1046 		return -ENOTCONN;
1047 	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1048 		if (offset < skb->len) {
1049 			size_t used, len;
1050 
1051 			len = skb->len - offset;
1052 			/* Stop reading if we hit a patch of urgent data */
1053 			if (tp->urg_data) {
1054 				u32 urg_offset = tp->urg_seq - seq;
1055 				if (urg_offset < len)
1056 					len = urg_offset;
1057 				if (!len)
1058 					break;
1059 			}
1060 			used = recv_actor(desc, skb, offset, len);
1061 			if (used <= len) {
1062 				seq += used;
1063 				copied += used;
1064 				offset += used;
1065 			}
1066 			if (offset != skb->len)
1067 				break;
1068 		}
1069 		if (skb->h.th->fin) {
1070 			sk_eat_skb(sk, skb, 0);
1071 			++seq;
1072 			break;
1073 		}
1074 		sk_eat_skb(sk, skb, 0);
1075 		if (!desc->count)
1076 			break;
1077 	}
1078 	tp->copied_seq = seq;
1079 
1080 	tcp_rcv_space_adjust(sk);
1081 
1082 	/* Clean up data we have read: This will do ACK frames. */
1083 	if (copied)
1084 		tcp_cleanup_rbuf(sk, copied);
1085 	return copied;
1086 }
1087 
1088 /*
1089  *	This routine copies from a sock struct into the user buffer.
1090  *
1091  *	Technical note: in 2.3 we work on a _locked_ socket, so that
1092  *	tricks with *seq access order and skb->users are not required.
1093  *	The code can probably be improved even further.
1094  */
1095 
1096 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1097 		size_t len, int nonblock, int flags, int *addr_len)
1098 {
1099 	struct tcp_sock *tp = tcp_sk(sk);
1100 	int copied = 0;
1101 	u32 peek_seq;
1102 	u32 *seq;
1103 	unsigned long used;
1104 	int err;
1105 	int target;		/* Read at least this many bytes */
1106 	long timeo;
1107 	struct task_struct *user_recv = NULL;
1108 	int copied_early = 0;
1109 
1110 	lock_sock(sk);
1111 
1112 	TCP_CHECK_TIMER(sk);
1113 
1114 	err = -ENOTCONN;
1115 	if (sk->sk_state == TCP_LISTEN)
1116 		goto out;
1117 
1118 	timeo = sock_rcvtimeo(sk, nonblock);
1119 
1120 	/* Urgent data needs to be handled specially. */
1121 	if (flags & MSG_OOB)
1122 		goto recv_urg;
1123 
1124 	seq = &tp->copied_seq;
1125 	if (flags & MSG_PEEK) {
1126 		peek_seq = tp->copied_seq;
1127 		seq = &peek_seq;
1128 	}
1129 
1130 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1131 
1132 #ifdef CONFIG_NET_DMA
1133 	tp->ucopy.dma_chan = NULL;
1134 	preempt_disable();
1135 	if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1136 	    !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) {
1137 		preempt_enable_no_resched();
1138 		tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
1139 	} else
1140 		preempt_enable_no_resched();
1141 #endif
1142 
1143 	do {
1144 		struct sk_buff *skb;
1145 		u32 offset;
1146 
1147 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1148 		if (tp->urg_data && tp->urg_seq == *seq) {
1149 			if (copied)
1150 				break;
1151 			if (signal_pending(current)) {
1152 				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1153 				break;
1154 			}
1155 		}
1156 
1157 		/* Next get a buffer. */
1158 
1159 		skb = skb_peek(&sk->sk_receive_queue);
1160 		do {
1161 			if (!skb)
1162 				break;
1163 
1164 			/* Now that we have two receive queues this
1165 			 * shouldn't happen.
1166 			 */
1167 			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1168 				printk(KERN_INFO "recvmsg bug: copied %X "
1169 				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1170 				break;
1171 			}
1172 			offset = *seq - TCP_SKB_CB(skb)->seq;
1173 			if (skb->h.th->syn)
1174 				offset--;
1175 			if (offset < skb->len)
1176 				goto found_ok_skb;
1177 			if (skb->h.th->fin)
1178 				goto found_fin_ok;
1179 			BUG_TRAP(flags & MSG_PEEK);
1180 			skb = skb->next;
1181 		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1182 
1183 		/* Well, if we have backlog, try to process it now. */
1184 
1185 		if (copied >= target && !sk->sk_backlog.tail)
1186 			break;
1187 
1188 		if (copied) {
1189 			if (sk->sk_err ||
1190 			    sk->sk_state == TCP_CLOSE ||
1191 			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1192 			    !timeo ||
1193 			    signal_pending(current) ||
1194 			    (flags & MSG_PEEK))
1195 				break;
1196 		} else {
1197 			if (sock_flag(sk, SOCK_DONE))
1198 				break;
1199 
1200 			if (sk->sk_err) {
1201 				copied = sock_error(sk);
1202 				break;
1203 			}
1204 
1205 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1206 				break;
1207 
1208 			if (sk->sk_state == TCP_CLOSE) {
1209 				if (!sock_flag(sk, SOCK_DONE)) {
1210 					/* This occurs when the user tries to read
1211 					 * from a never-connected socket.
1212 					 */
1213 					copied = -ENOTCONN;
1214 					break;
1215 				}
1216 				break;
1217 			}
1218 
1219 			if (!timeo) {
1220 				copied = -EAGAIN;
1221 				break;
1222 			}
1223 
1224 			if (signal_pending(current)) {
1225 				copied = sock_intr_errno(timeo);
1226 				break;
1227 			}
1228 		}
1229 
1230 		tcp_cleanup_rbuf(sk, copied);
1231 
1232 		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1233 			/* Install new reader */
1234 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1235 				user_recv = current;
1236 				tp->ucopy.task = user_recv;
1237 				tp->ucopy.iov = msg->msg_iov;
1238 			}
1239 
1240 			tp->ucopy.len = len;
1241 
1242 			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1243 				 (flags & (MSG_PEEK | MSG_TRUNC)));
1244 
1245 			/* Ugly... If the prequeue is not empty, we have to
1246 			 * process it before releasing the socket; otherwise
1247 			 * the order will be broken on the second iteration.
1248 			 * A more elegant solution is required!!!
1249 			 *
1250 			 * Look: we have the following (pseudo)queues:
1251 			 *
1252 			 * 1. packets in flight
1253 			 * 2. backlog
1254 			 * 3. prequeue
1255 			 * 4. receive_queue
1256 			 *
1257 			 * Each queue can be processed only if the next ones
1258 			 * are empty. At this point we have an empty receive_queue.
1259 			 * But the prequeue _can_ be non-empty after the 2nd iteration,
1260 			 * when we jumped to the start of the loop because backlog
1261 			 * processing added something to the receive_queue.
1262 			 * We cannot release_sock(), because the backlog contains
1263 			 * packets that arrived _after_ the prequeued ones.
1264 			 *
1265 			 * In short, the algorithm is clear --- process all
1266 			 * the queues in order. We could do it more directly,
1267 			 * requeueing packets from the backlog to the prequeue if
1268 			 * it is not empty. That is more elegant, but eats cycles,
1269 			 * unfortunately.
1270 			 */
1271 			if (!skb_queue_empty(&tp->ucopy.prequeue))
1272 				goto do_prequeue;
1273 
1274 			/* __ Set realtime policy in scheduler __ */
1275 		}
1276 
1277 		if (copied >= target) {
1278 			/* Do not sleep, just process backlog. */
1279 			release_sock(sk);
1280 			lock_sock(sk);
1281 		} else
1282 			sk_wait_data(sk, &timeo);
1283 
1284 #ifdef CONFIG_NET_DMA
1285 		tp->ucopy.wakeup = 0;
1286 #endif
1287 
1288 		if (user_recv) {
1289 			int chunk;
1290 
1291 			/* __ Restore normal policy in scheduler __ */
1292 
1293 			if ((chunk = len - tp->ucopy.len) != 0) {
1294 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1295 				len -= chunk;
1296 				copied += chunk;
1297 			}
1298 
1299 			if (tp->rcv_nxt == tp->copied_seq &&
1300 			    !skb_queue_empty(&tp->ucopy.prequeue)) {
1301 do_prequeue:
1302 				tcp_prequeue_process(sk);
1303 
1304 				if ((chunk = len - tp->ucopy.len) != 0) {
1305 					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1306 					len -= chunk;
1307 					copied += chunk;
1308 				}
1309 			}
1310 		}
1311 		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1312 			if (net_ratelimit())
1313 				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1314 				       current->comm, current->pid);
1315 			peek_seq = tp->copied_seq;
1316 		}
1317 		continue;
1318 
1319 	found_ok_skb:
1320 		/* Ok so how much can we use? */
1321 		used = skb->len - offset;
1322 		if (len < used)
1323 			used = len;
1324 
1325 		/* Do we have urgent data here? */
1326 		if (tp->urg_data) {
1327 			u32 urg_offset = tp->urg_seq - *seq;
1328 			if (urg_offset < used) {
1329 				if (!urg_offset) {
1330 					if (!sock_flag(sk, SOCK_URGINLINE)) {
1331 						++*seq;
1332 						offset++;
1333 						used--;
1334 						if (!used)
1335 							goto skip_copy;
1336 					}
1337 				} else
1338 					used = urg_offset;
1339 			}
1340 		}
1341 
1342 		if (!(flags & MSG_TRUNC)) {
1343 #ifdef CONFIG_NET_DMA
1344 			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1345 				tp->ucopy.dma_chan = get_softnet_dma();
1346 
1347 			if (tp->ucopy.dma_chan) {
1348 				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1349 					tp->ucopy.dma_chan, skb, offset,
1350 					msg->msg_iov, used,
1351 					tp->ucopy.pinned_list);
1352 
1353 				if (tp->ucopy.dma_cookie < 0) {
1354 
1355 					printk(KERN_ALERT "dma_cookie < 0\n");
1356 
1357 					/* Exception. Bailout! */
1358 					if (!copied)
1359 						copied = -EFAULT;
1360 					break;
1361 				}
1362 				if ((offset + used) == skb->len)
1363 					copied_early = 1;
1364 
1365 			} else
1366 #endif
1367 			{
1368 				err = skb_copy_datagram_iovec(skb, offset,
1369 						msg->msg_iov, used);
1370 				if (err) {
1371 					/* Exception. Bailout! */
1372 					if (!copied)
1373 						copied = -EFAULT;
1374 					break;
1375 				}
1376 			}
1377 		}
1378 
1379 		*seq += used;
1380 		copied += used;
1381 		len -= used;
1382 
1383 		tcp_rcv_space_adjust(sk);
1384 
1385 skip_copy:
1386 		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1387 			tp->urg_data = 0;
1388 			tcp_fast_path_check(sk, tp);
1389 		}
1390 		if (used + offset < skb->len)
1391 			continue;
1392 
1393 		if (skb->h.th->fin)
1394 			goto found_fin_ok;
1395 		if (!(flags & MSG_PEEK)) {
1396 			sk_eat_skb(sk, skb, copied_early);
1397 			copied_early = 0;
1398 		}
1399 		continue;
1400 
1401 	found_fin_ok:
1402 		/* Process the FIN. */
1403 		++*seq;
1404 		if (!(flags & MSG_PEEK)) {
1405 			sk_eat_skb(sk, skb, copied_early);
1406 			copied_early = 0;
1407 		}
1408 		break;
1409 	} while (len > 0);
1410 
1411 	if (user_recv) {
1412 		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1413 			int chunk;
1414 
1415 			tp->ucopy.len = copied > 0 ? len : 0;
1416 
1417 			tcp_prequeue_process(sk);
1418 
1419 			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1420 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1421 				len -= chunk;
1422 				copied += chunk;
1423 			}
1424 		}
1425 
1426 		tp->ucopy.task = NULL;
1427 		tp->ucopy.len = 0;
1428 	}
1429 
1430 #ifdef CONFIG_NET_DMA
1431 	if (tp->ucopy.dma_chan) {
1432 		struct sk_buff *skb;
1433 		dma_cookie_t done, used;
1434 
1435 		dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1436 
1437 		while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1438 		                                 tp->ucopy.dma_cookie, &done,
1439 		                                 &used) == DMA_IN_PROGRESS) {
1440 			/* do partial cleanup of sk_async_wait_queue */
1441 			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1442 			       (dma_async_is_complete(skb->dma_cookie, done,
1443 			                              used) == DMA_SUCCESS)) {
1444 				__skb_dequeue(&sk->sk_async_wait_queue);
1445 				kfree_skb(skb);
1446 			}
1447 		}
1448 
1449 		/* Safe to free early-copied skbs now */
1450 		__skb_queue_purge(&sk->sk_async_wait_queue);
1451 		dma_chan_put(tp->ucopy.dma_chan);
1452 		tp->ucopy.dma_chan = NULL;
1453 	}
1454 	if (tp->ucopy.pinned_list) {
1455 		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1456 		tp->ucopy.pinned_list = NULL;
1457 	}
1458 #endif
1459 
1460 	/* According to UNIX98, msg_name/msg_namelen are ignored
1461 	 * on a connected socket. I was just happy when I found this 8) --ANK
1462 	 */
1463 
1464 	/* Clean up data we have read: This will do ACK frames. */
1465 	tcp_cleanup_rbuf(sk, copied);
1466 
1467 	TCP_CHECK_TIMER(sk);
1468 	release_sock(sk);
1469 	return copied;
1470 
1471 out:
1472 	TCP_CHECK_TIMER(sk);
1473 	release_sock(sk);
1474 	return err;
1475 
1476 recv_urg:
1477 	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1478 	goto out;
1479 }
1480 
1481 /*
1482  *	State processing on a close. This implements the state shift for
1483  *	sending our FIN frame. Note that we only send a FIN for some
1484  *	states. A shutdown() may have already sent the FIN, or we may be
1485  *	closed.
1486  */
1487 
1488 static const unsigned char new_state[16] = {
1489   /* current state:        new state:      action:	*/
1490   /* (Invalid)		*/ TCP_CLOSE,
1491   /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1492   /* TCP_SYN_SENT	*/ TCP_CLOSE,
1493   /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1494   /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1495   /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1496   /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1497   /* TCP_CLOSE		*/ TCP_CLOSE,
1498   /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1499   /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1500   /* TCP_LISTEN		*/ TCP_CLOSE,
1501   /* TCP_CLOSING	*/ TCP_CLOSING,
1502 };
1503 
1504 static int tcp_close_state(struct sock *sk)
1505 {
1506 	int next = (int)new_state[sk->sk_state];
1507 	int ns = next & TCP_STATE_MASK;
1508 
1509 	tcp_set_state(sk, ns);
1510 
1511 	return next & TCP_ACTION_FIN;
1512 }
1513 
1514 /*
1515  *	Shut down the sending side of a connection. Much like close except
1516  *	that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1517  */
1518 
1519 void tcp_shutdown(struct sock *sk, int how)
1520 {
1521 	/*	We need to grab some memory, and put together a FIN,
1522 	 *	and then put it into the queue to be sent.
1523 	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1524 	 */
1525 	if (!(how & SEND_SHUTDOWN))
1526 		return;
1527 
1528 	/* If we've already sent a FIN, or it's a closed state, skip this. */
1529 	if ((1 << sk->sk_state) &
1530 	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1531 	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1532 		/* Clear out any half completed packets.  FIN if needed. */
1533 		if (tcp_close_state(sk))
1534 			tcp_send_fin(sk);
1535 	}
1536 }
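/*
 * A minimal usage sketch, assuming a hypothetical descriptor "sock_fd" and
 * consumer "consume()": a caller half-closes the write direction (which
 * queues our FIN through the path above) and keeps reading until the peer
 * finishes.
 *
 *	char buf[4096];
 *	ssize_t n;
 *
 *	shutdown(sock_fd, SHUT_WR);
 *	while ((n = read(sock_fd, buf, sizeof(buf))) > 0)
 *		consume(buf, n);
 *	close(sock_fd);
 */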
1537 
1538 void tcp_close(struct sock *sk, long timeout)
1539 {
1540 	struct sk_buff *skb;
1541 	int data_was_unread = 0;
1542 	int state;
1543 
1544 	lock_sock(sk);
1545 	sk->sk_shutdown = SHUTDOWN_MASK;
1546 
1547 	if (sk->sk_state == TCP_LISTEN) {
1548 		tcp_set_state(sk, TCP_CLOSE);
1549 
1550 		/* Special case. */
1551 		inet_csk_listen_stop(sk);
1552 
1553 		goto adjudge_to_death;
1554 	}
1555 
1556 	/*  We need to flush the recv. buffs.  We do this only on the
1557 	 *  descriptor close, not protocol-sourced closes, because the
1558 	 *  reader process may not have drained the data yet!
1559 	 */
1560 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1561 		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1562 			  skb->h.th->fin;
1563 		data_was_unread += len;
1564 		__kfree_skb(skb);
1565 	}
1566 
1567 	sk_stream_mem_reclaim(sk);
1568 
1569 	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1570 	 * 3.10, we send a RST here because data was lost.  To
1571 	 * witness the awful effects of the old behavior of always
1572 	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1573 	 * a bulk GET in an FTP client, suspend the process, wait
1574 	 * for the client to advertise a zero window, then kill -9
1575 	 * the FTP client, wheee...  Note: timeout is always zero
1576 	 * in such a case.
1577 	 */
1578 	if (data_was_unread) {
1579 		/* Unread data was tossed, zap the connection. */
1580 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1581 		tcp_set_state(sk, TCP_CLOSE);
1582 		tcp_send_active_reset(sk, GFP_KERNEL);
1583 	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1584 		/* Check zero linger _after_ checking for unread data. */
1585 		sk->sk_prot->disconnect(sk, 0);
1586 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1587 	} else if (tcp_close_state(sk)) {
1588 		/* We FIN if the application ate all the data before
1589 		 * zapping the connection.
1590 		 */
1591 
1592 		/* RED-PEN. Formally speaking, we have broken the TCP state
1593 		 * machine. State transitions:
1594 		 *
1595 		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1596 		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1597 		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1598 		 *
1599 		 * are legal only when FIN has been sent (i.e. in window),
1600 		 * rather than queued out of window. Purists blame.
1601 		 *
1602 		 * F.e. the "RFC state" is ESTABLISHED
1603 		 * if the Linux state is FIN-WAIT-1 but the FIN has still not been sent.
1604 		 *
1605 		 * The visible deviations are that sometimes
1606 		 * we enter the time-wait state when it is not really required
1607 		 * (harmless), and do not send active resets when they are
1608 		 * required by the specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when
1609 		 * they look like CLOSING or LAST_ACK to Linux).
1610 		 * Probably, I missed some more holelets.
1611 		 * 						--ANK
1612 		 */
1613 		tcp_send_fin(sk);
1614 	}
1615 
1616 	sk_stream_wait_close(sk, timeout);
1617 
1618 adjudge_to_death:
1619 	state = sk->sk_state;
1620 	sock_hold(sk);
1621 	sock_orphan(sk);
1622 	atomic_inc(sk->sk_prot->orphan_count);
1623 
1624 	/* It is the last release_sock in its life. It will remove backlog. */
1625 	release_sock(sk);
1626 
1627 
1628 	/* Now socket is owned by kernel and we acquire BH lock
1629 	   to finish close. No need to check for user refs.
1630 	 */
1631 	local_bh_disable();
1632 	bh_lock_sock(sk);
1633 	BUG_TRAP(!sock_owned_by_user(sk));
1634 
1635 	/* Have we already been destroyed by a softirq or backlog? */
1636 	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
1637 		goto out;
1638 
1639 	/*	This is a (useful) BSD violation of the RFC. There is a
1640 	 *	problem with TCP as specified, in that the other end could
1641 	 *	keep a socket open forever with no application left on this end.
1642 	 *	We use a 3 minute timeout (about the same as BSD) and then kill
1643 	 *	our end. If they send after that then tough - BUT: long enough
1644 	 *	that we won't repeat the old 4*rto = almost no time - whoops
1645 	 *	reset mistake.
1646 	 *
1647 	 *	Nope, it was not a mistake. It is really the desired behaviour,
1648 	 *	f.e. on HTTP servers, where such sockets are useless but
1649 	 *	consume significant resources. Let's do it with the special
1650 	 *	linger2	option.					--ANK
1651 	 */
1652 
1653 	if (sk->sk_state == TCP_FIN_WAIT2) {
1654 		struct tcp_sock *tp = tcp_sk(sk);
1655 		if (tp->linger2 < 0) {
1656 			tcp_set_state(sk, TCP_CLOSE);
1657 			tcp_send_active_reset(sk, GFP_ATOMIC);
1658 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1659 		} else {
1660 			const int tmo = tcp_fin_time(sk);
1661 
1662 			if (tmo > TCP_TIMEWAIT_LEN) {
1663 				inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
1664 			} else {
1665 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1666 				goto out;
1667 			}
1668 		}
1669 	}
1670 	if (sk->sk_state != TCP_CLOSE) {
1671 		sk_stream_mem_reclaim(sk);
1672 		if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1673 		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1674 		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1675 			if (net_ratelimit())
1676 				printk(KERN_INFO "TCP: too many orphaned "
1677 				       "sockets\n");
1678 			tcp_set_state(sk, TCP_CLOSE);
1679 			tcp_send_active_reset(sk, GFP_ATOMIC);
1680 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1681 		}
1682 	}
1683 
1684 	if (sk->sk_state == TCP_CLOSE)
1685 		inet_csk_destroy_sock(sk);
1686 	/* Otherwise, socket is reprieved until protocol close. */
1687 
1688 out:
1689 	bh_unlock_sock(sk);
1690 	local_bh_enable();
1691 	sock_put(sk);
1692 }
1693 
1694 /* These states need RST on ABORT according to RFC793 */
1695 
1696 static inline int tcp_need_reset(int state)
1697 {
1698 	return (1 << state) &
1699 	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1700 		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1701 }
1702 
1703 int tcp_disconnect(struct sock *sk, int flags)
1704 {
1705 	struct inet_sock *inet = inet_sk(sk);
1706 	struct inet_connection_sock *icsk = inet_csk(sk);
1707 	struct tcp_sock *tp = tcp_sk(sk);
1708 	int err = 0;
1709 	int old_state = sk->sk_state;
1710 
1711 	if (old_state != TCP_CLOSE)
1712 		tcp_set_state(sk, TCP_CLOSE);
1713 
1714 	/* ABORT function of RFC793 */
1715 	if (old_state == TCP_LISTEN) {
1716 		inet_csk_listen_stop(sk);
1717 	} else if (tcp_need_reset(old_state) ||
1718 		   (tp->snd_nxt != tp->write_seq &&
1719 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1720 		/* The last check adjusts for the discrepancy between Linux and the
1721 		 * RFC states.
1722 		 */
1723 		tcp_send_active_reset(sk, gfp_any());
1724 		sk->sk_err = ECONNRESET;
1725 	} else if (old_state == TCP_SYN_SENT)
1726 		sk->sk_err = ECONNRESET;
1727 
1728 	tcp_clear_xmit_timers(sk);
1729 	__skb_queue_purge(&sk->sk_receive_queue);
1730 	sk_stream_writequeue_purge(sk);
1731 	__skb_queue_purge(&tp->out_of_order_queue);
1732 #ifdef CONFIG_NET_DMA
1733 	__skb_queue_purge(&sk->sk_async_wait_queue);
1734 #endif
1735 
1736 	inet->dport = 0;
1737 
1738 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1739 		inet_reset_saddr(sk);
1740 
1741 	sk->sk_shutdown = 0;
1742 	sock_reset_flag(sk, SOCK_DONE);
1743 	tp->srtt = 0;
1744 	if ((tp->write_seq += tp->max_window + 2) == 0)
1745 		tp->write_seq = 1;
1746 	icsk->icsk_backoff = 0;
1747 	tp->snd_cwnd = 2;
1748 	icsk->icsk_probes_out = 0;
1749 	tp->packets_out = 0;
1750 	tp->snd_ssthresh = 0x7fffffff;
1751 	tp->snd_cwnd_cnt = 0;
1752 	tp->bytes_acked = 0;
1753 	tcp_set_ca_state(sk, TCP_CA_Open);
1754 	tcp_clear_retrans(tp);
1755 	inet_csk_delack_init(sk);
1756 	sk->sk_send_head = NULL;
1757 	tp->rx_opt.saw_tstamp = 0;
1758 	tcp_sack_reset(&tp->rx_opt);
1759 	__sk_dst_reset(sk);
1760 
1761 	BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1762 
1763 	sk->sk_error_report(sk);
1764 	return err;
1765 }
1766 
1767 /*
1768  *	Socket option code for TCP.
1769  */
1770 static int do_tcp_setsockopt(struct sock *sk, int level,
1771 		int optname, char __user *optval, int optlen)
1772 {
1773 	struct tcp_sock *tp = tcp_sk(sk);
1774 	struct inet_connection_sock *icsk = inet_csk(sk);
1775 	int val;
1776 	int err = 0;
1777 
1778 	/* This is a string value; all the other options are ints. */
1779 	if (optname == TCP_CONGESTION) {
1780 		char name[TCP_CA_NAME_MAX];
1781 
1782 		if (optlen < 1)
1783 			return -EINVAL;
1784 
1785 		val = strncpy_from_user(name, optval,
1786 					min(TCP_CA_NAME_MAX-1, optlen));
1787 		if (val < 0)
1788 			return -EFAULT;
1789 		name[val] = 0;
1790 
1791 		lock_sock(sk);
1792 		err = tcp_set_congestion_control(sk, name);
1793 		release_sock(sk);
1794 		return err;
1795 	}
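
	/* User-space sketch of the string-valued option handled above
	 * (illustrative only; "reno" is always built in, see tcp_init()):
	 *
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "reno",
	 *		   strlen("reno"));
	 *
	 * Names longer than TCP_CA_NAME_MAX - 1 bytes are truncated by the
	 * strncpy_from_user() call before the lookup is attempted.
	 */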
1796 
1797 	if (optlen < sizeof(int))
1798 		return -EINVAL;
1799 
1800 	if (get_user(val, (int __user *)optval))
1801 		return -EFAULT;
1802 
1803 	lock_sock(sk);
1804 
1805 	switch (optname) {
1806 	case TCP_MAXSEG:
1807 		/* Values greater than the interface MTU won't take effect.
1808 		 * However, at the point when this call is made we typically
1809 		 * don't yet know which interface is going to be used. */
1810 		if (val < 8 || val > MAX_TCP_WINDOW) {
1811 			err = -EINVAL;
1812 			break;
1813 		}
1814 		tp->rx_opt.user_mss = val;
1815 		break;
1816 
1817 	case TCP_NODELAY:
1818 		if (val) {
1819 			/* TCP_NODELAY is weaker than TCP_CORK, so that
1820 			 * this option on a corked socket is remembered, but
1821 			 * it is not activated until cork is cleared.
1822 			 *
1823 			 * However, when TCP_NODELAY is set we make
1824 			 * an explicit push, which overrides even TCP_CORK
1825 			 * for currently queued segments.
1826 			 */
1827 			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1828 			tcp_push_pending_frames(sk, tp);
1829 		} else {
1830 			tp->nonagle &= ~TCP_NAGLE_OFF;
1831 		}
1832 		break;
1833 
1834 	case TCP_CORK:
1835 		/* When set, this tells TCP to always queue non-full frames.
1836 		 * Later the user clears this option and we transmit
1837 		 * any pending partial frames in the queue.  This is
1838 		 * meant to be used alongside sendfile() to get properly
1839 		 * filled frames when the user (for example) must write
1840 		 * out headers with a write() call first and then use
1841 		 * sendfile to send out the data parts.
1842 		 *
1843 		 * TCP_CORK can be set together with TCP_NODELAY and it is
1844 		 * stronger than TCP_NODELAY.
1845 		 */
1846 		if (val) {
1847 			tp->nonagle |= TCP_NAGLE_CORK;
1848 		} else {
1849 			tp->nonagle &= ~TCP_NAGLE_CORK;
1850 			if (tp->nonagle&TCP_NAGLE_OFF)
1851 				tp->nonagle |= TCP_NAGLE_PUSH;
1852 			tcp_push_pending_frames(sk, tp);
1853 		}
1854 		break;
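
	/* A typical user-space use of the TCP_CORK/TCP_NODELAY pair described
	 * above (sketch only; names illustrative, error handling omitted):
	 *
	 *	int on = 1, off = 0;
	 *
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	 *	write(fd, hdr, hdr_len);
	 *	sendfile(fd, file_fd, NULL, file_len);
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
	 *
	 * Clearing TCP_CORK is what reaches tcp_push_pending_frames() above,
	 * so the final partial segment is only transmitted at that point.
	 */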
1855 
1856 	case TCP_KEEPIDLE:
1857 		if (val < 1 || val > MAX_TCP_KEEPIDLE)
1858 			err = -EINVAL;
1859 		else {
1860 			tp->keepalive_time = val * HZ;
1861 			if (sock_flag(sk, SOCK_KEEPOPEN) &&
1862 			    !((1 << sk->sk_state) &
1863 			      (TCPF_CLOSE | TCPF_LISTEN))) {
1864 				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1865 				if (tp->keepalive_time > elapsed)
1866 					elapsed = tp->keepalive_time - elapsed;
1867 				else
1868 					elapsed = 0;
1869 				inet_csk_reset_keepalive_timer(sk, elapsed);
1870 			}
1871 		}
1872 		break;
1873 	case TCP_KEEPINTVL:
1874 		if (val < 1 || val > MAX_TCP_KEEPINTVL)
1875 			err = -EINVAL;
1876 		else
1877 			tp->keepalive_intvl = val * HZ;
1878 		break;
1879 	case TCP_KEEPCNT:
1880 		if (val < 1 || val > MAX_TCP_KEEPCNT)
1881 			err = -EINVAL;
1882 		else
1883 			tp->keepalive_probes = val;
1884 		break;
1885 	case TCP_SYNCNT:
1886 		if (val < 1 || val > MAX_TCP_SYNCNT)
1887 			err = -EINVAL;
1888 		else
1889 			icsk->icsk_syn_retries = val;
1890 		break;
1891 
1892 	case TCP_LINGER2:
1893 		if (val < 0)
1894 			tp->linger2 = -1;
1895 		else if (val > sysctl_tcp_fin_timeout / HZ)
1896 			tp->linger2 = 0;
1897 		else
1898 			tp->linger2 = val * HZ;
1899 		break;
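
	/* User-space sketch (illustrative only): with the default
	 * tcp_fin_timeout of 60 seconds,
	 *
	 *	int secs = 5;
	 *	setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &secs, sizeof(secs));
	 *
	 * limits this socket's orphaned FIN-WAIT-2 lifetime to roughly five
	 * seconds, a negative value makes close() abort FIN-WAIT-2 with a
	 * RST (see tcp_close() above), and values above the sysctl limit
	 * are treated as the sysctl default.
	 */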
1900 
1901 	case TCP_DEFER_ACCEPT:
1902 		icsk->icsk_accept_queue.rskq_defer_accept = 0;
1903 		if (val > 0) {
1904 			/* Translate value in seconds to number of
1905 			 * retransmits */
1906 			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
1907 			       val > ((TCP_TIMEOUT_INIT / HZ) <<
1908 				       icsk->icsk_accept_queue.rskq_defer_accept))
1909 				icsk->icsk_accept_queue.rskq_defer_accept++;
1910 			icsk->icsk_accept_queue.rskq_defer_accept++;
1911 		}
1912 		break;
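
	/* Worked example of the conversion above, assuming the classic
	 * TCP_TIMEOUT_INIT of 3 seconds: the thresholds are 3, 6, 12, 24,
	 * 48, ... seconds, so asking for 30 seconds stops after four
	 * doublings and stores rskq_defer_accept = 5.  Reading the option
	 * back (see do_tcp_getsockopt()) reports the rounded-up value
	 * 3 << (5 - 1) = 48 seconds rather than the original 30.
	 */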
1913 
1914 	case TCP_WINDOW_CLAMP:
1915 		if (!val) {
1916 			if (sk->sk_state != TCP_CLOSE) {
1917 				err = -EINVAL;
1918 				break;
1919 			}
1920 			tp->window_clamp = 0;
1921 		} else
1922 			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
1923 						SOCK_MIN_RCVBUF / 2 : val;
1924 		break;
1925 
1926 	case TCP_QUICKACK:
1927 		if (!val) {
1928 			icsk->icsk_ack.pingpong = 1;
1929 		} else {
1930 			icsk->icsk_ack.pingpong = 0;
1931 			if ((1 << sk->sk_state) &
1932 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1933 			    inet_csk_ack_scheduled(sk)) {
1934 				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1935 				tcp_cleanup_rbuf(sk, 1);
1936 				if (!(val & 1))
1937 					icsk->icsk_ack.pingpong = 1;
1938 			}
1939 		}
1940 		break;
1941 
1942 	default:
1943 		err = -ENOPROTOOPT;
1944 		break;
1945 	}
1946 	release_sock(sk);
1947 	return err;
1948 }
1949 
1950 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1951 		   int optlen)
1952 {
1953 	struct inet_connection_sock *icsk = inet_csk(sk);
1954 
1955 	if (level != SOL_TCP)
1956 		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
1957 						     optval, optlen);
1958 	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
1959 }
1960 
1961 #ifdef CONFIG_COMPAT
1962 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
1963 			  char __user *optval, int optlen)
1964 {
1965 	if (level != SOL_TCP)
1966 		return inet_csk_compat_setsockopt(sk, level, optname,
1967 						  optval, optlen);
1968 	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
1969 }
1970 
1971 EXPORT_SYMBOL(compat_tcp_setsockopt);
1972 #endif
1973 
1974 /* Return information about state of tcp endpoint in API format. */
1975 /* Return information about the state of a TCP endpoint in API format. */
1976 {
1977 	struct tcp_sock *tp = tcp_sk(sk);
1978 	const struct inet_connection_sock *icsk = inet_csk(sk);
1979 	u32 now = tcp_time_stamp;
1980 
1981 	memset(info, 0, sizeof(*info));
1982 
1983 	info->tcpi_state = sk->sk_state;
1984 	info->tcpi_ca_state = icsk->icsk_ca_state;
1985 	info->tcpi_retransmits = icsk->icsk_retransmits;
1986 	info->tcpi_probes = icsk->icsk_probes_out;
1987 	info->tcpi_backoff = icsk->icsk_backoff;
1988 
1989 	if (tp->rx_opt.tstamp_ok)
1990 		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1991 	if (tp->rx_opt.sack_ok)
1992 		info->tcpi_options |= TCPI_OPT_SACK;
1993 	if (tp->rx_opt.wscale_ok) {
1994 		info->tcpi_options |= TCPI_OPT_WSCALE;
1995 		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
1996 		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
1997 	}
1998 
1999 	if (tp->ecn_flags&TCP_ECN_OK)
2000 		info->tcpi_options |= TCPI_OPT_ECN;
2001 
2002 	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2003 	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2004 	info->tcpi_snd_mss = tp->mss_cache;
2005 	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2006 
2007 	info->tcpi_unacked = tp->packets_out;
2008 	info->tcpi_sacked = tp->sacked_out;
2009 	info->tcpi_lost = tp->lost_out;
2010 	info->tcpi_retrans = tp->retrans_out;
2011 	info->tcpi_fackets = tp->fackets_out;
2012 
2013 	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2014 	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2015 	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2016 
2017 	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2018 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2019 	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2020 	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2021 	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2022 	info->tcpi_snd_cwnd = tp->snd_cwnd;
2023 	info->tcpi_advmss = tp->advmss;
2024 	info->tcpi_reordering = tp->reordering;
2025 
2026 	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2027 	info->tcpi_rcv_space = tp->rcvq_space.space;
2028 
2029 	info->tcpi_total_retrans = tp->total_retrans;
2030 }
2031 
2032 EXPORT_SYMBOL_GPL(tcp_get_info);
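
/* tcp_get_info() backs the TCP_INFO socket option.  A user-space sketch of
 * reading it (illustrative only):
 *
 *	struct tcp_info info;
 *	socklen_t len = sizeof(info);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
 *		printf("rtt %u us, cwnd %u, retrans %u\n",
 *		       info.tcpi_rtt, info.tcpi_snd_cwnd,
 *		       info.tcpi_total_retrans);
 *
 * do_tcp_getsockopt() copies at most min(len, sizeof(info)) bytes, so
 * binaries built against an older, shorter struct tcp_info keep working
 * when new fields are appended.
 */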
2033 
2034 static int do_tcp_getsockopt(struct sock *sk, int level,
2035 		int optname, char __user *optval, int __user *optlen)
2036 {
2037 	struct inet_connection_sock *icsk = inet_csk(sk);
2038 	struct tcp_sock *tp = tcp_sk(sk);
2039 	int val, len;
2040 
2041 	if (get_user(len, optlen))
2042 		return -EFAULT;
2043 
2044 	len = min_t(unsigned int, len, sizeof(int));
2045 
2046 	if (len < 0)
2047 		return -EINVAL;
2048 
2049 	switch (optname) {
2050 	case TCP_MAXSEG:
2051 		val = tp->mss_cache;
2052 		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2053 			val = tp->rx_opt.user_mss;
2054 		break;
2055 	case TCP_NODELAY:
2056 		val = !!(tp->nonagle&TCP_NAGLE_OFF);
2057 		break;
2058 	case TCP_CORK:
2059 		val = !!(tp->nonagle&TCP_NAGLE_CORK);
2060 		break;
2061 	case TCP_KEEPIDLE:
2062 		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2063 		break;
2064 	case TCP_KEEPINTVL:
2065 		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2066 		break;
2067 	case TCP_KEEPCNT:
2068 		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2069 		break;
2070 	case TCP_SYNCNT:
2071 		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2072 		break;
2073 	case TCP_LINGER2:
2074 		val = tp->linger2;
2075 		if (val >= 0)
2076 			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2077 		break;
2078 	case TCP_DEFER_ACCEPT:
2079 		val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
2080 			((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
2081 		break;
2082 	case TCP_WINDOW_CLAMP:
2083 		val = tp->window_clamp;
2084 		break;
2085 	case TCP_INFO: {
2086 		struct tcp_info info;
2087 
2088 		if (get_user(len, optlen))
2089 			return -EFAULT;
2090 
2091 		tcp_get_info(sk, &info);
2092 
2093 		len = min_t(unsigned int, len, sizeof(info));
2094 		if (put_user(len, optlen))
2095 			return -EFAULT;
2096 		if (copy_to_user(optval, &info, len))
2097 			return -EFAULT;
2098 		return 0;
2099 	}
2100 	case TCP_QUICKACK:
2101 		val = !icsk->icsk_ack.pingpong;
2102 		break;
2103 
2104 	case TCP_CONGESTION:
2105 		if (get_user(len, optlen))
2106 			return -EFAULT;
2107 		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2108 		if (put_user(len, optlen))
2109 			return -EFAULT;
2110 		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2111 			return -EFAULT;
2112 		return 0;
2113 	default:
2114 		return -ENOPROTOOPT;
2115 	}
2116 
2117 	if (put_user(len, optlen))
2118 		return -EFAULT;
2119 	if (copy_to_user(optval, &val, len))
2120 		return -EFAULT;
2121 	return 0;
2122 }
2123 
2124 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2125 		   int __user *optlen)
2126 {
2127 	struct inet_connection_sock *icsk = inet_csk(sk);
2128 
2129 	if (level != SOL_TCP)
2130 		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2131 						     optval, optlen);
2132 	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2133 }
2134 
2135 #ifdef CONFIG_COMPAT
2136 int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2137 			  char __user *optval, int __user *optlen)
2138 {
2139 	if (level != SOL_TCP)
2140 		return inet_csk_compat_getsockopt(sk, level, optname,
2141 						  optval, optlen);
2142 	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2143 }
2144 
2145 EXPORT_SYMBOL(compat_tcp_getsockopt);
2146 #endif
2147 
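/* Software fallback for TSO/GSO: split one oversized TCP skb into mss-sized
 * segments and patch every TCP header.  The checksum update uses the usual
 * incremental trick: since the pseudo-header sum in th->check covers the TCP
 * length, swapping the old total length for the per-segment length only
 * needs the one's-complement delta
 *
 *	delta = (u16)~old_len + (thlen + gso_size)
 *
 * added and folded, without touching payload bytes; the last (possibly
 * shorter) segment recomputes its own delta from its real length.  Segments
 * that were not checksummed in hardware instead get a full checksum over the
 * new header plus the payload sum left in skb->csum.
 */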
2148 struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
2149 {
2150 	struct sk_buff *segs = ERR_PTR(-EINVAL);
2151 	struct tcphdr *th;
2152 	unsigned int thlen;
2153 	unsigned int seq;
2154 	unsigned int delta;
2155 	unsigned int oldlen;
2156 	unsigned int len;
2157 
2158 	if (!pskb_may_pull(skb, sizeof(*th)))
2159 		goto out;
2160 
2161 	th = skb->h.th;
2162 	thlen = th->doff * 4;
2163 	if (thlen < sizeof(*th))
2164 		goto out;
2165 
2166 	if (!pskb_may_pull(skb, thlen))
2167 		goto out;
2168 
2169 	segs = NULL;
2170 	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST))
2171 		goto out;
2172 
2173 	oldlen = (u16)~skb->len;
2174 	__skb_pull(skb, thlen);
2175 
2176 	segs = skb_segment(skb, features);
2177 	if (IS_ERR(segs))
2178 		goto out;
2179 
2180 	len = skb_shinfo(skb)->gso_size;
2181 	delta = htonl(oldlen + (thlen + len));
2182 
2183 	skb = segs;
2184 	th = skb->h.th;
2185 	seq = ntohl(th->seq);
2186 
2187 	do {
2188 		th->fin = th->psh = 0;
2189 
2190 		th->check = ~csum_fold(th->check + delta);
2191 		if (skb->ip_summed != CHECKSUM_HW)
2192 			th->check = csum_fold(csum_partial(skb->h.raw, thlen,
2193 							   skb->csum));
2194 
2195 		seq += len;
2196 		skb = skb->next;
2197 		th = skb->h.th;
2198 
2199 		th->seq = htonl(seq);
2200 		th->cwr = 0;
2201 	} while (skb->next);
2202 
2203 	delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len);
2204 	th->check = ~csum_fold(th->check + delta);
2205 	if (skb->ip_summed != CHECKSUM_HW)
2206 		th->check = csum_fold(csum_partial(skb->h.raw, thlen,
2207 						   skb->csum));
2208 
2209 out:
2210 	return segs;
2211 }
2212 
2213 extern void __skb_cb_too_small_for_tcp(int, int);
2214 extern struct tcp_congestion_ops tcp_reno;
2215 
2216 static __initdata unsigned long thash_entries;
2217 static int __init set_thash_entries(char *str)
2218 {
2219 	if (!str)
2220 		return 0;
2221 	thash_entries = simple_strtoul(str, &str, 0);
2222 	return 1;
2223 }
2224 __setup("thash_entries=", set_thash_entries);
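
/* "thash_entries=" is a boot-time parameter: for example, booting with
 * thash_entries=131072 on the kernel command line overrides the purely
 * memory-based sizing of the established hash table in tcp_init() below.
 */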
2225 
2226 void __init tcp_init(void)
2227 {
2228 	struct sk_buff *skb = NULL;
2229 	unsigned long limit;
2230 	int order, i, max_share;
2231 
2232 	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2233 		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2234 					   sizeof(skb->cb));
2235 
2236 	tcp_hashinfo.bind_bucket_cachep =
2237 		kmem_cache_create("tcp_bind_bucket",
2238 				  sizeof(struct inet_bind_bucket), 0,
2239 				  SLAB_HWCACHE_ALIGN, NULL, NULL);
2240 	if (!tcp_hashinfo.bind_bucket_cachep)
2241 		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2242 
2243 	/* Size and allocate the main established and bind bucket
2244 	 * hash tables.
2245 	 *
2246 	 * The methodology is similar to that of the buffer cache.
2247 	 */
2248 	tcp_hashinfo.ehash =
2249 		alloc_large_system_hash("TCP established",
2250 					sizeof(struct inet_ehash_bucket),
2251 					thash_entries,
2252 					(num_physpages >= 128 * 1024) ?
2253 					13 : 15,
2254 					HASH_HIGHMEM,
2255 					&tcp_hashinfo.ehash_size,
2256 					NULL,
2257 					0);
2258 	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2259 	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2260 		rwlock_init(&tcp_hashinfo.ehash[i].lock);
2261 		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2262 	}
2263 
2264 	tcp_hashinfo.bhash =
2265 		alloc_large_system_hash("TCP bind",
2266 					sizeof(struct inet_bind_hashbucket),
2267 					tcp_hashinfo.ehash_size,
2268 					(num_physpages >= 128 * 1024) ?
2269 					13 : 15,
2270 					HASH_HIGHMEM,
2271 					&tcp_hashinfo.bhash_size,
2272 					NULL,
2273 					64 * 1024);
2274 	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2275 	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2276 		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2277 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2278 	}
2279 
2280 	/* Try to be a bit smarter and adjust defaults depending
2281 	 * on available memory.
2282 	 */
2283 	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2284 			(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2285 			order++)
2286 		;
2287 	if (order >= 4) {
2288 		sysctl_local_port_range[0] = 32768;
2289 		sysctl_local_port_range[1] = 61000;
2290 		tcp_death_row.sysctl_max_tw_buckets = 180000;
2291 		sysctl_tcp_max_orphans = 4096 << (order - 4);
2292 		sysctl_max_syn_backlog = 1024;
2293 	} else if (order < 3) {
2294 		sysctl_local_port_range[0] = 1024 * (3 - order);
2295 		tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2296 		sysctl_tcp_max_orphans >>= (3 - order);
2297 		sysctl_max_syn_backlog = 128;
2298 	}
2299 
2300 	sysctl_tcp_mem[0] =  768 << order;
2301 	sysctl_tcp_mem[1] = 1024 << order;
2302 	sysctl_tcp_mem[2] = 1536 << order;
2303 
2304 	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
2305 	max_share = min(4UL*1024*1024, limit);
2306 
2307 	sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
2308 	sysctl_tcp_wmem[1] = 16*1024;
2309 	sysctl_tcp_wmem[2] = max(64*1024, max_share);
2310 
2311 	sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
2312 	sysctl_tcp_rmem[1] = 87380;
2313 	sysctl_tcp_rmem[2] = max(87380, max_share);
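
	/* Worked example (assuming 4 KB pages and order == 4): tcp_mem
	 * becomes {12288, 16384, 24576} pages, i.e. roughly 48/64/96 MB of
	 * TCP buffer memory, limit is 16384 << (12 - 7) = 512 KB, so
	 * max_share is 512 KB and both tcp_wmem[2] and tcp_rmem[2] let a
	 * single socket grow to about half a megabyte.
	 */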
2314 
2315 	printk(KERN_INFO "TCP: Hash tables configured "
2316 	       "(established %d bind %d)\n",
2317 	       tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2318 
2319 	tcp_register_congestion_control(&tcp_reno);
2320 }
2321 
2322 EXPORT_SYMBOL(tcp_close);
2323 EXPORT_SYMBOL(tcp_disconnect);
2324 EXPORT_SYMBOL(tcp_getsockopt);
2325 EXPORT_SYMBOL(tcp_ioctl);
2326 EXPORT_SYMBOL(tcp_poll);
2327 EXPORT_SYMBOL(tcp_read_sock);
2328 EXPORT_SYMBOL(tcp_recvmsg);
2329 EXPORT_SYMBOL(tcp_sendmsg);
2330 EXPORT_SYMBOL(tcp_sendpage);
2331 EXPORT_SYMBOL(tcp_setsockopt);
2332 EXPORT_SYMBOL(tcp_shutdown);
2333 EXPORT_SYMBOL(tcp_statistics);
2334