xref: /linux/net/ipv4/tcp.c (revision 60b2737de1b1ddfdb90f3ba622634eb49d6f3603)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *		Florian La Roche, <flla@stud.uni-sb.de>
15  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *		Matthew Dillon, <dillon@apollo.west.oic.com>
19  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *		Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *		Alan Cox	:	Numerous verify_area() calls
24  *		Alan Cox	:	Set the ACK bit on a reset
25  *		Alan Cox	:	Stopped it crashing if it closed while
26  *					sk->inuse=1 and was trying to connect
27  *					(tcp_err()).
28  *		Alan Cox	:	All icmp error handling was broken
29  *					pointers passed where wrong and the
30  *					socket was looked up backwards. Nobody
31  *					tested any icmp error code obviously.
32  *		Alan Cox	:	tcp_err() now handled properly. It
33  *					wakes people on errors. poll
34  *					behaves and the icmp error race
35  *					has gone by moving it into sock.c
36  *		Alan Cox	:	tcp_send_reset() fixed to work for
37  *					everything not just packets for
38  *					unknown sockets.
39  *		Alan Cox	:	tcp option processing.
40  *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41  *					syn rule wrong]
42  *		Herp Rosmanith  :	More reset fixes
43  *		Alan Cox	:	No longer acks invalid rst frames.
44  *					Acking any kind of RST is right out.
45  *		Alan Cox	:	Sets an ignore me flag on an rst
46  *					receive otherwise odd bits of prattle
47  *					escape still
48  *		Alan Cox	:	Fixed another acking RST frame bug.
49  *					Should stop LAN workplace lockups.
50  *		Alan Cox	: 	Some tidyups using the new skb list
51  *					facilities
52  *		Alan Cox	:	sk->keepopen now seems to work
53  *		Alan Cox	:	Pulls options out correctly on accepts
54  *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55  *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56  *					bit to skb ops.
57  *		Alan Cox	:	Tidied tcp_data to avoid a potential
58  *					nasty.
59  *		Alan Cox	:	Added some better commenting, as the
60  *					tcp is hard to follow
61  *		Alan Cox	:	Removed incorrect check for 20 * psh
62  *	Michael O'Reilly	:	ack < copied bug fix.
63  *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64  *		Alan Cox	:	FIN with no memory -> CRASH
65  *		Alan Cox	:	Added socket option proto entries.
66  *					Also added awareness of them to accept.
67  *		Alan Cox	:	Added TCP options (SOL_TCP)
68  *		Alan Cox	:	Switched wakeup calls to callbacks,
69  *					so the kernel can layer network
70  *					sockets.
71  *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72  *		Alan Cox	:	Handle FIN (more) properly (we hope).
73  *		Alan Cox	:	RST frames sent on unsynchronised
74  *					state ack error.
75  *		Alan Cox	:	Put in missing check for SYN bit.
76  *		Alan Cox	:	Added tcp_select_window() aka NET2E
77  *					window non shrink trick.
78  *		Alan Cox	:	Added a couple of small NET2E timer
79  *					fixes
80  *		Charles Hedrick :	TCP fixes
81  *		Toomas Tamm	:	TCP window fixes
82  *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83  *		Charles Hedrick	:	Rewrote most of it to actually work
84  *		Linus		:	Rewrote tcp_read() and URG handling
85  *					completely
86  *		Gerhard Koerting:	Fixed some missing timer handling
87  *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88  *		Gerhard Koerting:	PC/TCP workarounds
89  *		Adam Caldwell	:	Assorted timer/timing errors
90  *		Matthew Dillon	:	Fixed another RST bug
91  *		Alan Cox	:	Move to kernel side addressing changes.
92  *		Alan Cox	:	Beginning work on TCP fastpathing
93  *					(not yet usable)
94  *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95  *		Alan Cox	:	TCP fast path debugging
96  *		Alan Cox	:	Window clamping
97  *		Michael Riepe	:	Bug in tcp_check()
98  *		Matt Dillon	:	More TCP improvements and RST bug fixes
 99  *		Matt Dillon	:	Yet more small nasties removed from the
100  *					TCP code (Be very nice to this man if
101  *					tcp finally works 100%) 8)
102  *		Alan Cox	:	BSD accept semantics.
103  *		Alan Cox	:	Reset on closedown bug.
104  *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105  *		Michael Pall	:	Handle poll() after URG properly in
106  *					all cases.
107  *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108  *					(multi URG PUSH broke rlogin).
109  *		Michael Pall	:	Fix the multi URG PUSH problem in
110  *					tcp_readable(), poll() after URG
111  *					works now.
112  *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113  *					BSD api.
114  *		Alan Cox	:	Changed the semantics of sk->socket to
115  *					fix a race and a signal problem with
116  *					accept() and async I/O.
117  *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118  *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119  *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120  *					clients/servers which listen in on
121  *					fixed ports.
122  *		Alan Cox	:	Cleaned the above up and shrank it to
123  *					a sensible code size.
124  *		Alan Cox	:	Self connect lockup fix.
125  *		Alan Cox	:	No connect to multicast.
126  *		Ross Biro	:	Close unaccepted children on master
127  *					socket close.
128  *		Alan Cox	:	Reset tracing code.
129  *		Alan Cox	:	Spurious resets on shutdown.
130  *		Alan Cox	:	Giant 15 minute/60 second timer error
131  *		Alan Cox	:	Small whoops in polling before an
132  *					accept.
133  *		Alan Cox	:	Kept the state trace facility since
134  *					it's handy for debugging.
135  *		Alan Cox	:	More reset handler fixes.
136  *		Alan Cox	:	Started rewriting the code based on
137  *					the RFC's for other useful protocol
138  *					references see: Comer, KA9Q NOS, and
139  *					for a reference on the difference
140  *					between specifications and how BSD
141  *					works see the 4.4lite source.
142  *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143  *					close.
144  *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145  *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146  *		Alan Cox	:	Reimplemented timers as per the RFC
147  *					and using multiple timers for sanity.
148  *		Alan Cox	:	Small bug fixes, and a lot of new
149  *					comments.
150  *		Alan Cox	:	Fixed dual reader crash by locking
151  *					the buffers (much like datagram.c)
152  *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153  *					now gets fed up of retrying without
154  *					(even a no space) answer.
155  *		Alan Cox	:	Extracted closing code better
156  *		Alan Cox	:	Fixed the closing state machine to
157  *					resemble the RFC.
158  *		Alan Cox	:	More 'per spec' fixes.
159  *		Jorge Cwik	:	Even faster checksumming.
160  *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161  *					only frames. At least one pc tcp stack
162  *					generates them.
163  *		Alan Cox	:	Cache last socket.
164  *		Alan Cox	:	Per route irtt.
165  *		Matt Day	:	poll()->select() match BSD precisely on error
166  *		Alan Cox	:	New buffers
167  *		Marc Tamsky	:	Various sk->prot->retransmits and
168  *					sk->retransmits misupdating fixed.
169  *					Fixed tcp_write_timeout: stuck close,
170  *					and TCP syn retries gets used now.
171  *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172  *					ack if state is TCP_CLOSED.
173  *		Alan Cox	:	Look up device on a retransmit - routes may
174  *					change. Doesn't yet cope with MSS shrink right
175  *					but it's a start!
176  *		Marc Tamsky	:	Closing in closing fixes.
177  *		Mike Shaver	:	RFC1122 verifications.
178  *		Alan Cox	:	rcv_saddr errors.
179  *		Alan Cox	:	Block double connect().
180  *		Alan Cox	:	Small hooks for enSKIP.
181  *		Alexey Kuznetsov:	Path MTU discovery.
182  *		Alan Cox	:	Support soft errors.
183  *		Alan Cox	:	Fix MTU discovery pathological case
184  *					when the remote claims no mtu!
185  *		Marc Tamsky	:	TCP_CLOSE fix.
186  *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187  *					window but wrong (fixes NT lpd problems)
188  *		Pedro Roque	:	Better TCP window handling, delayed ack.
189  *		Joerg Reuter	:	No modification of locked buffers in
190  *					tcp_do_retransmit()
191  *		Eric Schenk	:	Changed receiver side silly window
192  *					avoidance algorithm to BSD style
193  *					algorithm. This doubles throughput
194  *					against machines running Solaris,
195  *					and seems to result in general
196  *					improvement.
197  *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198  *	Willy Konynenberg	:	Transparent proxying support.
199  *	Mike McLagan		:	Routing by source
200  *		Keith Owens	:	Do proper merging with partial SKB's in
201  *					tcp_do_sendmsg to avoid burstiness.
202  *		Eric Schenk	:	Fix fast close down bug with
203  *					shutdown() followed by close().
204  *		Andi Kleen 	:	Make poll agree with SIGIO
205  *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206  *					lingertime == 0 (RFC 793 ABORT Call)
207  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208  *					csum_and_copy_from_user() if possible.
209  *
210  *		This program is free software; you can redistribute it and/or
211  *		modify it under the terms of the GNU General Public License
212  *		as published by the Free Software Foundation; either version
213  *		2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *	TCP_SYN_SENT		sent a connection request, waiting for ack
218  *
219  *	TCP_SYN_RECV		received a connection request, sent ack,
220  *				waiting for final ack in three-way handshake.
221  *
222  *	TCP_ESTABLISHED		connection established
223  *
224  *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225  *				transmission of remaining buffered data
226  *
227  *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228  *				to shutdown
229  *
230  *	TCP_CLOSING		both sides have shutdown but we still have
231  *				data we have to finish sending
232  *
233  *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234  *				closed, can only be entered from FIN_WAIT2
235  *				or CLOSING.  Required because the other end
236  *				may not have gotten our last ACK causing it
237  *				to retransmit the data packet (which we ignore)
238  *
239  *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240  *				us to finish writing our data and to shutdown
241  *				(we have to close() to move on to LAST_ACK)
242  *
 243  *	TCP_LAST_ACK		our side has shutdown after remote has
244  *				shutdown.  There may still be data in our
245  *				buffer that we have to finish sending
246  *
247  *	TCP_CLOSE		socket is finished
248  */
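/*
 * For orientation, a sketch of the usual walks through the states above
 * (summary only, per RFC 793): an active close goes
 *	ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE
 * while the passive side goes
 *	ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE
 * and a simultaneous close passes through CLOSING (and then TIME_WAIT)
 * instead of FIN_WAIT2.
 */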
249 
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260 
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265 
266 
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269 
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271 
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273 
274 kmem_cache_t *tcp_bucket_cachep;
275 kmem_cache_t *tcp_timewait_cachep;
276 
277 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278 
279 int sysctl_tcp_mem[3];
280 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282 
283 EXPORT_SYMBOL(sysctl_tcp_mem);
284 EXPORT_SYMBOL(sysctl_tcp_rmem);
285 EXPORT_SYMBOL(sysctl_tcp_wmem);
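/*
 * Note: tcp_rmem/tcp_wmem above hold { min, default, max } socket buffer
 * sizes and tcp_mem holds { low, pressure, high } page thresholds; they are
 * exposed as /proc/sys/net/ipv4/tcp_{rmem,wmem,mem}.  sysctl_tcp_mem is left
 * zero here and is expected to be sized at boot from available memory (see
 * tcp_init() later in this file).
 */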
286 
287 atomic_t tcp_memory_allocated;	/* Current allocated memory. */
288 atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
289 
290 EXPORT_SYMBOL(tcp_memory_allocated);
291 EXPORT_SYMBOL(tcp_sockets_allocated);
292 
293 /*
294  * Pressure flag: try to collapse.
295  * Technical note: it is used by multiple contexts non-atomically.
296  * All of the sk_stream_mem_schedule() logic is of this nature: accounting
297  * is strict, actions are advisory and have some latency.
298  */
299 int tcp_memory_pressure;
300 
301 EXPORT_SYMBOL(tcp_memory_pressure);
302 
303 void tcp_enter_memory_pressure(void)
304 {
305 	if (!tcp_memory_pressure) {
306 		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307 		tcp_memory_pressure = 1;
308 	}
309 }
310 
311 EXPORT_SYMBOL(tcp_enter_memory_pressure);
312 
313 /*
314  * LISTEN is a special case for poll..
315  */
316 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317 					       poll_table *wait)
318 {
319 	return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320 }
321 
322 /*
323  *	Wait for a TCP event.
324  *
325  *	Note that we don't need to lock the socket, as the upper poll layers
326  *	take care of normal races (between the test and the event) and we don't
327  *	go look at any of the socket buffers directly.
328  */
329 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
330 {
331 	unsigned int mask;
332 	struct sock *sk = sock->sk;
333 	struct tcp_sock *tp = tcp_sk(sk);
334 
335 	poll_wait(file, sk->sk_sleep, wait);
336 	if (sk->sk_state == TCP_LISTEN)
337 		return tcp_listen_poll(sk, wait);
338 
339 	/* Socket is not locked. We are protected from async events
340 	   by the poll logic, and correct handling of state changes
341 	   made by other threads is impossible in any case.
342 	 */
343 
344 	mask = 0;
345 	if (sk->sk_err)
346 		mask = POLLERR;
347 
348 	/*
349 	 * POLLHUP is certainly not done right. But poll() doesn't
350 	 * have a notion of HUP in just one direction, and for a
351 	 * socket the read side is more interesting.
352 	 *
353 	 * Some poll() documentation says that POLLHUP is incompatible
354 	 * with the POLLOUT/POLLWRNORM flags, so somebody should check all
355 	 * this. But be careful: it tends to be safer to return too many
356 	 * bits than too few, and you can easily break real applications
357 	 * if you don't tell them that something has hung up!
358 	 *
359 	 * Check-me.
360 	 *
361 	 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
362 	 * our fs/select.c). It means that after we received EOF,
363 	 * poll always returns immediately, making it impossible to poll()
364 	 * for write in state CLOSE_WAIT. One solution is evident --- set
365 	 * POLLHUP if and only if shutdown has been made in both directions.
366 	 * Actually, it is interesting to look at how Solaris and DUX
367 	 * solve this dilemma. I would prefer POLLHUP to be maskable; then
368 	 * we could set it on SND_SHUTDOWN. BTW the examples given in
369 	 * Stevens' books assume exactly this behaviour, which explains
370 	 * why POLLHUP is incompatible with POLLOUT.	--ANK
371 	 *
372 	 * NOTE. A check for TCP_CLOSE has been added. The goal is to prevent
373 	 * blocking on a fresh, not-yet-connected or disconnected socket. --ANK
374 	 */
375 	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
376 		mask |= POLLHUP;
377 	if (sk->sk_shutdown & RCV_SHUTDOWN)
378 		mask |= POLLIN | POLLRDNORM;
379 
380 	/* Connected? */
381 	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
382 		/* Potential race condition. If the read of tp below is
383 		 * reordered before the read of sk->sk_state above, we can be
384 		 * illegally awakened in SYN_* states. */
385 		if ((tp->rcv_nxt != tp->copied_seq) &&
386 		    (tp->urg_seq != tp->copied_seq ||
387 		     tp->rcv_nxt != tp->copied_seq + 1 ||
388 		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
389 			mask |= POLLIN | POLLRDNORM;
390 
391 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
392 			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
393 				mask |= POLLOUT | POLLWRNORM;
394 			} else {  /* send SIGIO later */
395 				set_bit(SOCK_ASYNC_NOSPACE,
396 					&sk->sk_socket->flags);
397 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
398 
399 				/* Race breaker. If space is freed after
400 				 * wspace test but before the flags are set,
401 				 * IO signal will be lost.
402 				 */
403 				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
404 					mask |= POLLOUT | POLLWRNORM;
405 			}
406 		}
407 
408 		if (tp->urg_data & TCP_URG_VALID)
409 			mask |= POLLPRI;
410 	}
411 	return mask;
412 }
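/*
 * A minimal user-space sketch of how the mask computed by tcp_poll() above
 * is typically consumed; illustrative only, and handle_readable(),
 * handle_urgent() and handle_hangup() are hypothetical application helpers:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLIN)	handle_readable();	read won't block
 *		if (pfd.revents & POLLPRI)	handle_urgent();	urgent data pending
 *		if (pfd.revents & POLLHUP)	handle_hangup();	both directions shut down
 *	}
 */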
413 
414 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
415 {
416 	struct tcp_sock *tp = tcp_sk(sk);
417 	int answ;
418 
419 	switch (cmd) {
420 	case SIOCINQ:
421 		if (sk->sk_state == TCP_LISTEN)
422 			return -EINVAL;
423 
424 		lock_sock(sk);
425 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
426 			answ = 0;
427 		else if (sock_flag(sk, SOCK_URGINLINE) ||
428 			 !tp->urg_data ||
429 			 before(tp->urg_seq, tp->copied_seq) ||
430 			 !before(tp->urg_seq, tp->rcv_nxt)) {
431 			answ = tp->rcv_nxt - tp->copied_seq;
432 
433 			/* Subtract 1, if FIN is in queue. */
434 			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
435 				answ -=
436 		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
437 		} else
438 			answ = tp->urg_seq - tp->copied_seq;
439 		release_sock(sk);
440 		break;
441 	case SIOCATMARK:
442 		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
443 		break;
444 	case SIOCOUTQ:
445 		if (sk->sk_state == TCP_LISTEN)
446 			return -EINVAL;
447 
448 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
449 			answ = 0;
450 		else
451 			answ = tp->write_seq - tp->snd_una;
452 		break;
453 	default:
454 		return -ENOIOCTLCMD;
455 	}
456 
457 	return put_user(answ, (int __user *)arg);
458 }
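/*
 * User-space view of the two queue ioctls handled by tcp_ioctl() above
 * (illustrative; the names are the standard <linux/sockios.h> ones):
 *
 *	int inq, outq;
 *	ioctl(fd, SIOCINQ, &inq);	bytes readable right now (aka FIONREAD)
 *	ioctl(fd, SIOCOUTQ, &outq);	bytes written but not yet acked
 *					(write_seq - snd_una)
 */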
459 
460 
461 int tcp_listen_start(struct sock *sk)
462 {
463 	struct inet_sock *inet = inet_sk(sk);
464 	struct tcp_sock *tp = tcp_sk(sk);
465 	int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466 
467 	if (rc != 0)
468 		return rc;
469 
470 	sk->sk_max_ack_backlog = 0;
471 	sk->sk_ack_backlog = 0;
472 	tcp_delack_init(tp);
473 
474 	/* There is a race window here: we announce ourselves listening,
475 	 * but this transition is still not validated by get_port().
476 	 * It is OK, because this socket enters the hash table only
477 	 * after validation is complete.
478 	 */
479 	sk->sk_state = TCP_LISTEN;
480 	if (!sk->sk_prot->get_port(sk, inet->num)) {
481 		inet->sport = htons(inet->num);
482 
483 		sk_dst_reset(sk);
484 		sk->sk_prot->hash(sk);
485 
486 		return 0;
487 	}
488 
489 	sk->sk_state = TCP_CLOSE;
490 	reqsk_queue_destroy(&tp->accept_queue);
491 	return -EADDRINUSE;
492 }
493 
494 /*
495  *	This routine closes sockets which have been at least partially
496  *	opened, but not yet accepted.
497  */
498 
499 static void tcp_listen_stop (struct sock *sk)
500 {
501 	struct tcp_sock *tp = tcp_sk(sk);
502 	struct listen_sock *lopt;
503 	struct request_sock *acc_req;
504 	struct request_sock *req;
505 	int i;
506 
507 	tcp_delete_keepalive_timer(sk);
508 
509 	/* make all the listen_opt local to us */
510 	lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
511 	acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
512 
513 	if (lopt->qlen) {
514 		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
515 			while ((req = lopt->syn_table[i]) != NULL) {
516 				lopt->syn_table[i] = req->dl_next;
517 				lopt->qlen--;
518 				reqsk_free(req);
519 
520 		/* Following the specs, it would be better either to send a FIN
521 		 * (and enter FIN-WAIT-1, i.e. a normal close)
522 		 * or to send an active reset (abort).
523 		 * Certainly, that is pretty dangerous during a SYN flood, but
524 		 * that is a bad justification for our negligence 8)
525 		 * To be honest, we are not able to implement either
526 		 * of the variants right now.		--ANK
527 		 */
528 			}
529 		}
530 	}
531 	BUG_TRAP(!lopt->qlen);
532 
533 	kfree(lopt);
534 
535 	while ((req = acc_req) != NULL) {
536 		struct sock *child = req->sk;
537 
538 		acc_req = req->dl_next;
539 
540 		local_bh_disable();
541 		bh_lock_sock(child);
542 		BUG_TRAP(!sock_owned_by_user(child));
543 		sock_hold(child);
544 
545 		tcp_disconnect(child, O_NONBLOCK);
546 
547 		sock_orphan(child);
548 
549 		atomic_inc(&tcp_orphan_count);
550 
551 		tcp_destroy_sock(child);
552 
553 		bh_unlock_sock(child);
554 		local_bh_enable();
555 		sock_put(child);
556 
557 		sk_acceptq_removed(sk);
558 		__reqsk_free(req);
559 	}
560 	BUG_TRAP(!sk->sk_ack_backlog);
561 }
562 
563 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
564 {
565 	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
566 	tp->pushed_seq = tp->write_seq;
567 }
568 
569 static inline int forced_push(struct tcp_sock *tp)
570 {
571 	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
572 }
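/* In other words, forced_push() above triggers once more than half of the
 * largest window the peer has ever advertised (tp->max_window) has been
 * queued since the last pushed byte; e.g. roughly every 32KB of queued data
 * with a 64KB max_window.
 */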
573 
574 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
575 			      struct sk_buff *skb)
576 {
577 	skb->csum = 0;
578 	TCP_SKB_CB(skb)->seq = tp->write_seq;
579 	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
580 	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
581 	TCP_SKB_CB(skb)->sacked = 0;
582 	skb_header_release(skb);
583 	__skb_queue_tail(&sk->sk_write_queue, skb);
584 	sk_charge_skb(sk, skb);
585 	if (!sk->sk_send_head)
586 		sk->sk_send_head = skb;
587 	else if (tp->nonagle&TCP_NAGLE_PUSH)
588 		tp->nonagle &= ~TCP_NAGLE_PUSH;
589 }
590 
591 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
592 				struct sk_buff *skb)
593 {
594 	if (flags & MSG_OOB) {
595 		tp->urg_mode = 1;
596 		tp->snd_up = tp->write_seq;
597 		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
598 	}
599 }
600 
601 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
602 			    int mss_now, int nonagle)
603 {
604 	if (sk->sk_send_head) {
605 		struct sk_buff *skb = sk->sk_write_queue.prev;
606 		if (!(flags & MSG_MORE) || forced_push(tp))
607 			tcp_mark_push(tp, skb);
608 		tcp_mark_urg(tp, flags, skb);
609 		__tcp_push_pending_frames(sk, tp, mss_now,
610 					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
611 	}
612 }
613 
614 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
615 			 size_t psize, int flags)
616 {
617 	struct tcp_sock *tp = tcp_sk(sk);
618 	int mss_now;
619 	int err;
620 	ssize_t copied;
621 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
622 
623 	/* Wait for a connection to finish. */
624 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
625 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
626 			goto out_err;
627 
628 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629 
630 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631 	copied = 0;
632 
633 	err = -EPIPE;
634 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
635 		goto do_error;
636 
637 	while (psize > 0) {
638 		struct sk_buff *skb = sk->sk_write_queue.prev;
639 		struct page *page = pages[poffset / PAGE_SIZE];
640 		int copy, i, can_coalesce;
641 		int offset = poffset % PAGE_SIZE;
642 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
643 
644 		if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
645 new_segment:
646 			if (!sk_stream_memory_free(sk))
647 				goto wait_for_sndbuf;
648 
649 			skb = sk_stream_alloc_pskb(sk, 0, 0,
650 						   sk->sk_allocation);
651 			if (!skb)
652 				goto wait_for_memory;
653 
654 			skb_entail(sk, tp, skb);
655 			copy = mss_now;
656 		}
657 
658 		if (copy > size)
659 			copy = size;
660 
661 		i = skb_shinfo(skb)->nr_frags;
662 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
663 		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
664 			tcp_mark_push(tp, skb);
665 			goto new_segment;
666 		}
667 		if (sk->sk_forward_alloc < copy &&
668 		    !sk_stream_mem_schedule(sk, copy, 0))
669 			goto wait_for_memory;
670 
671 		if (can_coalesce) {
672 			skb_shinfo(skb)->frags[i - 1].size += copy;
673 		} else {
674 			get_page(page);
675 			skb_fill_page_desc(skb, i, page, offset, copy);
676 		}
677 
678 		skb->len += copy;
679 		skb->data_len += copy;
680 		skb->truesize += copy;
681 		sk->sk_wmem_queued += copy;
682 		sk->sk_forward_alloc -= copy;
683 		skb->ip_summed = CHECKSUM_HW;
684 		tp->write_seq += copy;
685 		TCP_SKB_CB(skb)->end_seq += copy;
686 		skb_shinfo(skb)->tso_segs = 0;
687 
688 		if (!copied)
689 			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
690 
691 		copied += copy;
692 		poffset += copy;
693 		if (!(psize -= copy))
694 			goto out;
695 
696 		if (skb->len != mss_now || (flags & MSG_OOB))
697 			continue;
698 
699 		if (forced_push(tp)) {
700 			tcp_mark_push(tp, skb);
701 			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
702 		} else if (skb == sk->sk_send_head)
703 			tcp_push_one(sk, mss_now);
704 		continue;
705 
706 wait_for_sndbuf:
707 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
708 wait_for_memory:
709 		if (copied)
710 			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
711 
712 		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
713 			goto do_error;
714 
715 		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
716 	}
717 
718 out:
719 	if (copied)
720 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
721 	return copied;
722 
723 do_error:
724 	if (copied)
725 		goto out;
726 out_err:
727 	return sk_stream_error(sk, flags, err);
728 }
729 
730 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
731 		     size_t size, int flags)
732 {
733 	ssize_t res;
734 	struct sock *sk = sock->sk;
735 
736 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
737 
738 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
739 	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
740 		return sock_no_sendpage(sock, page, offset, size, flags);
741 
742 #undef TCP_ZC_CSUM_FLAGS
743 
744 	lock_sock(sk);
745 	TCP_CHECK_TIMER(sk);
746 	res = do_tcp_sendpages(sk, &page, offset, size, flags);
747 	TCP_CHECK_TIMER(sk);
748 	release_sock(sk);
749 	return res;
750 }
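/*
 * Note: user space typically reaches tcp_sendpage()/do_tcp_sendpages() above
 * via sendfile(2) on a TCP socket; if the route's device lacks scatter-gather
 * or any checksum-offload capability we fall back to sock_no_sendpage(),
 * which copies the data instead of taking page references.  A hedged sketch
 * of the caller side:
 *
 *	off_t off = 0;
 *	sendfile(sock_fd, file_fd, &off, count);	zero-copy path, normally
 *							ends up in tcp_sendpage()
 */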
751 
752 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
753 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
754 
755 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756 {
757 	int tmp = tp->mss_cache_std;
758 
759 	if (sk->sk_route_caps & NETIF_F_SG) {
760 		int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
761 
762 		if (tmp >= pgbreak &&
763 		    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
764 			tmp = pgbreak;
765 	}
766 	return tmp;
767 }
768 
769 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
770 		size_t size)
771 {
772 	struct iovec *iov;
773 	struct tcp_sock *tp = tcp_sk(sk);
774 	struct sk_buff *skb;
775 	int iovlen, flags;
776 	int mss_now;
777 	int err, copied;
778 	long timeo;
779 
780 	lock_sock(sk);
781 	TCP_CHECK_TIMER(sk);
782 
783 	flags = msg->msg_flags;
784 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
785 
786 	/* Wait for a connection to finish. */
787 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
788 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
789 			goto out_err;
790 
791 	/* This should be in poll */
792 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
793 
794 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
795 
796 	/* Ok commence sending. */
797 	iovlen = msg->msg_iovlen;
798 	iov = msg->msg_iov;
799 	copied = 0;
800 
801 	err = -EPIPE;
802 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
803 		goto do_error;
804 
805 	while (--iovlen >= 0) {
806 		int seglen = iov->iov_len;
807 		unsigned char __user *from = iov->iov_base;
808 
809 		iov++;
810 
811 		while (seglen > 0) {
812 			int copy;
813 
814 			skb = sk->sk_write_queue.prev;
815 
816 			if (!sk->sk_send_head ||
817 			    (copy = mss_now - skb->len) <= 0) {
818 
819 new_segment:
820 				/* Allocate a new segment. If the interface is SG,
821 				 * allocate an skb that fits into a single page.
822 				 */
823 				if (!sk_stream_memory_free(sk))
824 					goto wait_for_sndbuf;
825 
826 				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
827 							   0, sk->sk_allocation);
828 				if (!skb)
829 					goto wait_for_memory;
830 
831 				/*
832 				 * Check whether we can use HW checksum.
833 				 */
834 				if (sk->sk_route_caps &
835 				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
836 				     NETIF_F_HW_CSUM))
837 					skb->ip_summed = CHECKSUM_HW;
838 
839 				skb_entail(sk, tp, skb);
840 				copy = mss_now;
841 			}
842 
843 			/* Try to append data to the end of skb. */
844 			if (copy > seglen)
845 				copy = seglen;
846 
847 			/* Where to copy to? */
848 			if (skb_tailroom(skb) > 0) {
849 				/* We have some space in skb head. Superb! */
850 				if (copy > skb_tailroom(skb))
851 					copy = skb_tailroom(skb);
852 				if ((err = skb_add_data(skb, from, copy)) != 0)
853 					goto do_fault;
854 			} else {
855 				int merge = 0;
856 				int i = skb_shinfo(skb)->nr_frags;
857 				struct page *page = TCP_PAGE(sk);
858 				int off = TCP_OFF(sk);
859 
860 				if (skb_can_coalesce(skb, i, page, off) &&
861 				    off != PAGE_SIZE) {
862 					/* We can extend the last page
863 					 * fragment. */
864 					merge = 1;
865 				} else if (i == MAX_SKB_FRAGS ||
866 					   (!i &&
867 					   !(sk->sk_route_caps & NETIF_F_SG))) {
868 					/* Need to add new fragment and cannot
869 					 * do this because interface is non-SG,
870 					 * or because all the page slots are
871 					 * busy. */
872 					tcp_mark_push(tp, skb);
873 					goto new_segment;
874 				} else if (page) {
875 					/* If page is cached, align
876 					 * offset to L1 cache boundary
877 					 */
878 					off = (off + L1_CACHE_BYTES - 1) &
879 					      ~(L1_CACHE_BYTES - 1);
880 					if (off == PAGE_SIZE) {
881 						put_page(page);
882 						TCP_PAGE(sk) = page = NULL;
883 					}
884 				}
885 
886 				if (!page) {
887 					/* Allocate new cache page. */
888 					if (!(page = sk_stream_alloc_page(sk)))
889 						goto wait_for_memory;
890 					off = 0;
891 				}
892 
893 				if (copy > PAGE_SIZE - off)
894 					copy = PAGE_SIZE - off;
895 
896 				/* Time to copy data. We are close to
897 				 * the end! */
898 				err = skb_copy_to_page(sk, from, skb, page,
899 						       off, copy);
900 				if (err) {
901 					/* If this page was new, give it to the
902 					 * socket so it does not get leaked.
903 					 */
904 					if (!TCP_PAGE(sk)) {
905 						TCP_PAGE(sk) = page;
906 						TCP_OFF(sk) = 0;
907 					}
908 					goto do_error;
909 				}
910 
911 				/* Update the skb. */
912 				if (merge) {
913 					skb_shinfo(skb)->frags[i - 1].size +=
914 									copy;
915 				} else {
916 					skb_fill_page_desc(skb, i, page, off, copy);
917 					if (TCP_PAGE(sk)) {
918 						get_page(page);
919 					} else if (off + copy < PAGE_SIZE) {
920 						get_page(page);
921 						TCP_PAGE(sk) = page;
922 					}
923 				}
924 
925 				TCP_OFF(sk) = off + copy;
926 			}
927 
928 			if (!copied)
929 				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
930 
931 			tp->write_seq += copy;
932 			TCP_SKB_CB(skb)->end_seq += copy;
933 			skb_shinfo(skb)->tso_segs = 0;
934 
935 			from += copy;
936 			copied += copy;
937 			if ((seglen -= copy) == 0 && iovlen == 0)
938 				goto out;
939 
940 			if (skb->len != mss_now || (flags & MSG_OOB))
941 				continue;
942 
943 			if (forced_push(tp)) {
944 				tcp_mark_push(tp, skb);
945 				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
946 			} else if (skb == sk->sk_send_head)
947 				tcp_push_one(sk, mss_now);
948 			continue;
949 
950 wait_for_sndbuf:
951 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
952 wait_for_memory:
953 			if (copied)
954 				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
955 
956 			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
957 				goto do_error;
958 
959 			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
960 		}
961 	}
962 
963 out:
964 	if (copied)
965 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
966 	TCP_CHECK_TIMER(sk);
967 	release_sock(sk);
968 	return copied;
969 
970 do_fault:
971 	if (!skb->len) {
972 		if (sk->sk_send_head == skb)
973 			sk->sk_send_head = NULL;
974 		__skb_unlink(skb, skb->list);
975 		sk_stream_free_skb(sk, skb);
976 	}
977 
978 do_error:
979 	if (copied)
980 		goto out;
981 out_err:
982 	err = sk_stream_error(sk, flags, err);
983 	TCP_CHECK_TIMER(sk);
984 	release_sock(sk);
985 	return err;
986 }
987 
988 /*
989  *	Handle reading urgent data. BSD has very simple semantics for
990  *	this, no blocking and very strange errors 8)
991  */
992 
993 static int tcp_recv_urg(struct sock *sk, long timeo,
994 			struct msghdr *msg, int len, int flags,
995 			int *addr_len)
996 {
997 	struct tcp_sock *tp = tcp_sk(sk);
998 
999 	/* No URG data to read. */
1000 	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1001 	    tp->urg_data == TCP_URG_READ)
1002 		return -EINVAL;	/* Yes this is right ! */
1003 
1004 	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1005 		return -ENOTCONN;
1006 
1007 	if (tp->urg_data & TCP_URG_VALID) {
1008 		int err = 0;
1009 		char c = tp->urg_data;
1010 
1011 		if (!(flags & MSG_PEEK))
1012 			tp->urg_data = TCP_URG_READ;
1013 
1014 		/* Read urgent data. */
1015 		msg->msg_flags |= MSG_OOB;
1016 
1017 		if (len > 0) {
1018 			if (!(flags & MSG_TRUNC))
1019 				err = memcpy_toiovec(msg->msg_iov, &c, 1);
1020 			len = 1;
1021 		} else
1022 			msg->msg_flags |= MSG_TRUNC;
1023 
1024 		return err ? -EFAULT : len;
1025 	}
1026 
1027 	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1028 		return 0;
1029 
1030 	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1031 	 * the available implementations agree in this case:
1032 	 * this call should never block, independent of the
1033 	 * blocking state of the socket.
1034 	 * Mike <pall@rz.uni-karlsruhe.de>
1035 	 */
1036 	return -EAGAIN;
1037 }
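/*
 * For reference, the user-space counterpart of tcp_recv_urg() above
 * (illustrative): a single urgent byte is fetched with
 *
 *	char oob;
 *	int n = recv(fd, &oob, 1, MSG_OOB);
 *
 * which never blocks; it returns -1 with errno EAGAIN when no urgent byte is
 * pending, and errno EINVAL when the byte was already read or SO_OOBINLINE
 * is set.
 */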
1038 
1039 /* Clean up the receive buffer for full frames taken by the user,
1040  * then send an ACK if necessary.  COPIED is the number of bytes
1041  * tcp_recvmsg has given to the user so far, it speeds up the
1042  * calculation of whether or not we must ACK for the sake of
1043  * a window update.
1044  */
1045 static void cleanup_rbuf(struct sock *sk, int copied)
1046 {
1047 	struct tcp_sock *tp = tcp_sk(sk);
1048 	int time_to_ack = 0;
1049 
1050 #if TCP_DEBUG
1051 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1052 
1053 	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1054 #endif
1055 
1056 	if (tcp_ack_scheduled(tp)) {
1057 		   /* Delayed ACKs frequently hit locked sockets during bulk
1058 		    * receive. */
1059 		if (tp->ack.blocked ||
1060 		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1061 		    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1062 		    /*
1063 		     * If this read emptied the read buffer, we send an ACK when
1064 		     * the connection is not bidirectional, the user has drained
1065 		     * the receive buffer and there was a small segment
1066 		     * in the queue.
1067 		     */
1068 		    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1069 		     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1070 			time_to_ack = 1;
1071 	}
1072 
1073 	/* We send an ACK if we can now advertise a non-zero window
1074 	 * which has been raised "significantly".
1075 	 *
1076 	 * Even if the window was raised to infinity, do not send a window-open
1077 	 * ACK in states where we will not receive more data. It would be useless.
1078 	 */
1079 	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1080 		__u32 rcv_window_now = tcp_receive_window(tp);
1081 
1082 		/* Optimize, __tcp_select_window() is not cheap. */
1083 		if (2*rcv_window_now <= tp->window_clamp) {
1084 			__u32 new_window = __tcp_select_window(sk);
1085 
1086 			/* Send an ACK now if this read freed lots of space
1087 			 * in our buffer. new_window is, of course, the new window;
1088 			 * we can advertise it now if it is not less than the current one.
1089 			 * "Lots" means "at least twice" here.
1090 			 */
1091 			if (new_window && new_window >= 2 * rcv_window_now)
1092 				time_to_ack = 1;
1093 		}
1094 	}
1095 	if (time_to_ack)
1096 		tcp_send_ack(sk);
1097 }
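/*
 * Worked example for the window heuristic in cleanup_rbuf() above
 * (illustrative numbers): with tp->window_clamp = 64K and the currently
 * advertised window shrunk to 16K, 2*16K <= 64K so __tcp_select_window() is
 * consulted; if the read just freed enough space that it can offer >= 32K
 * (at least twice the current window), an immediate window-update ACK is
 * sent instead of waiting for the delayed-ACK machinery.
 */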
1098 
1099 static void tcp_prequeue_process(struct sock *sk)
1100 {
1101 	struct sk_buff *skb;
1102 	struct tcp_sock *tp = tcp_sk(sk);
1103 
1104 	NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1105 
1106 	/* The RX process wants to run with BHs disabled, though it is not
1107 	 * strictly necessary */
1108 	local_bh_disable();
1109 	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1110 		sk->sk_backlog_rcv(sk, skb);
1111 	local_bh_enable();
1112 
1113 	/* Clear memory counter. */
1114 	tp->ucopy.memory = 0;
1115 }
1116 
1117 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1118 {
1119 	struct sk_buff *skb;
1120 	u32 offset;
1121 
1122 	skb_queue_walk(&sk->sk_receive_queue, skb) {
1123 		offset = seq - TCP_SKB_CB(skb)->seq;
1124 		if (skb->h.th->syn)
1125 			offset--;
1126 		if (offset < skb->len || skb->h.th->fin) {
1127 			*off = offset;
1128 			return skb;
1129 		}
1130 	}
1131 	return NULL;
1132 }
1133 
1134 /*
1135  * This routine provides an alternative to tcp_recvmsg() for routines
1136  * that would like to handle copying from skbuffs directly in 'sendfile'
1137  * fashion.
1138  * Note:
1139  *	- It is assumed that the socket was locked by the caller.
1140  *	- The routine does not block.
1141  *	- At present, there is no support for reading OOB data
1142  *	  or for 'peeking' the socket using this routine
1143  *	  (although both would be easy to implement).
1144  */
1145 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1146 		  sk_read_actor_t recv_actor)
1147 {
1148 	struct sk_buff *skb;
1149 	struct tcp_sock *tp = tcp_sk(sk);
1150 	u32 seq = tp->copied_seq;
1151 	u32 offset;
1152 	int copied = 0;
1153 
1154 	if (sk->sk_state == TCP_LISTEN)
1155 		return -ENOTCONN;
1156 	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1157 		if (offset < skb->len) {
1158 			size_t used, len;
1159 
1160 			len = skb->len - offset;
1161 			/* Stop reading if we hit a patch of urgent data */
1162 			if (tp->urg_data) {
1163 				u32 urg_offset = tp->urg_seq - seq;
1164 				if (urg_offset < len)
1165 					len = urg_offset;
1166 				if (!len)
1167 					break;
1168 			}
1169 			used = recv_actor(desc, skb, offset, len);
1170 			if (used <= len) {
1171 				seq += used;
1172 				copied += used;
1173 				offset += used;
1174 			}
1175 			if (offset != skb->len)
1176 				break;
1177 		}
1178 		if (skb->h.th->fin) {
1179 			sk_eat_skb(sk, skb);
1180 			++seq;
1181 			break;
1182 		}
1183 		sk_eat_skb(sk, skb);
1184 		if (!desc->count)
1185 			break;
1186 	}
1187 	tp->copied_seq = seq;
1188 
1189 	tcp_rcv_space_adjust(sk);
1190 
1191 	/* Clean up data we have read: This will do ACK frames. */
1192 	if (copied)
1193 		cleanup_rbuf(sk, copied);
1194 	return copied;
1195 }
1196 
1197 /*
1198  *	This routine copies from a sock struct into the user buffer.
1199  *
1200  *	Technical note: in 2.3 we work on a _locked_ socket, so that
1201  *	tricks with *seq access order and skb->users are not required.
1202  *	Probably the code can easily be improved even more.
1203  */
1204 
1205 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1206 		size_t len, int nonblock, int flags, int *addr_len)
1207 {
1208 	struct tcp_sock *tp = tcp_sk(sk);
1209 	int copied = 0;
1210 	u32 peek_seq;
1211 	u32 *seq;
1212 	unsigned long used;
1213 	int err;
1214 	int target;		/* Read at least this many bytes */
1215 	long timeo;
1216 	struct task_struct *user_recv = NULL;
1217 
1218 	lock_sock(sk);
1219 
1220 	TCP_CHECK_TIMER(sk);
1221 
1222 	err = -ENOTCONN;
1223 	if (sk->sk_state == TCP_LISTEN)
1224 		goto out;
1225 
1226 	timeo = sock_rcvtimeo(sk, nonblock);
1227 
1228 	/* Urgent data needs to be handled specially. */
1229 	if (flags & MSG_OOB)
1230 		goto recv_urg;
1231 
1232 	seq = &tp->copied_seq;
1233 	if (flags & MSG_PEEK) {
1234 		peek_seq = tp->copied_seq;
1235 		seq = &peek_seq;
1236 	}
1237 
1238 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1239 
1240 	do {
1241 		struct sk_buff *skb;
1242 		u32 offset;
1243 
1244 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1245 		if (tp->urg_data && tp->urg_seq == *seq) {
1246 			if (copied)
1247 				break;
1248 			if (signal_pending(current)) {
1249 				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1250 				break;
1251 			}
1252 		}
1253 
1254 		/* Next get a buffer. */
1255 
1256 		skb = skb_peek(&sk->sk_receive_queue);
1257 		do {
1258 			if (!skb)
1259 				break;
1260 
1261 			/* Now that we have two receive queues this
1262 			 * shouldn't happen.
1263 			 */
1264 			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1265 				printk(KERN_INFO "recvmsg bug: copied %X "
1266 				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1267 				break;
1268 			}
1269 			offset = *seq - TCP_SKB_CB(skb)->seq;
1270 			if (skb->h.th->syn)
1271 				offset--;
1272 			if (offset < skb->len)
1273 				goto found_ok_skb;
1274 			if (skb->h.th->fin)
1275 				goto found_fin_ok;
1276 			BUG_TRAP(flags & MSG_PEEK);
1277 			skb = skb->next;
1278 		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1279 
1280 		/* Well, if we have a backlog, try to process it now. */
1281 
1282 		if (copied >= target && !sk->sk_backlog.tail)
1283 			break;
1284 
1285 		if (copied) {
1286 			if (sk->sk_err ||
1287 			    sk->sk_state == TCP_CLOSE ||
1288 			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1289 			    !timeo ||
1290 			    signal_pending(current) ||
1291 			    (flags & MSG_PEEK))
1292 				break;
1293 		} else {
1294 			if (sock_flag(sk, SOCK_DONE))
1295 				break;
1296 
1297 			if (sk->sk_err) {
1298 				copied = sock_error(sk);
1299 				break;
1300 			}
1301 
1302 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1303 				break;
1304 
1305 			if (sk->sk_state == TCP_CLOSE) {
1306 				if (!sock_flag(sk, SOCK_DONE)) {
1307 					/* This occurs when the user tries to read
1308 					 * from a never-connected socket.
1309 					 */
1310 					copied = -ENOTCONN;
1311 					break;
1312 				}
1313 				break;
1314 			}
1315 
1316 			if (!timeo) {
1317 				copied = -EAGAIN;
1318 				break;
1319 			}
1320 
1321 			if (signal_pending(current)) {
1322 				copied = sock_intr_errno(timeo);
1323 				break;
1324 			}
1325 		}
1326 
1327 		cleanup_rbuf(sk, copied);
1328 
1329 		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1330 			/* Install new reader */
1331 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1332 				user_recv = current;
1333 				tp->ucopy.task = user_recv;
1334 				tp->ucopy.iov = msg->msg_iov;
1335 			}
1336 
1337 			tp->ucopy.len = len;
1338 
1339 			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1340 				 (flags & (MSG_PEEK | MSG_TRUNC)));
1341 
1342 			/* Ugly... If the prequeue is not empty, we have to
1343 			 * process it before releasing the socket, otherwise
1344 			 * the order will be broken at the second iteration.
1345 			 * A more elegant solution is required!!!
1346 			 *
1347 			 * Look: we have the following (pseudo)queues:
1348 			 *
1349 			 * 1. packets in flight
1350 			 * 2. backlog
1351 			 * 3. prequeue
1352 			 * 4. receive_queue
1353 			 *
1354 			 * Each queue can be processed only if the next ones
1355 			 * are empty. At this point we have an empty receive_queue.
1356 			 * But the prequeue _can_ be non-empty after the 2nd iteration,
1357 			 * when we jumped to the start of the loop because backlog
1358 			 * processing added something to the receive_queue.
1359 			 * We cannot release_sock(), because the backlog contains
1360 			 * packets that arrived _after_ the prequeued ones.
1361 			 *
1362 			 * In short, the algorithm is clear --- process all
1363 			 * the queues in order. We could do it more directly,
1364 			 * requeueing packets from the backlog to the prequeue if it
1365 			 * is not empty. That would be more elegant, but it eats
1366 			 * cycles, unfortunately.
1367 			 */
1368 			if (skb_queue_len(&tp->ucopy.prequeue))
1369 				goto do_prequeue;
1370 
1371 			/* __ Set realtime policy in scheduler __ */
1372 		}
1373 
1374 		if (copied >= target) {
1375 			/* Do not sleep, just process backlog. */
1376 			release_sock(sk);
1377 			lock_sock(sk);
1378 		} else
1379 			sk_wait_data(sk, &timeo);
1380 
1381 		if (user_recv) {
1382 			int chunk;
1383 
1384 			/* __ Restore normal policy in scheduler __ */
1385 
1386 			if ((chunk = len - tp->ucopy.len) != 0) {
1387 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1388 				len -= chunk;
1389 				copied += chunk;
1390 			}
1391 
1392 			if (tp->rcv_nxt == tp->copied_seq &&
1393 			    skb_queue_len(&tp->ucopy.prequeue)) {
1394 do_prequeue:
1395 				tcp_prequeue_process(sk);
1396 
1397 				if ((chunk = len - tp->ucopy.len) != 0) {
1398 					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1399 					len -= chunk;
1400 					copied += chunk;
1401 				}
1402 			}
1403 		}
1404 		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1405 			if (net_ratelimit())
1406 				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1407 				       current->comm, current->pid);
1408 			peek_seq = tp->copied_seq;
1409 		}
1410 		continue;
1411 
1412 	found_ok_skb:
1413 		/* Ok so how much can we use? */
1414 		used = skb->len - offset;
1415 		if (len < used)
1416 			used = len;
1417 
1418 		/* Do we have urgent data here? */
1419 		if (tp->urg_data) {
1420 			u32 urg_offset = tp->urg_seq - *seq;
1421 			if (urg_offset < used) {
1422 				if (!urg_offset) {
1423 					if (!sock_flag(sk, SOCK_URGINLINE)) {
1424 						++*seq;
1425 						offset++;
1426 						used--;
1427 						if (!used)
1428 							goto skip_copy;
1429 					}
1430 				} else
1431 					used = urg_offset;
1432 			}
1433 		}
1434 
1435 		if (!(flags & MSG_TRUNC)) {
1436 			err = skb_copy_datagram_iovec(skb, offset,
1437 						      msg->msg_iov, used);
1438 			if (err) {
1439 				/* Exception. Bailout! */
1440 				if (!copied)
1441 					copied = -EFAULT;
1442 				break;
1443 			}
1444 		}
1445 
1446 		*seq += used;
1447 		copied += used;
1448 		len -= used;
1449 
1450 		tcp_rcv_space_adjust(sk);
1451 
1452 skip_copy:
1453 		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1454 			tp->urg_data = 0;
1455 			tcp_fast_path_check(sk, tp);
1456 		}
1457 		if (used + offset < skb->len)
1458 			continue;
1459 
1460 		if (skb->h.th->fin)
1461 			goto found_fin_ok;
1462 		if (!(flags & MSG_PEEK))
1463 			sk_eat_skb(sk, skb);
1464 		continue;
1465 
1466 	found_fin_ok:
1467 		/* Process the FIN. */
1468 		++*seq;
1469 		if (!(flags & MSG_PEEK))
1470 			sk_eat_skb(sk, skb);
1471 		break;
1472 	} while (len > 0);
1473 
1474 	if (user_recv) {
1475 		if (skb_queue_len(&tp->ucopy.prequeue)) {
1476 			int chunk;
1477 
1478 			tp->ucopy.len = copied > 0 ? len : 0;
1479 
1480 			tcp_prequeue_process(sk);
1481 
1482 			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1483 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1484 				len -= chunk;
1485 				copied += chunk;
1486 			}
1487 		}
1488 
1489 		tp->ucopy.task = NULL;
1490 		tp->ucopy.len = 0;
1491 	}
1492 
1493 	/* According to UNIX98, msg_name/msg_namelen are ignored
1494 	 * on a connected socket. I was just happy when I found this 8) --ANK
1495 	 */
1496 
1497 	/* Clean up data we have read: This will do ACK frames. */
1498 	cleanup_rbuf(sk, copied);
1499 
1500 	TCP_CHECK_TIMER(sk);
1501 	release_sock(sk);
1502 	return copied;
1503 
1504 out:
1505 	TCP_CHECK_TIMER(sk);
1506 	release_sock(sk);
1507 	return err;
1508 
1509 recv_urg:
1510 	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1511 	goto out;
1512 }
1513 
1514 /*
1515  *	State processing on a close. This implements the state shift for
1516  *	sending our FIN frame. Note that we only send a FIN for some
1517  *	states. A shutdown() may have already sent the FIN, or we may be
1518  *	closed.
1519  */
1520 
1521 static unsigned char new_state[16] = {
1522   /* current state:        new state:      action:	*/
1523   /* (Invalid)		*/ TCP_CLOSE,
1524   /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1525   /* TCP_SYN_SENT	*/ TCP_CLOSE,
1526   /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1527   /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1528   /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1529   /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1530   /* TCP_CLOSE		*/ TCP_CLOSE,
1531   /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1532   /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1533   /* TCP_LISTEN		*/ TCP_CLOSE,
1534   /* TCP_CLOSING	*/ TCP_CLOSING,
1535 };
1536 
1537 static int tcp_close_state(struct sock *sk)
1538 {
1539 	int next = (int)new_state[sk->sk_state];
1540 	int ns = next & TCP_STATE_MASK;
1541 
1542 	tcp_set_state(sk, ns);
1543 
1544 	return next & TCP_ACTION_FIN;
1545 }
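/*
 * Example of the table lookup above: tcp_close_state() on an ESTABLISHED
 * socket reads TCP_FIN_WAIT1 | TCP_ACTION_FIN, so the socket moves to
 * FIN_WAIT1 and the non-zero return tells the caller (tcp_close() or
 * tcp_shutdown()) to queue a FIN; on a socket already in FIN_WAIT1 the
 * entry is just TCP_FIN_WAIT1, so no second FIN is sent.
 */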
1546 
1547 /*
1548  *	Shutdown the sending side of a connection. Much like close except
1549  *	that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1550  */
1551 
1552 void tcp_shutdown(struct sock *sk, int how)
1553 {
1554 	/*	We need to grab some memory, and put together a FIN,
1555 	 *	and then put it into the queue to be sent.
1556 	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1557 	 */
1558 	if (!(how & SEND_SHUTDOWN))
1559 		return;
1560 
1561 	/* If we've already sent a FIN, or it's a closed state, skip this. */
1562 	if ((1 << sk->sk_state) &
1563 	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1564 	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1565 		/* Clear out any half completed packets.  FIN if needed. */
1566 		if (tcp_close_state(sk))
1567 			tcp_send_fin(sk);
1568 	}
1569 }
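/*
 * tcp_shutdown() is normally entered from user space via shutdown(fd, SHUT_WR)
 * or SHUT_RDWR: inet_shutdown() translates the SHUT_* value into the
 * SEND_SHUTDOWN/RCV_SHUTDOWN bit mask checked at the top of this function
 * (a summary of the call chain; see af_inet.c).
 */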
1570 
1571 /*
1572  * At this point, there should be no process reference to this
1573  * socket, and thus no user references at all.  Therefore we
1574  * can assume the socket waitqueue is inactive and nobody will
1575  * try to jump onto it.
1576  */
1577 void tcp_destroy_sock(struct sock *sk)
1578 {
1579 	BUG_TRAP(sk->sk_state == TCP_CLOSE);
1580 	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1581 
1582 	/* It cannot be in hash table! */
1583 	BUG_TRAP(sk_unhashed(sk));
1584 
1585 	/* If inet_sk(sk)->num is non-zero, it must be bound */
1586 	BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1587 
1588 	sk->sk_prot->destroy(sk);
1589 
1590 	sk_stream_kill_queues(sk);
1591 
1592 	xfrm_sk_free_policy(sk);
1593 
1594 #ifdef INET_REFCNT_DEBUG
1595 	if (atomic_read(&sk->sk_refcnt) != 1) {
1596 		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1597 		       sk, atomic_read(&sk->sk_refcnt));
1598 	}
1599 #endif
1600 
1601 	atomic_dec(&tcp_orphan_count);
1602 	sock_put(sk);
1603 }
1604 
1605 void tcp_close(struct sock *sk, long timeout)
1606 {
1607 	struct sk_buff *skb;
1608 	int data_was_unread = 0;
1609 
1610 	lock_sock(sk);
1611 	sk->sk_shutdown = SHUTDOWN_MASK;
1612 
1613 	if (sk->sk_state == TCP_LISTEN) {
1614 		tcp_set_state(sk, TCP_CLOSE);
1615 
1616 		/* Special case. */
1617 		tcp_listen_stop(sk);
1618 
1619 		goto adjudge_to_death;
1620 	}
1621 
1622 	/*  We need to flush the recv. buffs.  We do this only on the
1623 	 *  descriptor close, not protocol-sourced closes, because the
1624 	 *  reader process may not have drained the data yet!
1625 	 */
1626 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1627 		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1628 			  skb->h.th->fin;
1629 		data_was_unread += len;
1630 		__kfree_skb(skb);
1631 	}
1632 
1633 	sk_stream_mem_reclaim(sk);
1634 
1635 	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1636 	 * 3.10, we send a RST here because data was lost.  To
1637 	 * witness the awful effects of the old behavior of always
1638 	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1639 	 * a bulk GET in an FTP client, suspend the process, wait
1640 	 * for the client to advertise a zero window, then kill -9
1641 	 * the FTP client, wheee...  Note: timeout is always zero
1642 	 * in such a case.
1643 	 */
1644 	if (data_was_unread) {
1645 		/* Unread data was tossed, zap the connection. */
1646 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1647 		tcp_set_state(sk, TCP_CLOSE);
1648 		tcp_send_active_reset(sk, GFP_KERNEL);
1649 	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1650 		/* Check zero linger _after_ checking for unread data. */
1651 		sk->sk_prot->disconnect(sk, 0);
1652 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1653 	} else if (tcp_close_state(sk)) {
1654 		/* We FIN if the application ate all the data before
1655 		 * zapping the connection.
1656 		 */
1657 
1658 		/* RED-PEN. Formally speaking, we have broken the TCP state
1659 		 * machine. State transitions:
1660 		 *
1661 		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1662 		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1663 		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1664 		 *
1665 		 * are legal only when the FIN has been sent (i.e. in window),
1666 		 * rather than queued out of window. Purists would blame us.
1667 		 *
1668 		 * E.g. the "RFC state" is ESTABLISHED
1669 		 * if the Linux state is FIN-WAIT-1, but the FIN has still not been sent.
1670 		 *
1671 		 * The visible deviations are that sometimes
1672 		 * we enter the time-wait state when it is not really required
1673 		 * (harmless), and do not send active resets when they are
1674 		 * required by the specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when
1675 		 * they look like CLOSING or LAST_ACK to Linux).
1676 		 * Probably I missed some more holelets.
1677 		 * 						--ANK
1678 		 */
1679 		tcp_send_fin(sk);
1680 	}
1681 
1682 	sk_stream_wait_close(sk, timeout);
1683 
1684 adjudge_to_death:
1685 	/* It is the last release_sock in its life. It will remove backlog. */
1686 	release_sock(sk);
1687 
1688 
1689 	/* Now the socket is owned by the kernel and we acquire the BH lock
1690 	   to finish the close. No need to check for user refs.
1691 	 */
1692 	local_bh_disable();
1693 	bh_lock_sock(sk);
1694 	BUG_TRAP(!sock_owned_by_user(sk));
1695 
1696 	sock_hold(sk);
1697 	sock_orphan(sk);
1698 
1699 	/*	This is a (useful) BSD violation of the RFC. There is a
1700 	 *	problem with TCP as specified, in that the other end could
1701 	 *	keep a socket open forever with no application left at this end.
1702 	 *	We use a 3 minute timeout (about the same as BSD) and then kill
1703 	 *	our end. If they send after that then tough - BUT: it is long
1704 	 *	enough that we won't repeat the old "4*rto = almost no time -
1705 	 *	whoops" reset mistake.
1706 	 *
1707 	 *	Nope, it was not a mistake. It is really the desired behaviour,
1708 	 *	e.g. on HTTP servers, where such sockets are useless but
1709 	 *	consume significant resources. Let's do it with the special
1710 	 *	linger2	option.					--ANK
1711 	 */
1712 
1713 	if (sk->sk_state == TCP_FIN_WAIT2) {
1714 		struct tcp_sock *tp = tcp_sk(sk);
1715 		if (tp->linger2 < 0) {
1716 			tcp_set_state(sk, TCP_CLOSE);
1717 			tcp_send_active_reset(sk, GFP_ATOMIC);
1718 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1719 		} else {
1720 			int tmo = tcp_fin_time(tp);
1721 
1722 			if (tmo > TCP_TIMEWAIT_LEN) {
1723 				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1724 			} else {
1725 				atomic_inc(&tcp_orphan_count);
1726 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1727 				goto out;
1728 			}
1729 		}
1730 	}
1731 	if (sk->sk_state != TCP_CLOSE) {
1732 		sk_stream_mem_reclaim(sk);
1733 		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1734 		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1735 		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1736 			if (net_ratelimit())
1737 				printk(KERN_INFO "TCP: too many orphaned "
1738 				       "sockets\n");
1739 			tcp_set_state(sk, TCP_CLOSE);
1740 			tcp_send_active_reset(sk, GFP_ATOMIC);
1741 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1742 		}
1743 	}
1744 	atomic_inc(&tcp_orphan_count);
1745 
1746 	if (sk->sk_state == TCP_CLOSE)
1747 		tcp_destroy_sock(sk);
1748 	/* Otherwise, socket is reprieved until protocol close. */
1749 
1750 out:
1751 	bh_unlock_sock(sk);
1752 	local_bh_enable();
1753 	sock_put(sk);
1754 }
1755 
1756 /* These states need RST on ABORT according to RFC793 */
1757 
1758 static inline int tcp_need_reset(int state)
1759 {
1760 	return (1 << state) &
1761 	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1762 		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1763 }
1764 
1765 int tcp_disconnect(struct sock *sk, int flags)
1766 {
1767 	struct inet_sock *inet = inet_sk(sk);
1768 	struct tcp_sock *tp = tcp_sk(sk);
1769 	int err = 0;
1770 	int old_state = sk->sk_state;
1771 
1772 	if (old_state != TCP_CLOSE)
1773 		tcp_set_state(sk, TCP_CLOSE);
1774 
1775 	/* ABORT function of RFC793 */
1776 	if (old_state == TCP_LISTEN) {
1777 		tcp_listen_stop(sk);
1778 	} else if (tcp_need_reset(old_state) ||
1779 		   (tp->snd_nxt != tp->write_seq &&
1780 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1781 		/* The last check adjusts for the discrepancy between Linux
1782 		 * and the RFC with respect to these states.
1783 		 */
1784 		tcp_send_active_reset(sk, gfp_any());
1785 		sk->sk_err = ECONNRESET;
1786 	} else if (old_state == TCP_SYN_SENT)
1787 		sk->sk_err = ECONNRESET;
1788 
1789 	tcp_clear_xmit_timers(sk);
1790 	__skb_queue_purge(&sk->sk_receive_queue);
1791 	sk_stream_writequeue_purge(sk);
1792 	__skb_queue_purge(&tp->out_of_order_queue);
1793 
1794 	inet->dport = 0;
1795 
1796 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1797 		inet_reset_saddr(sk);
1798 
1799 	sk->sk_shutdown = 0;
1800 	sock_reset_flag(sk, SOCK_DONE);
1801 	tp->srtt = 0;
1802 	if ((tp->write_seq += tp->max_window + 2) == 0)
1803 		tp->write_seq = 1;
1804 	tp->backoff = 0;
1805 	tp->snd_cwnd = 2;
1806 	tp->probes_out = 0;
1807 	tp->packets_out = 0;
1808 	tp->snd_ssthresh = 0x7fffffff;
1809 	tp->snd_cwnd_cnt = 0;
1810 	tcp_set_ca_state(tp, TCP_CA_Open);
1811 	tcp_clear_retrans(tp);
1812 	tcp_delack_init(tp);
1813 	sk->sk_send_head = NULL;
1814 	tp->rx_opt.saw_tstamp = 0;
1815 	tcp_sack_reset(&tp->rx_opt);
1816 	__sk_dst_reset(sk);
1817 
1818 	BUG_TRAP(!inet->num || tp->bind_hash);
1819 
1820 	sk->sk_error_report(sk);
1821 	return err;
1822 }
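/*
 * Illustrative sketch (standalone userspace code, not part of this file):
 * tcp_disconnect() is typically reached when an application calls connect()
 * on a TCP socket with an address whose family is AF_UNSPEC, dissolving the
 * association.  The helper name is hypothetical; error handling is minimal.
 */
#include <string.h>
#include <sys/socket.h>

static int example_tcp_disconnect(int fd)
{
	struct sockaddr sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_UNSPEC;		/* request disconnect */
	return connect(fd, &sa, sizeof(sa));	/* 0 on success, -1 + errno */
}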
1823 
1824 /*
1825  *	Wait for an incoming connection, avoid race
1826  *	conditions. This must be called with the socket locked.
1827  */
1828 static int wait_for_connect(struct sock *sk, long timeo)
1829 {
1830 	struct tcp_sock *tp = tcp_sk(sk);
1831 	DEFINE_WAIT(wait);
1832 	int err;
1833 
1834 	/*
1835 	 * True wake-one mechanism for incoming connections: only
1836 	 * one process gets woken up, not the 'whole herd'.
1837 	 * Since we do not 'race & poll' for established sockets
1838 	 * anymore, the common case will execute the loop only once.
1839 	 *
1840 	 * Subtle issue: "add_wait_queue_exclusive()" will be added
1841 	 * after any current non-exclusive waiters, and we know that
1842 	 * it will always _stay_ after any new non-exclusive waiters
1843 	 * because all non-exclusive waiters are added at the
1844 	 * beginning of the wait-queue. As such, it's ok to "drop"
1845 	 * our exclusiveness temporarily when we get woken up without
1846 	 * having to remove and re-insert us on the wait queue.
1847 	 */
1848 	for (;;) {
1849 		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1850 					  TASK_INTERRUPTIBLE);
1851 		release_sock(sk);
1852 		if (reqsk_queue_empty(&tp->accept_queue))
1853 			timeo = schedule_timeout(timeo);
1854 		lock_sock(sk);
1855 		err = 0;
1856 		if (!reqsk_queue_empty(&tp->accept_queue))
1857 			break;
1858 		err = -EINVAL;
1859 		if (sk->sk_state != TCP_LISTEN)
1860 			break;
1861 		err = sock_intr_errno(timeo);
1862 		if (signal_pending(current))
1863 			break;
1864 		err = -EAGAIN;
1865 		if (!timeo)
1866 			break;
1867 	}
1868 	finish_wait(sk->sk_sleep, &wait);
1869 	return err;
1870 }
1871 
1872 /*
1873  *	This will accept the next outstanding connection.
1874  */
1875 
1876 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1877 {
1878 	struct tcp_sock *tp = tcp_sk(sk);
1879 	struct sock *newsk;
1880 	int error;
1881 
1882 	lock_sock(sk);
1883 
1884 	/* We need to make sure that this socket is listening,
1885 	 * and that it has something pending.
1886 	 */
1887 	error = -EINVAL;
1888 	if (sk->sk_state != TCP_LISTEN)
1889 		goto out_err;
1890 
1891 	/* Find already established connection */
1892 	if (reqsk_queue_empty(&tp->accept_queue)) {
1893 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1894 
1895 		/* If this is a non-blocking socket, don't sleep */
1896 		error = -EAGAIN;
1897 		if (!timeo)
1898 			goto out_err;
1899 
1900 		error = wait_for_connect(sk, timeo);
1901 		if (error)
1902 			goto out_err;
1903 	}
1904 
1905 	newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1906 	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1907 out:
1908 	release_sock(sk);
1909 	return newsk;
1910 out_err:
1911 	newsk = NULL;
1912 	*err = error;
1913 	goto out;
1914 }
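/*
 * Illustrative sketch (standalone userspace code, not part of this file):
 * with O_NONBLOCK set on the listening socket, the -EAGAIN path in
 * tcp_accept() above surfaces to the application as accept() failing with
 * EAGAIN/EWOULDBLOCK when no established connection is queued.  The helper
 * name is hypothetical.
 */
#include <errno.h>
#include <sys/socket.h>

static int example_try_accept(int listen_fd)
{
	int fd = accept(listen_fd, NULL, NULL);

	if (fd < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
		return -1;	/* queue empty: poll/select and retry */
	return fd;		/* connected socket, or -1 on a real error */
}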
1915 
1916 /*
1917  *	Socket option code for TCP.
1918  */
1919 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1920 		   int optlen)
1921 {
1922 	struct tcp_sock *tp = tcp_sk(sk);
1923 	int val;
1924 	int err = 0;
1925 
1926 	if (level != SOL_TCP)
1927 		return tp->af_specific->setsockopt(sk, level, optname,
1928 						   optval, optlen);
1929 
1930 	if (optlen < sizeof(int))
1931 		return -EINVAL;
1932 
1933 	if (get_user(val, (int __user *)optval))
1934 		return -EFAULT;
1935 
1936 	lock_sock(sk);
1937 
1938 	switch (optname) {
1939 	case TCP_MAXSEG:
1940 		/* Values greater than the interface MTU won't take effect.
1941 		 * However, at the point when this call is made we typically
1942 		 * don't yet know which interface is going to be used. */
1943 		if (val < 8 || val > MAX_TCP_WINDOW) {
1944 			err = -EINVAL;
1945 			break;
1946 		}
1947 		tp->rx_opt.user_mss = val;
1948 		break;
1949 
1950 	case TCP_NODELAY:
1951 		if (val) {
1952 			/* TCP_NODELAY is weaker than TCP_CORK, so this
1953 			 * option set on a corked socket is remembered, but
1954 			 * it is not activated until the cork is cleared.
1955 			 *
1956 			 * However, when TCP_NODELAY is set we make
1957 			 * an explicit push, which overrides even TCP_CORK
1958 			 * for currently queued segments.
1959 			 */
1960 			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1961 			tcp_push_pending_frames(sk, tp);
1962 		} else {
1963 			tp->nonagle &= ~TCP_NAGLE_OFF;
1964 		}
1965 		break;
1966 
1967 	case TCP_CORK:
1968 		/* When set, this indicates that non-full frames should always
1969 		 * be queued.  Later the user clears this option and we
1970 		 * transmit any pending partial frames in the queue.  This is
1971 		 * meant to be used alongside sendfile() to get properly
1972 		 * filled frames when the user (for example) must write out
1973 		 * headers with a write() call first and then use sendfile()
1974 		 * for the data parts (a userspace sketch follows this function).
1975 		 *
1976 		 * TCP_CORK can be set together with TCP_NODELAY and it is
1977 		 * stronger than TCP_NODELAY.
1978 		 */
1979 		if (val) {
1980 			tp->nonagle |= TCP_NAGLE_CORK;
1981 		} else {
1982 			tp->nonagle &= ~TCP_NAGLE_CORK;
1983 			if (tp->nonagle&TCP_NAGLE_OFF)
1984 				tp->nonagle |= TCP_NAGLE_PUSH;
1985 			tcp_push_pending_frames(sk, tp);
1986 		}
1987 		break;
1988 
1989 	case TCP_KEEPIDLE:
1990 		if (val < 1 || val > MAX_TCP_KEEPIDLE)
1991 			err = -EINVAL;
1992 		else {
1993 			tp->keepalive_time = val * HZ;
1994 			if (sock_flag(sk, SOCK_KEEPOPEN) &&
1995 			    !((1 << sk->sk_state) &
1996 			      (TCPF_CLOSE | TCPF_LISTEN))) {
1997 				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1998 				if (tp->keepalive_time > elapsed)
1999 					elapsed = tp->keepalive_time - elapsed;
2000 				else
2001 					elapsed = 0;
2002 				tcp_reset_keepalive_timer(sk, elapsed);
2003 			}
2004 		}
2005 		break;
2006 	case TCP_KEEPINTVL:
2007 		if (val < 1 || val > MAX_TCP_KEEPINTVL)
2008 			err = -EINVAL;
2009 		else
2010 			tp->keepalive_intvl = val * HZ;
2011 		break;
2012 	case TCP_KEEPCNT:
2013 		if (val < 1 || val > MAX_TCP_KEEPCNT)
2014 			err = -EINVAL;
2015 		else
2016 			tp->keepalive_probes = val;
2017 		break;
2018 	case TCP_SYNCNT:
2019 		if (val < 1 || val > MAX_TCP_SYNCNT)
2020 			err = -EINVAL;
2021 		else
2022 			tp->syn_retries = val;
2023 		break;
2024 
2025 	case TCP_LINGER2:
2026 		if (val < 0)
2027 			tp->linger2 = -1;
2028 		else if (val > sysctl_tcp_fin_timeout / HZ)
2029 			tp->linger2 = 0;
2030 		else
2031 			tp->linger2 = val * HZ;
2032 		break;
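	/*
	 * Illustrative note (not from the original source): with the common
	 * tcp_fin_timeout default of 60 seconds, TCP_LINGER2 = 30 stores
	 * 30 * HZ jiffies, a larger value is clamped to 0 (which effectively
	 * falls back to the sysctl default), and a negative value selects
	 * the active-reset path seen in tcp_close() above instead of
	 * lingering in FIN_WAIT2.
	 */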
2033 
2034 	case TCP_DEFER_ACCEPT:
2035 		tp->defer_accept = 0;
2036 		if (val > 0) {
2037 			/* Translate value in seconds to number of
2038 			 * retransmits */
2039 			while (tp->defer_accept < 32 &&
2040 			       val > ((TCP_TIMEOUT_INIT / HZ) <<
2041 				       tp->defer_accept))
2042 				tp->defer_accept++;
2043 			tp->defer_accept++;
2044 		}
2045 		break;
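	/*
	 * Illustrative note (not from the original source): a worked example
	 * of the translation above, assuming the usual TCP_TIMEOUT_INIT of
	 * 3 seconds.  For val = 10:
	 *
	 *	10 > (3 << 0) = 3   -> defer_accept = 1
	 *	10 > (3 << 1) = 6   -> defer_accept = 2
	 *	10 > (3 << 2) = 12  is false; loop stops, final ++ gives 3
	 *
	 * The TCP_DEFER_ACCEPT case in tcp_getsockopt() below reports this
	 * back as (3 << (3 - 1)) = 12 seconds, i.e. rounded up to whole
	 * SYN-ACK retransmission intervals.
	 */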
2046 
2047 	case TCP_WINDOW_CLAMP:
2048 		if (!val) {
2049 			if (sk->sk_state != TCP_CLOSE) {
2050 				err = -EINVAL;
2051 				break;
2052 			}
2053 			tp->window_clamp = 0;
2054 		} else
2055 			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2056 						SOCK_MIN_RCVBUF / 2 : val;
2057 		break;
2058 
2059 	case TCP_QUICKACK:
2060 		if (!val) {
2061 			tp->ack.pingpong = 1;
2062 		} else {
2063 			tp->ack.pingpong = 0;
2064 			if ((1 << sk->sk_state) &
2065 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2066 			    tcp_ack_scheduled(tp)) {
2067 				tp->ack.pending |= TCP_ACK_PUSHED;
2068 				cleanup_rbuf(sk, 1);
2069 				if (!(val & 1))
2070 					tp->ack.pingpong = 1;
2071 			}
2072 		}
2073 		break;
2074 
2075 	default:
2076 		err = -ENOPROTOOPT;
2077 		break;
2078 	}
2079 	release_sock(sk);
2080 	return err;
2081 }
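/*
 * Illustrative sketch (standalone userspace code, not part of this file):
 * the write()-then-sendfile() pattern described in the TCP_CORK comment in
 * tcp_setsockopt() above.  Identifiers are hypothetical and error handling
 * is omitted for brevity.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <unistd.h>

static void example_corked_response(int sock, const char *hdr, size_t hdrlen,
				    int filefd, size_t filelen)
{
	int on = 1, off = 0;

	setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	write(sock, hdr, hdrlen);		/* queued, not pushed yet */
	sendfile(sock, filefd, NULL, filelen);	/* still corked */
	setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
						/* uncorking flushes the tail */
}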
2082 
2083 /* Return information about state of tcp endpoint in API format. */
2084 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2085 {
2086 	struct tcp_sock *tp = tcp_sk(sk);
2087 	u32 now = tcp_time_stamp;
2088 
2089 	memset(info, 0, sizeof(*info));
2090 
2091 	info->tcpi_state = sk->sk_state;
2092 	info->tcpi_ca_state = tp->ca_state;
2093 	info->tcpi_retransmits = tp->retransmits;
2094 	info->tcpi_probes = tp->probes_out;
2095 	info->tcpi_backoff = tp->backoff;
2096 
2097 	if (tp->rx_opt.tstamp_ok)
2098 		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2099 	if (tp->rx_opt.sack_ok)
2100 		info->tcpi_options |= TCPI_OPT_SACK;
2101 	if (tp->rx_opt.wscale_ok) {
2102 		info->tcpi_options |= TCPI_OPT_WSCALE;
2103 		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2104 		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2105 	}
2106 
2107 	if (tp->ecn_flags&TCP_ECN_OK)
2108 		info->tcpi_options |= TCPI_OPT_ECN;
2109 
2110 	info->tcpi_rto = jiffies_to_usecs(tp->rto);
2111 	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2112 	info->tcpi_snd_mss = tp->mss_cache_std;
2113 	info->tcpi_rcv_mss = tp->ack.rcv_mss;
2114 
2115 	info->tcpi_unacked = tp->packets_out;
2116 	info->tcpi_sacked = tp->sacked_out;
2117 	info->tcpi_lost = tp->lost_out;
2118 	info->tcpi_retrans = tp->retrans_out;
2119 	info->tcpi_fackets = tp->fackets_out;
2120 
2121 	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2122 	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2123 	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2124 
2125 	info->tcpi_pmtu = tp->pmtu_cookie;
2126 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2127 	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2128 	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2129 	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2130 	info->tcpi_snd_cwnd = tp->snd_cwnd;
2131 	info->tcpi_advmss = tp->advmss;
2132 	info->tcpi_reordering = tp->reordering;
2133 
2134 	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2135 	info->tcpi_rcv_space = tp->rcvq_space.space;
2136 
2137 	info->tcpi_total_retrans = tp->total_retrans;
2138 }
2139 
2140 EXPORT_SYMBOL_GPL(tcp_get_info);
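/*
 * Illustrative sketch (standalone userspace code, not part of this file):
 * tcp_get_info() is exposed to applications through getsockopt(TCP_INFO),
 * handled in tcp_getsockopt() below.  The function name is hypothetical;
 * tcpi_rtt and tcpi_rttvar are in microseconds per the conversions above.
 */
#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void example_dump_tcp_info(int fd)
{
	struct tcp_info ti;
	socklen_t len = sizeof(ti);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
		printf("rtt=%uus rttvar=%uus cwnd=%u retrans=%u\n",
		       ti.tcpi_rtt, ti.tcpi_rttvar,
		       ti.tcpi_snd_cwnd, ti.tcpi_total_retrans);
}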
2141 
2142 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2143 		   int __user *optlen)
2144 {
2145 	struct tcp_sock *tp = tcp_sk(sk);
2146 	int val, len;
2147 
2148 	if (level != SOL_TCP)
2149 		return tp->af_specific->getsockopt(sk, level, optname,
2150 						   optval, optlen);
2151 
2152 	if (get_user(len, optlen))
2153 		return -EFAULT;
2154 
2155 	len = min_t(unsigned int, len, sizeof(int));
2156 
2157 	if (len < 0)
2158 		return -EINVAL;
2159 
2160 	switch (optname) {
2161 	case TCP_MAXSEG:
2162 		val = tp->mss_cache_std;
2163 		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2164 			val = tp->rx_opt.user_mss;
2165 		break;
2166 	case TCP_NODELAY:
2167 		val = !!(tp->nonagle&TCP_NAGLE_OFF);
2168 		break;
2169 	case TCP_CORK:
2170 		val = !!(tp->nonagle&TCP_NAGLE_CORK);
2171 		break;
2172 	case TCP_KEEPIDLE:
2173 		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2174 		break;
2175 	case TCP_KEEPINTVL:
2176 		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2177 		break;
2178 	case TCP_KEEPCNT:
2179 		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2180 		break;
2181 	case TCP_SYNCNT:
2182 		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2183 		break;
2184 	case TCP_LINGER2:
2185 		val = tp->linger2;
2186 		if (val >= 0)
2187 			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2188 		break;
2189 	case TCP_DEFER_ACCEPT:
2190 		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2191 					       (tp->defer_accept - 1));
2192 		break;
2193 	case TCP_WINDOW_CLAMP:
2194 		val = tp->window_clamp;
2195 		break;
2196 	case TCP_INFO: {
2197 		struct tcp_info info;
2198 
2199 		if (get_user(len, optlen))
2200 			return -EFAULT;
2201 
2202 		tcp_get_info(sk, &info);
2203 
2204 		len = min_t(unsigned int, len, sizeof(info));
2205 		if (put_user(len, optlen))
2206 			return -EFAULT;
2207 		if (copy_to_user(optval, &info, len))
2208 			return -EFAULT;
2209 		return 0;
2210 	}
2211 	case TCP_QUICKACK:
2212 		val = !tp->ack.pingpong;
2213 		break;
2214 	default:
2215 		return -ENOPROTOOPT;
2216 	}
2217 
2218 	if (put_user(len, optlen))
2219 		return -EFAULT;
2220 	if (copy_to_user(optval, &val, len))
2221 		return -EFAULT;
2222 	return 0;
2223 }
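/*
 * Illustrative sketch (standalone userspace code, not part of this file):
 * tuning the keepalive parameters handled by the TCP_KEEPIDLE,
 * TCP_KEEPINTVL and TCP_KEEPCNT cases above.  SO_KEEPALIVE must also be
 * enabled for the probes to run; the values and helper name are
 * hypothetical.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void example_enable_keepalive(int fd)
{
	int on = 1, idle = 60, intvl = 10, cnt = 5;

	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
}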
2224 
2225 
2226 extern void __skb_cb_too_small_for_tcp(int, int);
2227 extern void tcpdiag_init(void);
2228 
2229 static __initdata unsigned long thash_entries;
2230 static int __init set_thash_entries(char *str)
2231 {
2232 	if (!str)
2233 		return 0;
2234 	thash_entries = simple_strtoul(str, &str, 0);
2235 	return 1;
2236 }
2237 __setup("thash_entries=", set_thash_entries);
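/*
 * Illustrative note (not from the original source): the __setup() handler
 * above registers a boot parameter, so the entry count requested from
 * alloc_large_system_hash() in tcp_init() can be overridden on the kernel
 * command line, e.g.:
 *
 *	thash_entries=131072
 */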
2238 
2239 void __init tcp_init(void)
2240 {
2241 	struct sk_buff *skb = NULL;
2242 	int order, i;
2243 
2244 	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2245 		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2246 					   sizeof(skb->cb));
2247 
2248 	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2249 					      sizeof(struct tcp_bind_bucket),
2250 					      0, SLAB_HWCACHE_ALIGN,
2251 					      NULL, NULL);
2252 	if (!tcp_bucket_cachep)
2253 		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2254 
2255 	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2256 						sizeof(struct tcp_tw_bucket),
2257 						0, SLAB_HWCACHE_ALIGN,
2258 						NULL, NULL);
2259 	if (!tcp_timewait_cachep)
2260 		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2261 
2262 	/* Size and allocate the main established and bind bucket
2263 	 * hash tables.
2264 	 *
2265 	 * The methodology is similar to that of the buffer cache.
2266 	 */
2267 	tcp_ehash = (struct tcp_ehash_bucket *)
2268 		alloc_large_system_hash("TCP established",
2269 					sizeof(struct tcp_ehash_bucket),
2270 					thash_entries,
2271 					(num_physpages >= 128 * 1024) ?
2272 						(25 - PAGE_SHIFT) :
2273 						(27 - PAGE_SHIFT),
2274 					HASH_HIGHMEM,
2275 					&tcp_ehash_size,
2276 					NULL,
2277 					0);
2278 	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2279 	for (i = 0; i < (tcp_ehash_size << 1); i++) {
2280 		rwlock_init(&tcp_ehash[i].lock);
2281 		INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2282 	}
2283 
2284 	tcp_bhash = (struct tcp_bind_hashbucket *)
2285 		alloc_large_system_hash("TCP bind",
2286 					sizeof(struct tcp_bind_hashbucket),
2287 					tcp_ehash_size,
2288 					(num_physpages >= 128 * 1024) ?
2289 						(25 - PAGE_SHIFT) :
2290 						(27 - PAGE_SHIFT),
2291 					HASH_HIGHMEM,
2292 					&tcp_bhash_size,
2293 					NULL,
2294 					64 * 1024);
2295 	tcp_bhash_size = 1 << tcp_bhash_size;
2296 	for (i = 0; i < tcp_bhash_size; i++) {
2297 		spin_lock_init(&tcp_bhash[i].lock);
2298 		INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2299 	}
2300 
2301 	/* Try to be a bit smarter and adjust defaults depending
2302 	 * on available memory.
2303 	 */
2304 	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2305 			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2306 			order++)
2307 		;
2308 	if (order >= 4) {
2309 		sysctl_local_port_range[0] = 32768;
2310 		sysctl_local_port_range[1] = 61000;
2311 		sysctl_tcp_max_tw_buckets = 180000;
2312 		sysctl_tcp_max_orphans = 4096 << (order - 4);
2313 		sysctl_max_syn_backlog = 1024;
2314 	} else if (order < 3) {
2315 		sysctl_local_port_range[0] = 1024 * (3 - order);
2316 		sysctl_tcp_max_tw_buckets >>= (3 - order);
2317 		sysctl_tcp_max_orphans >>= (3 - order);
2318 		sysctl_max_syn_backlog = 128;
2319 	}
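	/*
	 * Illustrative note (not from the original source): a worked example
	 * of the sizing above, assuming 4 KiB pages and a bind hash of
	 * 65536 buckets of 16 bytes each (1 MiB).  The loop stops at the
	 * first order where (1 << order) * 4096 >= 1 MiB, i.e. order = 8,
	 * so the "large machine" branch applies: ports 32768-61000 and
	 * sysctl_tcp_max_orphans = 4096 << (8 - 4) = 65536.
	 */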
2320 	tcp_port_rover = sysctl_local_port_range[0] - 1;
2321 
2322 	sysctl_tcp_mem[0] =  768 << order;
2323 	sysctl_tcp_mem[1] = 1024 << order;
2324 	sysctl_tcp_mem[2] = 1536 << order;
2325 
2326 	if (order < 3) {
2327 		sysctl_tcp_wmem[2] = 64 * 1024;
2328 		sysctl_tcp_rmem[0] = PAGE_SIZE;
2329 		sysctl_tcp_rmem[1] = 43689;
2330 		sysctl_tcp_rmem[2] = 2 * 43689;
2331 	}
2332 
2333 	printk(KERN_INFO "TCP: Hash tables configured "
2334 	       "(established %d bind %d)\n",
2335 	       tcp_ehash_size << 1, tcp_bhash_size);
2336 }
2337 
2338 EXPORT_SYMBOL(tcp_accept);
2339 EXPORT_SYMBOL(tcp_close);
2340 EXPORT_SYMBOL(tcp_destroy_sock);
2341 EXPORT_SYMBOL(tcp_disconnect);
2342 EXPORT_SYMBOL(tcp_getsockopt);
2343 EXPORT_SYMBOL(tcp_ioctl);
2344 EXPORT_SYMBOL(tcp_poll);
2345 EXPORT_SYMBOL(tcp_read_sock);
2346 EXPORT_SYMBOL(tcp_recvmsg);
2347 EXPORT_SYMBOL(tcp_sendmsg);
2348 EXPORT_SYMBOL(tcp_sendpage);
2349 EXPORT_SYMBOL(tcp_setsockopt);
2350 EXPORT_SYMBOL(tcp_shutdown);
2351 EXPORT_SYMBOL(tcp_statistics);
2352 EXPORT_SYMBOL(tcp_timewait_cachep);
2353