xref: /linux/net/ipv4/tcp.c (revision 20d0021394c1b070bf04b22c5bc8fdb437edd4c5)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *		Florian La Roche, <flla@stud.uni-sb.de>
15  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *		Matthew Dillon, <dillon@apollo.west.oic.com>
19  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *		Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *		Alan Cox	:	Numerous verify_area() calls
24  *		Alan Cox	:	Set the ACK bit on a reset
25  *		Alan Cox	:	Stopped it crashing if it closed while
26  *					sk->inuse=1 and was trying to connect
27  *					(tcp_err()).
28  *		Alan Cox	:	All icmp error handling was broken
29  *					pointers passed where wrong and the
30  *					socket was looked up backwards. Nobody
31  *					tested any icmp error code obviously.
32  *		Alan Cox	:	tcp_err() now handled properly. It
33  *					wakes people on errors. poll
34  *					behaves and the icmp error race
35  *					has gone by moving it into sock.c
36  *		Alan Cox	:	tcp_send_reset() fixed to work for
37  *					everything not just packets for
38  *					unknown sockets.
39  *		Alan Cox	:	tcp option processing.
40  *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41  *					syn rule wrong]
42  *		Herp Rosmanith  :	More reset fixes
43  *		Alan Cox	:	No longer acks invalid rst frames.
44  *					Acking any kind of RST is right out.
45  *		Alan Cox	:	Sets an ignore me flag on an rst
46  *					receive otherwise odd bits of prattle
47  *					escape still
48  *		Alan Cox	:	Fixed another acking RST frame bug.
49  *					Should stop LAN workplace lockups.
50  *		Alan Cox	: 	Some tidyups using the new skb list
51  *					facilities
52  *		Alan Cox	:	sk->keepopen now seems to work
53  *		Alan Cox	:	Pulls options out correctly on accepts
54  *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55  *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56  *					bit to skb ops.
57  *		Alan Cox	:	Tidied tcp_data to avoid a potential
58  *					nasty.
59  *		Alan Cox	:	Added some better commenting, as the
60  *					tcp is hard to follow
61  *		Alan Cox	:	Removed incorrect check for 20 * psh
62  *	Michael O'Reilly	:	ack < copied bug fix.
63  *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64  *		Alan Cox	:	FIN with no memory -> CRASH
65  *		Alan Cox	:	Added socket option proto entries.
66  *					Also added awareness of them to accept.
67  *		Alan Cox	:	Added TCP options (SOL_TCP)
68  *		Alan Cox	:	Switched wakeup calls to callbacks,
69  *					so the kernel can layer network
70  *					sockets.
71  *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72  *		Alan Cox	:	Handle FIN (more) properly (we hope).
73  *		Alan Cox	:	RST frames sent on unsynchronised
74  *					state ack error.
75  *		Alan Cox	:	Put in missing check for SYN bit.
76  *		Alan Cox	:	Added tcp_select_window() aka NET2E
77  *					window non shrink trick.
78  *		Alan Cox	:	Added a couple of small NET2E timer
79  *					fixes
80  *		Charles Hedrick :	TCP fixes
81  *		Toomas Tamm	:	TCP window fixes
82  *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83  *		Charles Hedrick	:	Rewrote most of it to actually work
84  *		Linus		:	Rewrote tcp_read() and URG handling
85  *					completely
86  *		Gerhard Koerting:	Fixed some missing timer handling
87  *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88  *		Gerhard Koerting:	PC/TCP workarounds
89  *		Adam Caldwell	:	Assorted timer/timing errors
90  *		Matthew Dillon	:	Fixed another RST bug
91  *		Alan Cox	:	Move to kernel side addressing changes.
92  *		Alan Cox	:	Beginning work on TCP fastpathing
93  *					(not yet usable)
94  *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95  *		Alan Cox	:	TCP fast path debugging
96  *		Alan Cox	:	Window clamping
97  *		Michael Riepe	:	Bug in tcp_check()
98  *		Matt Dillon	:	More TCP improvements and RST bug fixes
99  *		Matt Dillon	:	Yet more small nasties removed from the
100  *					TCP code (Be very nice to this man if
101  *					tcp finally works 100%) 8)
102  *		Alan Cox	:	BSD accept semantics.
103  *		Alan Cox	:	Reset on closedown bug.
104  *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105  *		Michael Pall	:	Handle poll() after URG properly in
106  *					all cases.
107  *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108  *					(multi URG PUSH broke rlogin).
109  *		Michael Pall	:	Fix the multi URG PUSH problem in
110  *					tcp_readable(), poll() after URG
111  *					works now.
112  *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113  *					BSD api.
114  *		Alan Cox	:	Changed the semantics of sk->socket to
115  *					fix a race and a signal problem with
116  *					accept() and async I/O.
117  *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118  *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119  *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120  *					clients/servers which listen in on
121  *					fixed ports.
122  *		Alan Cox	:	Cleaned the above up and shrank it to
123  *					a sensible code size.
124  *		Alan Cox	:	Self connect lockup fix.
125  *		Alan Cox	:	No connect to multicast.
126  *		Ross Biro	:	Close unaccepted children on master
127  *					socket close.
128  *		Alan Cox	:	Reset tracing code.
129  *		Alan Cox	:	Spurious resets on shutdown.
130  *		Alan Cox	:	Giant 15 minute/60 second timer error
131  *		Alan Cox	:	Small whoops in polling before an
132  *					accept.
133  *		Alan Cox	:	Kept the state trace facility since
134  *					it's handy for debugging.
135  *		Alan Cox	:	More reset handler fixes.
136  *		Alan Cox	:	Started rewriting the code based on
137  *					the RFC's for other useful protocol
138  *					references see: Comer, KA9Q NOS, and
139  *					for a reference on the difference
140  *					between specifications and how BSD
141  *					works see the 4.4lite source.
142  *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143  *					close.
144  *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145  *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146  *		Alan Cox	:	Reimplemented timers as per the RFC
147  *					and using multiple timers for sanity.
148  *		Alan Cox	:	Small bug fixes, and a lot of new
149  *					comments.
150  *		Alan Cox	:	Fixed dual reader crash by locking
151  *					the buffers (much like datagram.c)
152  *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153  *					now gets fed up with retrying without
154  *					(even a no-space) answer.
155  *		Alan Cox	:	Extracted closing code better
156  *		Alan Cox	:	Fixed the closing state machine to
157  *					resemble the RFC.
158  *		Alan Cox	:	More 'per spec' fixes.
159  *		Jorge Cwik	:	Even faster checksumming.
160  *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161  *					only frames. At least one pc tcp stack
162  *					generates them.
163  *		Alan Cox	:	Cache last socket.
164  *		Alan Cox	:	Per route irtt.
165  *		Matt Day	:	poll()->select() match BSD precisely on error
166  *		Alan Cox	:	New buffers
167  *		Marc Tamsky	:	Various sk->prot->retransmits and
168  *					sk->retransmits misupdating fixed.
169  *					Fixed tcp_write_timeout: stuck close,
170  *					and TCP syn retries gets used now.
171  *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172  *					ack if state is TCP_CLOSED.
173  *		Alan Cox	:	Look up device on a retransmit - routes may
174  *					change. Doesn't yet cope with MSS shrink right
175  *					but it's a start!
176  *		Marc Tamsky	:	Closing in closing fixes.
177  *		Mike Shaver	:	RFC1122 verifications.
178  *		Alan Cox	:	rcv_saddr errors.
179  *		Alan Cox	:	Block double connect().
180  *		Alan Cox	:	Small hooks for enSKIP.
181  *		Alexey Kuznetsov:	Path MTU discovery.
182  *		Alan Cox	:	Support soft errors.
183  *		Alan Cox	:	Fix MTU discovery pathological case
184  *					when the remote claims no mtu!
185  *		Marc Tamsky	:	TCP_CLOSE fix.
186  *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187  *					window but wrong (fixes NT lpd problems)
188  *		Pedro Roque	:	Better TCP window handling, delayed ack.
189  *		Joerg Reuter	:	No modification of locked buffers in
190  *					tcp_do_retransmit()
191  *		Eric Schenk	:	Changed receiver side silly window
192  *					avoidance algorithm to BSD style
193  *					algorithm. This doubles throughput
194  *					against machines running Solaris,
195  *					and seems to result in general
196  *					improvement.
197  *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198  *	Willy Konynenberg	:	Transparent proxying support.
199  *	Mike McLagan		:	Routing by source
200  *		Keith Owens	:	Do proper merging with partial SKB's in
201  *					tcp_do_sendmsg to avoid burstiness.
202  *		Eric Schenk	:	Fix fast close down bug with
203  *					shutdown() followed by close().
204  *		Andi Kleen 	:	Make poll agree with SIGIO
205  *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206  *					lingertime == 0 (RFC 793 ABORT Call)
207  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208  *					csum_and_copy_from_user() if possible.
209  *
210  *		This program is free software; you can redistribute it and/or
211  *		modify it under the terms of the GNU General Public License
212  *		as published by the Free Software Foundation; either version
213  *		2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *	TCP_SYN_SENT		sent a connection request, waiting for ack
218  *
219  *	TCP_SYN_RECV		received a connection request, sent ack,
220  *				waiting for final ack in three-way handshake.
221  *
222  *	TCP_ESTABLISHED		connection established
223  *
224  *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225  *				transmission of remaining buffered data
226  *
227  *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228  *				to shutdown
229  *
230  *	TCP_CLOSING		both sides have shutdown but we still have
231  *				data we have to finish sending
232  *
233  *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234  *				closed, can only be entered from FIN_WAIT2
235  *				or CLOSING.  Required because the other end
236  *				may not have gotten our last ACK causing it
237  *				to retransmit the data packet (which we ignore)
238  *
239  *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240  *				us to finish writing our data and to shutdown
241  *				(we have to close() to move on to LAST_ACK)
242  *
243  *	TCP_LAST_ACK		our side has shutdown after the remote has
244  *				shutdown.  There may still be data in our
245  *				buffer that we have to finish sending
246  *
247  *	TCP_CLOSE		socket is finished
248  */
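
/*
 * For orientation, the two common teardown sequences through the states
 * above are (per RFC 793):
 *
 *	active close:	ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE
 *	passive close:	ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE
 *
 * A simultaneous close goes FIN_WAIT1 -> CLOSING -> TIME_WAIT instead of
 * passing through FIN_WAIT2.
 */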
249 
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260 
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265 
266 
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269 
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271 
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273 
274 kmem_cache_t *tcp_bucket_cachep;
275 kmem_cache_t *tcp_timewait_cachep;
276 
277 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278 
279 int sysctl_tcp_mem[3];
280 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282 
283 EXPORT_SYMBOL(sysctl_tcp_mem);
284 EXPORT_SYMBOL(sysctl_tcp_rmem);
285 EXPORT_SYMBOL(sysctl_tcp_wmem);
286 
287 atomic_t tcp_memory_allocated;	/* Current allocated memory. */
288 atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
289 
290 EXPORT_SYMBOL(tcp_memory_allocated);
291 EXPORT_SYMBOL(tcp_sockets_allocated);
292 
293 /*
294  * Pressure flag: try to collapse.
295  * Technical note: it is used by multiple contexts non-atomically.
296  * All of sk_stream_mem_schedule() is of this nature: accounting
297  * is strict, actions are advisory and have some latency.
298  */
299 int tcp_memory_pressure;
300 
301 EXPORT_SYMBOL(tcp_memory_pressure);
302 
303 void tcp_enter_memory_pressure(void)
304 {
305 	if (!tcp_memory_pressure) {
306 		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307 		tcp_memory_pressure = 1;
308 	}
309 }
310 
311 EXPORT_SYMBOL(tcp_enter_memory_pressure);
312 
313 /*
314  * LISTEN is a special case for poll..
315  */
316 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317 					       poll_table *wait)
318 {
319 	return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320 }
321 
322 /*
323  *	Wait for a TCP event.
324  *
325  *	Note that we don't need to lock the socket, as the upper poll layers
326  *	take care of normal races (between the test and the event) and we don't
327  *	go look at any of the socket buffers directly.
328  */
329 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
330 {
331 	unsigned int mask;
332 	struct sock *sk = sock->sk;
333 	struct tcp_sock *tp = tcp_sk(sk);
334 
335 	poll_wait(file, sk->sk_sleep, wait);
336 	if (sk->sk_state == TCP_LISTEN)
337 		return tcp_listen_poll(sk, wait);
338 
339 	/* Socket is not locked. We are protected from async events
340 	   by the poll logic, and correct handling of state changes
341 	   made by other threads is impossible in any case.
342 	 */
343 
344 	mask = 0;
345 	if (sk->sk_err)
346 		mask = POLLERR;
347 
348 	/*
349 	 * POLLHUP is certainly not done right. But poll() doesn't
350 	 * have a notion of HUP in just one direction, and for a
351 	 * socket the read side is more interesting.
352 	 *
353 	 * Some poll() documentation says that POLLHUP is incompatible
354 	 * with the POLLOUT/POLLWRNORM flags, so somebody should check
355 	 * this all. But careful: it tends to be safer to return too many
356 	 * bits than too few, and you can easily break real applications
357 	 * if you don't tell them that something has hung up!
358 	 *
359 	 * Check-me.
360 	 *
361 	 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
362 	 * our fs/select.c). It means that after we received EOF,
363 	 * poll always returns immediately, making it impossible to poll()
364 	 * for write() in state CLOSE_WAIT. One solution is evident --- set
365 	 * POLLHUP if and only if shutdown has been made in both directions.
366 	 * Actually, it is interesting to look at how Solaris and DUX
367 	 * solve this dilemma. I would prefer, if POLLHUP were maskable,
368 	 * that we set it on SND_SHUTDOWN. BTW the examples given
369 	 * in Stevens' books assume exactly this behaviour; it explains
370 	 * why POLLHUP is incompatible with POLLOUT.	--ANK
371 	 *
372 	 * NOTE. A check for TCP_CLOSE has been added. The goal is to prevent
373 	 * blocking on a fresh, not-connected or disconnected socket. --ANK
374 	 */
375 	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
376 		mask |= POLLHUP;
377 	if (sk->sk_shutdown & RCV_SHUTDOWN)
378 		mask |= POLLIN | POLLRDNORM;
379 
380 	/* Connected? */
381 	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
382 		/* Potential race condition. If the read of tp below is
383 		 * reordered before the sk->sk_state check above, we can be
384 		 * spuriously woken up in SYN_* states. */
385 		if ((tp->rcv_nxt != tp->copied_seq) &&
386 		    (tp->urg_seq != tp->copied_seq ||
387 		     tp->rcv_nxt != tp->copied_seq + 1 ||
388 		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
389 			mask |= POLLIN | POLLRDNORM;
390 
391 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
392 			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
393 				mask |= POLLOUT | POLLWRNORM;
394 			} else {  /* send SIGIO later */
395 				set_bit(SOCK_ASYNC_NOSPACE,
396 					&sk->sk_socket->flags);
397 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
398 
399 				/* Race breaker. If space is freed after
400 				 * wspace test but before the flags are set,
401 				 * IO signal will be lost.
402 				 */
403 				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
404 					mask |= POLLOUT | POLLWRNORM;
405 			}
406 		}
407 
408 		if (tp->urg_data & TCP_URG_VALID)
409 			mask |= POLLPRI;
410 	}
411 	return mask;
412 }
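
/*
 * Illustrative userspace sketch (not part of this file) of how the bits
 * set by tcp_poll() are typically consumed. POLLIN/POLLRDNORM mean a read
 * will not block (data, FIN or shutdown), POLLOUT/POLLWRNORM mean there is
 * send buffer space, POLLPRI signals urgent data, and POLLHUP/POLLERR
 * report teardown and errors. handle_*() are hypothetical application
 * callbacks; sock_fd is assumed to be a connected TCP socket.
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd = { .fd = sock_fd, .events = POLLIN | POLLOUT | POLLPRI };
 *
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLPRI)
 *			handle_urgent(sock_fd);		// e.g. recv(..., MSG_OOB)
 *		if (pfd.revents & (POLLIN | POLLRDNORM))
 *			handle_readable(sock_fd);
 *		if (pfd.revents & (POLLOUT | POLLWRNORM))
 *			handle_writable(sock_fd);
 *		if (pfd.revents & (POLLHUP | POLLERR))
 *			handle_teardown(sock_fd);
 *	}
 */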
413 
414 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
415 {
416 	struct tcp_sock *tp = tcp_sk(sk);
417 	int answ;
418 
419 	switch (cmd) {
420 	case SIOCINQ:
421 		if (sk->sk_state == TCP_LISTEN)
422 			return -EINVAL;
423 
424 		lock_sock(sk);
425 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
426 			answ = 0;
427 		else if (sock_flag(sk, SOCK_URGINLINE) ||
428 			 !tp->urg_data ||
429 			 before(tp->urg_seq, tp->copied_seq) ||
430 			 !before(tp->urg_seq, tp->rcv_nxt)) {
431 			answ = tp->rcv_nxt - tp->copied_seq;
432 
433 			/* Subtract 1, if FIN is in queue. */
434 			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
435 				answ -=
436 		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
437 		} else
438 			answ = tp->urg_seq - tp->copied_seq;
439 		release_sock(sk);
440 		break;
441 	case SIOCATMARK:
442 		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
443 		break;
444 	case SIOCOUTQ:
445 		if (sk->sk_state == TCP_LISTEN)
446 			return -EINVAL;
447 
448 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
449 			answ = 0;
450 		else
451 			answ = tp->write_seq - tp->snd_una;
452 		break;
453 	default:
454 		return -ENOIOCTLCMD;
455 	}
456 
457 	return put_user(answ, (int __user *)arg);
458 }
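
/*
 * Illustrative userspace sketch (not part of this file) of the three ioctls
 * handled above: SIOCINQ returns the number of unread bytes in the receive
 * queue, SIOCOUTQ the number of unacknowledged bytes still in the send
 * queue, and SIOCATMARK whether the next read is at the urgent mark.
 * sock_fd is assumed to be a connected TCP socket; each call returns -1
 * with errno set on failure (e.g. EINVAL for SIOCINQ on a listening socket).
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int inq, outq, at_mark;
 *
 *	ioctl(sock_fd, SIOCINQ, &inq);		// bytes ready for recv()
 *	ioctl(sock_fd, SIOCOUTQ, &outq);	// bytes not yet ACKed by the peer
 *	ioctl(sock_fd, SIOCATMARK, &at_mark);	// non-zero at the urgent mark
 */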
459 
460 
461 int tcp_listen_start(struct sock *sk)
462 {
463 	struct inet_sock *inet = inet_sk(sk);
464 	struct tcp_sock *tp = tcp_sk(sk);
465 	int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466 
467 	if (rc != 0)
468 		return rc;
469 
470 	sk->sk_max_ack_backlog = 0;
471 	sk->sk_ack_backlog = 0;
472 	tcp_delack_init(tp);
473 
474 	/* There is a race window here: we announce ourselves listening,
475 	 * but this transition has not yet been validated by get_port().
476 	 * It is OK, because this socket enters the hash table only
477 	 * after validation is complete.
478 	 */
479 	sk->sk_state = TCP_LISTEN;
480 	if (!sk->sk_prot->get_port(sk, inet->num)) {
481 		inet->sport = htons(inet->num);
482 
483 		sk_dst_reset(sk);
484 		sk->sk_prot->hash(sk);
485 
486 		return 0;
487 	}
488 
489 	sk->sk_state = TCP_CLOSE;
490 	reqsk_queue_destroy(&tp->accept_queue);
491 	return -EADDRINUSE;
492 }
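
/*
 * Note: tcp_listen_start() only sets up the queues: the SYN queue
 * (lopt->syn_table, hashed over TCP_SYNQ_HSIZE buckets) for connections
 * still in the three-way handshake, and the accept queue for connections
 * that have completed it. The input path moves requests from the former to
 * the latter, and tcp_accept() below dequeues them for userspace.
 */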
493 
494 /*
495  *	This routine closes sockets which have been at least partially
496  *	opened, but not yet accepted.
497  */
498 
499 static void tcp_listen_stop (struct sock *sk)
500 {
501 	struct tcp_sock *tp = tcp_sk(sk);
502 	struct listen_sock *lopt;
503 	struct request_sock *acc_req;
504 	struct request_sock *req;
505 	int i;
506 
507 	tcp_delete_keepalive_timer(sk);
508 
509 	/* make all the listen_opt local to us */
510 	lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
511 	acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
512 
513 	if (lopt->qlen) {
514 		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
515 			while ((req = lopt->syn_table[i]) != NULL) {
516 				lopt->syn_table[i] = req->dl_next;
517 				lopt->qlen--;
518 				reqsk_free(req);
519 
520 		/* Following the specs, it would be better either to send a FIN
521 		 * (and enter FIN-WAIT-1; that is a normal close)
522 		 * or to send an active reset (abort).
523 		 * Certainly, that is pretty dangerous during a synflood, but it
524 		 * is a bad justification for our negligence 8)
525 		 * To be honest, we are not able to implement either
526 		 * of those variants now.		--ANK
527 		 */
528 			}
529 		}
530 	}
531 	BUG_TRAP(!lopt->qlen);
532 
533 	kfree(lopt);
534 
535 	while ((req = acc_req) != NULL) {
536 		struct sock *child = req->sk;
537 
538 		acc_req = req->dl_next;
539 
540 		local_bh_disable();
541 		bh_lock_sock(child);
542 		BUG_TRAP(!sock_owned_by_user(child));
543 		sock_hold(child);
544 
545 		tcp_disconnect(child, O_NONBLOCK);
546 
547 		sock_orphan(child);
548 
549 		atomic_inc(&tcp_orphan_count);
550 
551 		tcp_destroy_sock(child);
552 
553 		bh_unlock_sock(child);
554 		local_bh_enable();
555 		sock_put(child);
556 
557 		sk_acceptq_removed(sk);
558 		__reqsk_free(req);
559 	}
560 	BUG_TRAP(!sk->sk_ack_backlog);
561 }
562 
563 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
564 {
565 	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
566 	tp->pushed_seq = tp->write_seq;
567 }
568 
569 static inline int forced_push(struct tcp_sock *tp)
570 {
571 	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
572 }
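
/*
 * forced_push() above returns true once more than half of the largest
 * window the peer has ever advertised has been queued since the last
 * PSH-marked byte. The send paths below then set PSH via tcp_mark_push()
 * and push the pending frames, so the receiver is not left sitting on a
 * large, unpushed buffer.
 */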
573 
574 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
575 			      struct sk_buff *skb)
576 {
577 	skb->csum = 0;
578 	TCP_SKB_CB(skb)->seq = tp->write_seq;
579 	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
580 	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
581 	TCP_SKB_CB(skb)->sacked = 0;
582 	skb_header_release(skb);
583 	__skb_queue_tail(&sk->sk_write_queue, skb);
584 	sk_charge_skb(sk, skb);
585 	if (!sk->sk_send_head)
586 		sk->sk_send_head = skb;
587 	else if (tp->nonagle&TCP_NAGLE_PUSH)
588 		tp->nonagle &= ~TCP_NAGLE_PUSH;
589 }
590 
591 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
592 				struct sk_buff *skb)
593 {
594 	if (flags & MSG_OOB) {
595 		tp->urg_mode = 1;
596 		tp->snd_up = tp->write_seq;
597 		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
598 	}
599 }
600 
601 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
602 			    int mss_now, int nonagle)
603 {
604 	if (sk->sk_send_head) {
605 		struct sk_buff *skb = sk->sk_write_queue.prev;
606 		if (!(flags & MSG_MORE) || forced_push(tp))
607 			tcp_mark_push(tp, skb);
608 		tcp_mark_urg(tp, flags, skb);
609 		__tcp_push_pending_frames(sk, tp, mss_now,
610 					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
611 	}
612 }
613 
614 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
615 			 size_t psize, int flags)
616 {
617 	struct tcp_sock *tp = tcp_sk(sk);
618 	int mss_now, size_goal;
619 	int err;
620 	ssize_t copied;
621 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
622 
623 	/* Wait for the connection to finish being established. */
624 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
625 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
626 			goto out_err;
627 
628 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629 
630 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631 	size_goal = tp->xmit_size_goal;
632 	copied = 0;
633 
634 	err = -EPIPE;
635 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
636 		goto do_error;
637 
638 	while (psize > 0) {
639 		struct sk_buff *skb = sk->sk_write_queue.prev;
640 		struct page *page = pages[poffset / PAGE_SIZE];
641 		int copy, i, can_coalesce;
642 		int offset = poffset % PAGE_SIZE;
643 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
644 
645 		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
646 new_segment:
647 			if (!sk_stream_memory_free(sk))
648 				goto wait_for_sndbuf;
649 
650 			skb = sk_stream_alloc_pskb(sk, 0, 0,
651 						   sk->sk_allocation);
652 			if (!skb)
653 				goto wait_for_memory;
654 
655 			skb_entail(sk, tp, skb);
656 			copy = size_goal;
657 		}
658 
659 		if (copy > size)
660 			copy = size;
661 
662 		i = skb_shinfo(skb)->nr_frags;
663 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
664 		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
665 			tcp_mark_push(tp, skb);
666 			goto new_segment;
667 		}
668 		if (sk->sk_forward_alloc < copy &&
669 		    !sk_stream_mem_schedule(sk, copy, 0))
670 			goto wait_for_memory;
671 
672 		if (can_coalesce) {
673 			skb_shinfo(skb)->frags[i - 1].size += copy;
674 		} else {
675 			get_page(page);
676 			skb_fill_page_desc(skb, i, page, offset, copy);
677 		}
678 
679 		skb->len += copy;
680 		skb->data_len += copy;
681 		skb->truesize += copy;
682 		sk->sk_wmem_queued += copy;
683 		sk->sk_forward_alloc -= copy;
684 		skb->ip_summed = CHECKSUM_HW;
685 		tp->write_seq += copy;
686 		TCP_SKB_CB(skb)->end_seq += copy;
687 		skb_shinfo(skb)->tso_segs = 0;
688 
689 		if (!copied)
690 			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
691 
692 		copied += copy;
693 		poffset += copy;
694 		if (!(psize -= copy))
695 			goto out;
696 
697 		if (skb->len < mss_now || (flags & MSG_OOB))
698 			continue;
699 
700 		if (forced_push(tp)) {
701 			tcp_mark_push(tp, skb);
702 			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
703 		} else if (skb == sk->sk_send_head)
704 			tcp_push_one(sk, mss_now);
705 		continue;
706 
707 wait_for_sndbuf:
708 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
709 wait_for_memory:
710 		if (copied)
711 			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
712 
713 		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
714 			goto do_error;
715 
716 		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
717 		size_goal = tp->xmit_size_goal;
718 	}
719 
720 out:
721 	if (copied)
722 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
723 	return copied;
724 
725 do_error:
726 	if (copied)
727 		goto out;
728 out_err:
729 	return sk_stream_error(sk, flags, err);
730 }
731 
732 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
733 		     size_t size, int flags)
734 {
735 	ssize_t res;
736 	struct sock *sk = sock->sk;
737 
738 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
739 
740 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
741 	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
742 		return sock_no_sendpage(sock, page, offset, size, flags);
743 
744 #undef TCP_ZC_CSUM_FLAGS
745 
746 	lock_sock(sk);
747 	TCP_CHECK_TIMER(sk);
748 	res = do_tcp_sendpages(sk, &page, offset, size, flags);
749 	TCP_CHECK_TIMER(sk);
750 	release_sock(sk);
751 	return res;
752 }
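
/*
 * Illustrative userspace sketch (not part of this file): tcp_sendpage() is
 * the path taken when userspace asks for zero-copy transmission, e.g. via
 * sendfile(2) on a TCP socket. If the route's device cannot do scatter-
 * gather plus hardware checksumming, the code above falls back to
 * sock_no_sendpage(), which copies the data instead. sock_fd/file_fd are
 * assumed to be an established TCP socket and an open regular file.
 *
 *	#include <sys/sendfile.h>
 *
 *	off_t off = 0;
 *	ssize_t n = sendfile(sock_fd, file_fd, &off, file_size);
 *	if (n < 0)
 *		perror("sendfile");
 */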
753 
754 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
755 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
756 
757 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
758 {
759 	int tmp = tp->mss_cache;
760 
761 	if (sk->sk_route_caps & NETIF_F_SG) {
762 		if (sk->sk_route_caps & NETIF_F_TSO)
763 			tmp = 0;
764 		else {
765 			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
766 
767 			if (tmp >= pgbreak &&
768 			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
769 				tmp = pgbreak;
770 		}
771 	}
772 
773 	return tmp;
774 }
775 
776 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
777 		size_t size)
778 {
779 	struct iovec *iov;
780 	struct tcp_sock *tp = tcp_sk(sk);
781 	struct sk_buff *skb;
782 	int iovlen, flags;
783 	int mss_now, size_goal;
784 	int err, copied;
785 	long timeo;
786 
787 	lock_sock(sk);
788 	TCP_CHECK_TIMER(sk);
789 
790 	flags = msg->msg_flags;
791 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
792 
793 	/* Wait for the connection to finish being established. */
794 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
795 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
796 			goto out_err;
797 
798 	/* This should be in poll */
799 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
800 
801 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
802 	size_goal = tp->xmit_size_goal;
803 
804 	/* Ok commence sending. */
805 	iovlen = msg->msg_iovlen;
806 	iov = msg->msg_iov;
807 	copied = 0;
808 
809 	err = -EPIPE;
810 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
811 		goto do_error;
812 
813 	while (--iovlen >= 0) {
814 		int seglen = iov->iov_len;
815 		unsigned char __user *from = iov->iov_base;
816 
817 		iov++;
818 
819 		while (seglen > 0) {
820 			int copy;
821 
822 			skb = sk->sk_write_queue.prev;
823 
824 			if (!sk->sk_send_head ||
825 			    (copy = size_goal - skb->len) <= 0) {
826 
827 new_segment:
828 				/* Allocate a new segment. If the interface is SG,
829 				 * allocate an skb that fits in a single page.
830 				 */
831 				if (!sk_stream_memory_free(sk))
832 					goto wait_for_sndbuf;
833 
834 				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
835 							   0, sk->sk_allocation);
836 				if (!skb)
837 					goto wait_for_memory;
838 
839 				/*
840 				 * Check whether we can use HW checksum.
841 				 */
842 				if (sk->sk_route_caps &
843 				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
844 				     NETIF_F_HW_CSUM))
845 					skb->ip_summed = CHECKSUM_HW;
846 
847 				skb_entail(sk, tp, skb);
848 				copy = size_goal;
849 			}
850 
851 			/* Try to append data to the end of skb. */
852 			if (copy > seglen)
853 				copy = seglen;
854 
855 			/* Where to copy to? */
856 			if (skb_tailroom(skb) > 0) {
857 				/* We have some space in skb head. Superb! */
858 				if (copy > skb_tailroom(skb))
859 					copy = skb_tailroom(skb);
860 				if ((err = skb_add_data(skb, from, copy)) != 0)
861 					goto do_fault;
862 			} else {
863 				int merge = 0;
864 				int i = skb_shinfo(skb)->nr_frags;
865 				struct page *page = TCP_PAGE(sk);
866 				int off = TCP_OFF(sk);
867 
868 				if (skb_can_coalesce(skb, i, page, off) &&
869 				    off != PAGE_SIZE) {
870 					/* We can extend the last page
871 					 * fragment. */
872 					merge = 1;
873 				} else if (i == MAX_SKB_FRAGS ||
874 					   (!i &&
875 					   !(sk->sk_route_caps & NETIF_F_SG))) {
876 					/* Need to add new fragment and cannot
877 					 * do this because interface is non-SG,
878 					 * or because all the page slots are
879 					 * busy. */
880 					tcp_mark_push(tp, skb);
881 					goto new_segment;
882 				} else if (page) {
883 					if (off == PAGE_SIZE) {
884 						put_page(page);
885 						TCP_PAGE(sk) = page = NULL;
886 					}
887 				}
888 
889 				if (!page) {
890 					/* Allocate new cache page. */
891 					if (!(page = sk_stream_alloc_page(sk)))
892 						goto wait_for_memory;
893 					off = 0;
894 				}
895 
896 				if (copy > PAGE_SIZE - off)
897 					copy = PAGE_SIZE - off;
898 
899 				/* Time to copy data. We are close to
900 				 * the end! */
901 				err = skb_copy_to_page(sk, from, skb, page,
902 						       off, copy);
903 				if (err) {
904 					/* If this page was new, give it to the
905 					 * socket so it does not get leaked.
906 					 */
907 					if (!TCP_PAGE(sk)) {
908 						TCP_PAGE(sk) = page;
909 						TCP_OFF(sk) = 0;
910 					}
911 					goto do_error;
912 				}
913 
914 				/* Update the skb. */
915 				if (merge) {
916 					skb_shinfo(skb)->frags[i - 1].size +=
917 									copy;
918 				} else {
919 					skb_fill_page_desc(skb, i, page, off, copy);
920 					if (TCP_PAGE(sk)) {
921 						get_page(page);
922 					} else if (off + copy < PAGE_SIZE) {
923 						get_page(page);
924 						TCP_PAGE(sk) = page;
925 					}
926 				}
927 
928 				TCP_OFF(sk) = off + copy;
929 			}
930 
931 			if (!copied)
932 				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
933 
934 			tp->write_seq += copy;
935 			TCP_SKB_CB(skb)->end_seq += copy;
936 			skb_shinfo(skb)->tso_segs = 0;
937 
938 			from += copy;
939 			copied += copy;
940 			if ((seglen -= copy) == 0 && iovlen == 0)
941 				goto out;
942 
943 			if (skb->len < mss_now || (flags & MSG_OOB))
944 				continue;
945 
946 			if (forced_push(tp)) {
947 				tcp_mark_push(tp, skb);
948 				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
949 			} else if (skb == sk->sk_send_head)
950 				tcp_push_one(sk, mss_now);
951 			continue;
952 
953 wait_for_sndbuf:
954 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
955 wait_for_memory:
956 			if (copied)
957 				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
958 
959 			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
960 				goto do_error;
961 
962 			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
963 			size_goal = tp->xmit_size_goal;
964 		}
965 	}
966 
967 out:
968 	if (copied)
969 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
970 	TCP_CHECK_TIMER(sk);
971 	release_sock(sk);
972 	return copied;
973 
974 do_fault:
975 	if (!skb->len) {
976 		if (sk->sk_send_head == skb)
977 			sk->sk_send_head = NULL;
978 		__skb_unlink(skb, skb->list);
979 		sk_stream_free_skb(sk, skb);
980 	}
981 
982 do_error:
983 	if (copied)
984 		goto out;
985 out_err:
986 	err = sk_stream_error(sk, flags, err);
987 	TCP_CHECK_TIMER(sk);
988 	release_sock(sk);
989 	return err;
990 }
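
/*
 * Summary of the data placement strategy in tcp_sendmsg() above: new data
 * is appended to the last skb on the write queue while that skb is below
 * size_goal. Bytes go into the skb's linear tailroom when there is any,
 * otherwise into page fragments, coalescing with the last fragment when
 * possible and reusing the per-socket TCP_PAGE()/TCP_OFF() staging page.
 * Frames are pushed out when forced_push() triggers, via tcp_push_one()
 * for a lone new segment, and by the final tcp_push() on exit.
 */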
991 
992 /*
993  *	Handle reading urgent data. BSD has very simple semantics for
994  *	this, no blocking and very strange errors 8)
995  */
996 
997 static int tcp_recv_urg(struct sock *sk, long timeo,
998 			struct msghdr *msg, int len, int flags,
999 			int *addr_len)
1000 {
1001 	struct tcp_sock *tp = tcp_sk(sk);
1002 
1003 	/* No URG data to read. */
1004 	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1005 	    tp->urg_data == TCP_URG_READ)
1006 		return -EINVAL;	/* Yes this is right ! */
1007 
1008 	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1009 		return -ENOTCONN;
1010 
1011 	if (tp->urg_data & TCP_URG_VALID) {
1012 		int err = 0;
1013 		char c = tp->urg_data;
1014 
1015 		if (!(flags & MSG_PEEK))
1016 			tp->urg_data = TCP_URG_READ;
1017 
1018 		/* Read urgent data. */
1019 		msg->msg_flags |= MSG_OOB;
1020 
1021 		if (len > 0) {
1022 			if (!(flags & MSG_TRUNC))
1023 				err = memcpy_toiovec(msg->msg_iov, &c, 1);
1024 			len = 1;
1025 		} else
1026 			msg->msg_flags |= MSG_TRUNC;
1027 
1028 		return err ? -EFAULT : len;
1029 	}
1030 
1031 	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1032 		return 0;
1033 
1034 	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1035 	 * the available implementations agree in this case:
1036 	 * this call should never block, independent of the
1037 	 * blocking state of the socket.
1038 	 * Mike <pall@rz.uni-karlsruhe.de>
1039 	 */
1040 	return -EAGAIN;
1041 }
1042 
1043 /* Clean up the receive buffer for full frames taken by the user,
1044  * then send an ACK if necessary.  COPIED is the number of bytes
1045  * tcp_recvmsg has given to the user so far; it speeds up the
1046  * calculation of whether or not we must ACK for the sake of
1047  * a window update.
1048  */
1049 static void cleanup_rbuf(struct sock *sk, int copied)
1050 {
1051 	struct tcp_sock *tp = tcp_sk(sk);
1052 	int time_to_ack = 0;
1053 
1054 #if TCP_DEBUG
1055 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1056 
1057 	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1058 #endif
1059 
1060 	if (tcp_ack_scheduled(tp)) {
1061 		   /* Delayed ACKs frequently hit locked sockets during bulk
1062 		    * receive. */
1063 		if (tp->ack.blocked ||
1064 		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1065 		    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1066 		    /*
1067 		     * If this read emptied the read buffer, we send an ACK when
1068 		     * the connection is not bidirectional, the user has drained
1069 		     * the receive buffer, and there was a small segment
1070 		     * in the queue.
1071 		     */
1072 		    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1073 		     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1074 			time_to_ack = 1;
1075 	}
1076 
1077 	/* We send an ACK if we can now advertise a non-zero window
1078 	 * which has been raised "significantly".
1079 	 *
1080 	 * Even if the window is raised up to infinity, do not send a window-
1081 	 * opening ACK in states where we will not receive more. It is useless.
1082 	 */
1083 	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1084 		__u32 rcv_window_now = tcp_receive_window(tp);
1085 
1086 		/* Optimize, __tcp_select_window() is not cheap. */
1087 		if (2*rcv_window_now <= tp->window_clamp) {
1088 			__u32 new_window = __tcp_select_window(sk);
1089 
1090 			/* Send an ACK now if this read freed lots of space
1091 			 * in our buffer. new_window is, of course, the newly
1092 			 * computed window; we can advertise it now if it is
1093 			 * not less than the current one. "Lots" means "at least twice" here.
1094 			 */
1095 			if (new_window && new_window >= 2 * rcv_window_now)
1096 				time_to_ack = 1;
1097 		}
1098 	}
1099 	if (time_to_ack)
1100 		tcp_send_ack(sk);
1101 }
1102 
1103 static void tcp_prequeue_process(struct sock *sk)
1104 {
1105 	struct sk_buff *skb;
1106 	struct tcp_sock *tp = tcp_sk(sk);
1107 
1108 	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1109 
1110 	/* RX process wants to run with disabled BHs, though it is not
1111 	 * necessary */
1112 	local_bh_disable();
1113 	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1114 		sk->sk_backlog_rcv(sk, skb);
1115 	local_bh_enable();
1116 
1117 	/* Clear memory counter. */
1118 	tp->ucopy.memory = 0;
1119 }
1120 
1121 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1122 {
1123 	struct sk_buff *skb;
1124 	u32 offset;
1125 
1126 	skb_queue_walk(&sk->sk_receive_queue, skb) {
1127 		offset = seq - TCP_SKB_CB(skb)->seq;
1128 		if (skb->h.th->syn)
1129 			offset--;
1130 		if (offset < skb->len || skb->h.th->fin) {
1131 			*off = offset;
1132 			return skb;
1133 		}
1134 	}
1135 	return NULL;
1136 }
1137 
1138 /*
1139  * This routine provides an alternative to tcp_recvmsg() for routines
1140  * that would like to handle copying from skbuffs directly in 'sendfile'
1141  * fashion.
1142  * Note:
1143  *	- It is assumed that the socket was locked by the caller.
1144  *	- The routine does not block.
1145  *	- At present, there is no support for reading OOB data
1146  *	  or for 'peeking' the socket using this routine
1147  *	  (although both would be easy to implement).
1148  */
1149 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1150 		  sk_read_actor_t recv_actor)
1151 {
1152 	struct sk_buff *skb;
1153 	struct tcp_sock *tp = tcp_sk(sk);
1154 	u32 seq = tp->copied_seq;
1155 	u32 offset;
1156 	int copied = 0;
1157 
1158 	if (sk->sk_state == TCP_LISTEN)
1159 		return -ENOTCONN;
1160 	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1161 		if (offset < skb->len) {
1162 			size_t used, len;
1163 
1164 			len = skb->len - offset;
1165 			/* Stop reading if we hit a patch of urgent data */
1166 			if (tp->urg_data) {
1167 				u32 urg_offset = tp->urg_seq - seq;
1168 				if (urg_offset < len)
1169 					len = urg_offset;
1170 				if (!len)
1171 					break;
1172 			}
1173 			used = recv_actor(desc, skb, offset, len);
1174 			if (used <= len) {
1175 				seq += used;
1176 				copied += used;
1177 				offset += used;
1178 			}
1179 			if (offset != skb->len)
1180 				break;
1181 		}
1182 		if (skb->h.th->fin) {
1183 			sk_eat_skb(sk, skb);
1184 			++seq;
1185 			break;
1186 		}
1187 		sk_eat_skb(sk, skb);
1188 		if (!desc->count)
1189 			break;
1190 	}
1191 	tp->copied_seq = seq;
1192 
1193 	tcp_rcv_space_adjust(sk);
1194 
1195 	/* Clean up data we have read: This will do ACK frames. */
1196 	if (copied)
1197 		cleanup_rbuf(sk, copied);
1198 	return copied;
1199 }
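
/*
 * The recv_actor callback handed to tcp_read_sock() returns how many bytes
 * it consumed (at most 'len'). Reading stops early at urgent data, when an
 * skb is only partially consumed, or when desc->count drops to zero, and
 * tp->copied_seq advances only past the bytes actually consumed. Callers
 * that want to hand skb contents to another subsystem in the 'sendfile'
 * fashion described above use this to avoid an extra copy to userspace.
 */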
1200 
1201 /*
1202  *	This routine copies from a sock struct into the user buffer.
1203  *
1204  *	Technical note: in 2.3 we work on a _locked_ socket, so that
1205  *	tricks with *seq access order and skb->users are not required.
1206  *	Probably, the code can easily be improved even more.
1207  */
1208 
1209 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1210 		size_t len, int nonblock, int flags, int *addr_len)
1211 {
1212 	struct tcp_sock *tp = tcp_sk(sk);
1213 	int copied = 0;
1214 	u32 peek_seq;
1215 	u32 *seq;
1216 	unsigned long used;
1217 	int err;
1218 	int target;		/* Read at least this many bytes */
1219 	long timeo;
1220 	struct task_struct *user_recv = NULL;
1221 
1222 	lock_sock(sk);
1223 
1224 	TCP_CHECK_TIMER(sk);
1225 
1226 	err = -ENOTCONN;
1227 	if (sk->sk_state == TCP_LISTEN)
1228 		goto out;
1229 
1230 	timeo = sock_rcvtimeo(sk, nonblock);
1231 
1232 	/* Urgent data needs to be handled specially. */
1233 	if (flags & MSG_OOB)
1234 		goto recv_urg;
1235 
1236 	seq = &tp->copied_seq;
1237 	if (flags & MSG_PEEK) {
1238 		peek_seq = tp->copied_seq;
1239 		seq = &peek_seq;
1240 	}
1241 
1242 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1243 
1244 	do {
1245 		struct sk_buff *skb;
1246 		u32 offset;
1247 
1248 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1249 		if (tp->urg_data && tp->urg_seq == *seq) {
1250 			if (copied)
1251 				break;
1252 			if (signal_pending(current)) {
1253 				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1254 				break;
1255 			}
1256 		}
1257 
1258 		/* Next get a buffer. */
1259 
1260 		skb = skb_peek(&sk->sk_receive_queue);
1261 		do {
1262 			if (!skb)
1263 				break;
1264 
1265 			/* Now that we have two receive queues this
1266 			 * shouldn't happen.
1267 			 */
1268 			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1269 				printk(KERN_INFO "recvmsg bug: copied %X "
1270 				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1271 				break;
1272 			}
1273 			offset = *seq - TCP_SKB_CB(skb)->seq;
1274 			if (skb->h.th->syn)
1275 				offset--;
1276 			if (offset < skb->len)
1277 				goto found_ok_skb;
1278 			if (skb->h.th->fin)
1279 				goto found_fin_ok;
1280 			BUG_TRAP(flags & MSG_PEEK);
1281 			skb = skb->next;
1282 		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1283 
1284 		/* Well, if we have backlog, try to process it now. */
1285 
1286 		if (copied >= target && !sk->sk_backlog.tail)
1287 			break;
1288 
1289 		if (copied) {
1290 			if (sk->sk_err ||
1291 			    sk->sk_state == TCP_CLOSE ||
1292 			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1293 			    !timeo ||
1294 			    signal_pending(current) ||
1295 			    (flags & MSG_PEEK))
1296 				break;
1297 		} else {
1298 			if (sock_flag(sk, SOCK_DONE))
1299 				break;
1300 
1301 			if (sk->sk_err) {
1302 				copied = sock_error(sk);
1303 				break;
1304 			}
1305 
1306 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1307 				break;
1308 
1309 			if (sk->sk_state == TCP_CLOSE) {
1310 				if (!sock_flag(sk, SOCK_DONE)) {
1311 					/* This occurs when the user tries to read
1312 					 * from a never-connected socket.
1313 					 */
1314 					copied = -ENOTCONN;
1315 					break;
1316 				}
1317 				break;
1318 			}
1319 
1320 			if (!timeo) {
1321 				copied = -EAGAIN;
1322 				break;
1323 			}
1324 
1325 			if (signal_pending(current)) {
1326 				copied = sock_intr_errno(timeo);
1327 				break;
1328 			}
1329 		}
1330 
1331 		cleanup_rbuf(sk, copied);
1332 
1333 		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1334 			/* Install new reader */
1335 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1336 				user_recv = current;
1337 				tp->ucopy.task = user_recv;
1338 				tp->ucopy.iov = msg->msg_iov;
1339 			}
1340 
1341 			tp->ucopy.len = len;
1342 
1343 			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1344 				 (flags & (MSG_PEEK | MSG_TRUNC)));
1345 
1346 			/* Ugly... If the prequeue is not empty, we have to
1347 			 * process it before releasing the socket, otherwise
1348 			 * ordering will be broken at the second iteration.
1349 			 * A more elegant solution is required!!!
1350 			 *
1351 			 * Look: we have the following (pseudo)queues:
1352 			 *
1353 			 * 1. packets in flight
1354 			 * 2. backlog
1355 			 * 3. prequeue
1356 			 * 4. receive_queue
1357 			 *
1358 			 * Each queue can be processed only if the next ones
1359 			 * are empty. At this point we have an empty receive_queue.
1360 			 * But the prequeue _can_ be non-empty after the 2nd iteration,
1361 			 * when we jumped to the start of the loop because backlog
1362 			 * processing added something to the receive_queue.
1363 			 * We cannot release_sock(), because the backlog contains
1364 			 * packets that arrived _after_ the prequeued ones.
1365 			 *
1366 			 * In short, the algorithm is clear --- process all
1367 			 * the queues in order. We could do this more directly by
1368 			 * requeueing packets from the backlog to the prequeue when
1369 			 * it is not empty. That is more elegant, but eats cycles,
1370 			 * unfortunately.
1371 			 */
1372 			if (!skb_queue_empty(&tp->ucopy.prequeue))
1373 				goto do_prequeue;
1374 
1375 			/* __ Set realtime policy in scheduler __ */
1376 		}
1377 
1378 		if (copied >= target) {
1379 			/* Do not sleep, just process backlog. */
1380 			release_sock(sk);
1381 			lock_sock(sk);
1382 		} else
1383 			sk_wait_data(sk, &timeo);
1384 
1385 		if (user_recv) {
1386 			int chunk;
1387 
1388 			/* __ Restore normal policy in scheduler __ */
1389 
1390 			if ((chunk = len - tp->ucopy.len) != 0) {
1391 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1392 				len -= chunk;
1393 				copied += chunk;
1394 			}
1395 
1396 			if (tp->rcv_nxt == tp->copied_seq &&
1397 			    !skb_queue_empty(&tp->ucopy.prequeue)) {
1398 do_prequeue:
1399 				tcp_prequeue_process(sk);
1400 
1401 				if ((chunk = len - tp->ucopy.len) != 0) {
1402 					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1403 					len -= chunk;
1404 					copied += chunk;
1405 				}
1406 			}
1407 		}
1408 		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1409 			if (net_ratelimit())
1410 				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1411 				       current->comm, current->pid);
1412 			peek_seq = tp->copied_seq;
1413 		}
1414 		continue;
1415 
1416 	found_ok_skb:
1417 		/* Ok so how much can we use? */
1418 		used = skb->len - offset;
1419 		if (len < used)
1420 			used = len;
1421 
1422 		/* Do we have urgent data here? */
1423 		if (tp->urg_data) {
1424 			u32 urg_offset = tp->urg_seq - *seq;
1425 			if (urg_offset < used) {
1426 				if (!urg_offset) {
1427 					if (!sock_flag(sk, SOCK_URGINLINE)) {
1428 						++*seq;
1429 						offset++;
1430 						used--;
1431 						if (!used)
1432 							goto skip_copy;
1433 					}
1434 				} else
1435 					used = urg_offset;
1436 			}
1437 		}
1438 
1439 		if (!(flags & MSG_TRUNC)) {
1440 			err = skb_copy_datagram_iovec(skb, offset,
1441 						      msg->msg_iov, used);
1442 			if (err) {
1443 				/* Exception. Bailout! */
1444 				if (!copied)
1445 					copied = -EFAULT;
1446 				break;
1447 			}
1448 		}
1449 
1450 		*seq += used;
1451 		copied += used;
1452 		len -= used;
1453 
1454 		tcp_rcv_space_adjust(sk);
1455 
1456 skip_copy:
1457 		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1458 			tp->urg_data = 0;
1459 			tcp_fast_path_check(sk, tp);
1460 		}
1461 		if (used + offset < skb->len)
1462 			continue;
1463 
1464 		if (skb->h.th->fin)
1465 			goto found_fin_ok;
1466 		if (!(flags & MSG_PEEK))
1467 			sk_eat_skb(sk, skb);
1468 		continue;
1469 
1470 	found_fin_ok:
1471 		/* Process the FIN. */
1472 		++*seq;
1473 		if (!(flags & MSG_PEEK))
1474 			sk_eat_skb(sk, skb);
1475 		break;
1476 	} while (len > 0);
1477 
1478 	if (user_recv) {
1479 		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1480 			int chunk;
1481 
1482 			tp->ucopy.len = copied > 0 ? len : 0;
1483 
1484 			tcp_prequeue_process(sk);
1485 
1486 			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1487 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1488 				len -= chunk;
1489 				copied += chunk;
1490 			}
1491 		}
1492 
1493 		tp->ucopy.task = NULL;
1494 		tp->ucopy.len = 0;
1495 	}
1496 
1497 	/* According to UNIX98, msg_name/msg_namelen are ignored
1498 	 * on a connected socket. I was just happy when I found this 8) --ANK
1499 	 */
1500 
1501 	/* Clean up data we have read: This will do ACK frames. */
1502 	cleanup_rbuf(sk, copied);
1503 
1504 	TCP_CHECK_TIMER(sk);
1505 	release_sock(sk);
1506 	return copied;
1507 
1508 out:
1509 	TCP_CHECK_TIMER(sk);
1510 	release_sock(sk);
1511 	return err;
1512 
1513 recv_urg:
1514 	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1515 	goto out;
1516 }
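
/*
 * Illustrative userspace sketch (not part of this file): the 'target'
 * computed from sock_rcvlowat() above is what makes MSG_WAITALL work.
 * Without it, recv() may return as soon as any data is queued; with
 * MSG_WAITALL, tcp_recvmsg() keeps looping until 'len' bytes have been
 * copied, or until EOF, a signal or an error cuts the read short.
 * sock_fd is assumed to be a connected TCP socket.
 *
 *	ssize_t n = recv(sock_fd, buf, sizeof(buf), MSG_WAITALL);
 *	if (n >= 0 && (size_t)n < sizeof(buf))
 *		;	// short read: FIN, signal or error ended the wait early
 */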
1517 
1518 /*
1519  *	State processing on a close. This implements the state shift for
1520  *	sending our FIN frame. Note that we only send a FIN for some
1521  *	states. A shutdown() may have already sent the FIN, or we may be
1522  *	closed.
1523  */
1524 
1525 static unsigned char new_state[16] = {
1526   /* current state:        new state:      action:	*/
1527   /* (Invalid)		*/ TCP_CLOSE,
1528   /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1529   /* TCP_SYN_SENT	*/ TCP_CLOSE,
1530   /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1531   /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1532   /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1533   /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1534   /* TCP_CLOSE		*/ TCP_CLOSE,
1535   /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1536   /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1537   /* TCP_LISTEN		*/ TCP_CLOSE,
1538   /* TCP_CLOSING	*/ TCP_CLOSING,
1539 };
1540 
1541 static int tcp_close_state(struct sock *sk)
1542 {
1543 	int next = (int)new_state[sk->sk_state];
1544 	int ns = next & TCP_STATE_MASK;
1545 
1546 	tcp_set_state(sk, ns);
1547 
1548 	return next & TCP_ACTION_FIN;
1549 }
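
/*
 * Example of the table above: closing a socket in TCP_CLOSE_WAIT yields
 * TCP_LAST_ACK | TCP_ACTION_FIN, so tcp_close_state() moves the socket to
 * LAST_ACK and returns non-zero, telling its callers (tcp_shutdown() and
 * tcp_close()) that a FIN must be sent. Closing in TCP_SYN_SENT maps
 * straight to TCP_CLOSE and no FIN is sent.
 */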
1550 
1551 /*
1552  *	Shutdown the sending side of a connection. Much like close except
1553  *	that we don't shut down the receive side or set SOCK_DEAD.
1554  */
1555 
1556 void tcp_shutdown(struct sock *sk, int how)
1557 {
1558 	/*	We need to grab some memory, and put together a FIN,
1559 	 *	and then put it into the queue to be sent.
1560 	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1561 	 */
1562 	if (!(how & SEND_SHUTDOWN))
1563 		return;
1564 
1565 	/* If we've already sent a FIN, or it's a closed state, skip this. */
1566 	if ((1 << sk->sk_state) &
1567 	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1568 	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1569 		/* Clear out any half completed packets.  FIN if needed. */
1570 		if (tcp_close_state(sk))
1571 			tcp_send_fin(sk);
1572 	}
1573 }
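
/*
 * Illustrative userspace sketch (not part of this file): a half-close via
 * shutdown(2) with SHUT_WR reaches tcp_shutdown() with SEND_SHUTDOWN set,
 * sends our FIN, and still lets the peer's remaining data be read.
 * consume() is a hypothetical application helper; buf/n are assumed to be
 * declared by the caller.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	shutdown(sock_fd, SHUT_WR);		// we are done sending
 *	while ((n = read(sock_fd, buf, sizeof(buf))) > 0)
 *		consume(buf, n);		// drain whatever the peer still sends
 *	close(sock_fd);
 */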
1574 
1575 /*
1576  * At this point, there should be no process reference to this
1577  * socket, and thus no user references at all.  Therefore we
1578  * can assume the socket waitqueue is inactive and nobody will
1579  * try to jump onto it.
1580  */
1581 void tcp_destroy_sock(struct sock *sk)
1582 {
1583 	BUG_TRAP(sk->sk_state == TCP_CLOSE);
1584 	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1585 
1586 	/* It cannot be in hash table! */
1587 	BUG_TRAP(sk_unhashed(sk));
1588 
1589 	/* If inet_sk(sk)->num is non-zero, it must be bound. */
1590 	BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1591 
1592 	sk->sk_prot->destroy(sk);
1593 
1594 	sk_stream_kill_queues(sk);
1595 
1596 	xfrm_sk_free_policy(sk);
1597 
1598 #ifdef INET_REFCNT_DEBUG
1599 	if (atomic_read(&sk->sk_refcnt) != 1) {
1600 		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1601 		       sk, atomic_read(&sk->sk_refcnt));
1602 	}
1603 #endif
1604 
1605 	atomic_dec(&tcp_orphan_count);
1606 	sock_put(sk);
1607 }
1608 
1609 void tcp_close(struct sock *sk, long timeout)
1610 {
1611 	struct sk_buff *skb;
1612 	int data_was_unread = 0;
1613 
1614 	lock_sock(sk);
1615 	sk->sk_shutdown = SHUTDOWN_MASK;
1616 
1617 	if (sk->sk_state == TCP_LISTEN) {
1618 		tcp_set_state(sk, TCP_CLOSE);
1619 
1620 		/* Special case. */
1621 		tcp_listen_stop(sk);
1622 
1623 		goto adjudge_to_death;
1624 	}
1625 
1626 	/*  We need to flush the recv. buffs.  We do this only on the
1627 	 *  descriptor close, not protocol-sourced closes, because the
1628 	 *  reader process may not have drained the data yet!
1629 	 */
1630 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1631 		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1632 			  skb->h.th->fin;
1633 		data_was_unread += len;
1634 		__kfree_skb(skb);
1635 	}
1636 
1637 	sk_stream_mem_reclaim(sk);
1638 
1639 	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1640 	 * 3.10, we send a RST here because data was lost.  To
1641 	 * witness the awful effects of the old behavior of always
1642 	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1643 	 * a bulk GET in an FTP client, suspend the process, wait
1644 	 * for the client to advertise a zero window, then kill -9
1645 	 * the FTP client, wheee...  Note: timeout is always zero
1646 	 * in such a case.
1647 	 */
1648 	if (data_was_unread) {
1649 		/* Unread data was tossed, zap the connection. */
1650 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1651 		tcp_set_state(sk, TCP_CLOSE);
1652 		tcp_send_active_reset(sk, GFP_KERNEL);
1653 	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1654 		/* Check zero linger _after_ checking for unread data. */
1655 		sk->sk_prot->disconnect(sk, 0);
1656 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1657 	} else if (tcp_close_state(sk)) {
1658 		/* We FIN if the application ate all the data before
1659 		 * zapping the connection.
1660 		 */
1661 
1662 		/* RED-PEN. Formally speaking, we have broken TCP state
1663 		 * machine. State transitions:
1664 		 *
1665 		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1666 		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1667 		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1668 		 *
1669 		 * are legal only when FIN has been sent (i.e. in window),
1670 		 * rather than queued out of window. Purists blame.
1671 		 *
1672 		 * F.e. "RFC state" is ESTABLISHED,
1673 		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1674 		 *
1675 		 * The visible deviations are that sometimes we enter the
1676 		 * time-wait state when it is not really required (harmless),
1677 		 * and do not send active resets when they are required by the
1678 		 * specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when they look
1679 		 * like CLOSING or LAST_ACK to Linux).
1680 		 * Probably, I missed some more small holes.
1681 		 * 						--ANK
1682 		 */
1683 		tcp_send_fin(sk);
1684 	}
1685 
1686 	sk_stream_wait_close(sk, timeout);
1687 
1688 adjudge_to_death:
1689 	/* It is the last release_sock in its life. It will remove backlog. */
1690 	release_sock(sk);
1691 
1692 
1693 	/* Now socket is owned by kernel and we acquire BH lock
1694 	   to finish close. No need to check for user refs.
1695 	 */
1696 	local_bh_disable();
1697 	bh_lock_sock(sk);
1698 	BUG_TRAP(!sock_owned_by_user(sk));
1699 
1700 	sock_hold(sk);
1701 	sock_orphan(sk);
1702 
1703 	/*	This is a (useful) BSD violation of the RFC. There is a
1704 	 *	problem with TCP as specified, in that the other end could
1705 	 *	keep a socket open forever with no application left on this end.
1706 	 *	We use a 3 minute timeout (about the same as BSD) and then kill
1707 	 *	our end. If they send after that then tough - BUT: long enough
1708 	 *	that we won't make the old "4*rto = almost no time - whoops,
1709 	 *	reset" mistake.
1710 	 *
1711 	 *	Nope, it was not a mistake. It is really desired behaviour,
1712 	 *	e.g. on HTTP servers, where such sockets are useless but
1713 	 *	consume significant resources. Let's do it with the special
1714 	 *	linger2	option.					--ANK
1715 	 */
1716 
1717 	if (sk->sk_state == TCP_FIN_WAIT2) {
1718 		struct tcp_sock *tp = tcp_sk(sk);
1719 		if (tp->linger2 < 0) {
1720 			tcp_set_state(sk, TCP_CLOSE);
1721 			tcp_send_active_reset(sk, GFP_ATOMIC);
1722 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1723 		} else {
1724 			int tmo = tcp_fin_time(tp);
1725 
1726 			if (tmo > TCP_TIMEWAIT_LEN) {
1727 				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1728 			} else {
1729 				atomic_inc(&tcp_orphan_count);
1730 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1731 				goto out;
1732 			}
1733 		}
1734 	}
1735 	if (sk->sk_state != TCP_CLOSE) {
1736 		sk_stream_mem_reclaim(sk);
1737 		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1738 		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1739 		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1740 			if (net_ratelimit())
1741 				printk(KERN_INFO "TCP: too many orphaned "
1742 				       "sockets\n");
1743 			tcp_set_state(sk, TCP_CLOSE);
1744 			tcp_send_active_reset(sk, GFP_ATOMIC);
1745 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1746 		}
1747 	}
1748 	atomic_inc(&tcp_orphan_count);
1749 
1750 	if (sk->sk_state == TCP_CLOSE)
1751 		tcp_destroy_sock(sk);
1752 	/* Otherwise, socket is reprieved until protocol close. */
1753 
1754 out:
1755 	bh_unlock_sock(sk);
1756 	local_bh_enable();
1757 	sock_put(sk);
1758 }
1759 
1760 /* These states need RST on ABORT according to RFC793 */
1761 
1762 static inline int tcp_need_reset(int state)
1763 {
1764 	return (1 << state) &
1765 	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1766 		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1767 }
1768 
1769 int tcp_disconnect(struct sock *sk, int flags)
1770 {
1771 	struct inet_sock *inet = inet_sk(sk);
1772 	struct tcp_sock *tp = tcp_sk(sk);
1773 	int err = 0;
1774 	int old_state = sk->sk_state;
1775 
1776 	if (old_state != TCP_CLOSE)
1777 		tcp_set_state(sk, TCP_CLOSE);
1778 
1779 	/* ABORT function of RFC793 */
1780 	if (old_state == TCP_LISTEN) {
1781 		tcp_listen_stop(sk);
1782 	} else if (tcp_need_reset(old_state) ||
1783 		   (tp->snd_nxt != tp->write_seq &&
1784 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1785 		/* The last check adjusts for the discrepancy between Linux
1786 		 * and the RFC states.
1787 		 */
1788 		tcp_send_active_reset(sk, gfp_any());
1789 		sk->sk_err = ECONNRESET;
1790 	} else if (old_state == TCP_SYN_SENT)
1791 		sk->sk_err = ECONNRESET;
1792 
1793 	tcp_clear_xmit_timers(sk);
1794 	__skb_queue_purge(&sk->sk_receive_queue);
1795 	sk_stream_writequeue_purge(sk);
1796 	__skb_queue_purge(&tp->out_of_order_queue);
1797 
1798 	inet->dport = 0;
1799 
1800 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1801 		inet_reset_saddr(sk);
1802 
1803 	sk->sk_shutdown = 0;
1804 	sock_reset_flag(sk, SOCK_DONE);
1805 	tp->srtt = 0;
1806 	if ((tp->write_seq += tp->max_window + 2) == 0)
1807 		tp->write_seq = 1;
1808 	tp->backoff = 0;
1809 	tp->snd_cwnd = 2;
1810 	tp->probes_out = 0;
1811 	tp->packets_out = 0;
1812 	tp->snd_ssthresh = 0x7fffffff;
1813 	tp->snd_cwnd_cnt = 0;
1814 	tcp_set_ca_state(tp, TCP_CA_Open);
1815 	tcp_clear_retrans(tp);
1816 	tcp_delack_init(tp);
1817 	sk->sk_send_head = NULL;
1818 	tp->rx_opt.saw_tstamp = 0;
1819 	tcp_sack_reset(&tp->rx_opt);
1820 	__sk_dst_reset(sk);
1821 
1822 	BUG_TRAP(!inet->num || tp->bind_hash);
1823 
1824 	sk->sk_error_report(sk);
1825 	return err;
1826 }
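/* A usage sketch (userspace, not kernel code): tcp_disconnect() is reached,
 * via the protocol's disconnect hook, when an application calls connect()
 * with an AF_UNSPEC address on a TCP socket, returning it to an unconnected
 * state instead of destroying it. Variable names are illustrative.
 *
 *	struct sockaddr sa;
 *
 *	memset(&sa, 0, sizeof(sa));
 *	sa.sa_family = AF_UNSPEC;
 *	if (connect(fd, &sa, sizeof(sa)) < 0)
 *		perror("disconnect");
 */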
1827 
1828 /*
1829  *	Wait for an incoming connection, avoid race
1830  *	conditions. This must be called with the socket locked.
1831  */
1832 static int wait_for_connect(struct sock *sk, long timeo)
1833 {
1834 	struct tcp_sock *tp = tcp_sk(sk);
1835 	DEFINE_WAIT(wait);
1836 	int err;
1837 
1838 	/*
1839 	 * True wake-one mechanism for incoming connections: only
1840 	 * one process gets woken up, not the 'whole herd'.
1841 	 * Since we do not 'race & poll' for established sockets
1842 	 * anymore, the common case will execute the loop only once.
1843 	 *
1844 	 * Subtle issue: an exclusive waiter added with
1845 	 * "add_wait_queue_exclusive()" goes after any current non-exclusive
1846 	 * waiters, and we know that it will always _stay_ after any new
1847 	 * non-exclusive waiters because all non-exclusive waiters are added
1848 	 * at the beginning of the wait-queue. As such, it's ok to "drop"
1849 	 * our exclusiveness temporarily when we get woken up without
1850 	 * having to remove and re-insert us on the wait queue.
1851 	 */
1852 	for (;;) {
1853 		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1854 					  TASK_INTERRUPTIBLE);
1855 		release_sock(sk);
1856 		if (reqsk_queue_empty(&tp->accept_queue))
1857 			timeo = schedule_timeout(timeo);
1858 		lock_sock(sk);
1859 		err = 0;
1860 		if (!reqsk_queue_empty(&tp->accept_queue))
1861 			break;
1862 		err = -EINVAL;
1863 		if (sk->sk_state != TCP_LISTEN)
1864 			break;
1865 		err = sock_intr_errno(timeo);
1866 		if (signal_pending(current))
1867 			break;
1868 		err = -EAGAIN;
1869 		if (!timeo)
1870 			break;
1871 	}
1872 	finish_wait(sk->sk_sleep, &wait);
1873 	return err;
1874 }
1875 
1876 /*
1877  *	This will accept the next outstanding connection.
1878  */
1879 
1880 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1881 {
1882 	struct tcp_sock *tp = tcp_sk(sk);
1883 	struct sock *newsk;
1884 	int error;
1885 
1886 	lock_sock(sk);
1887 
1888 	/* We need to make sure that this socket is listening,
1889 	 * and that it has something pending.
1890 	 */
1891 	error = -EINVAL;
1892 	if (sk->sk_state != TCP_LISTEN)
1893 		goto out_err;
1894 
1895 	/* Find already established connection */
1896 	if (reqsk_queue_empty(&tp->accept_queue)) {
1897 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1898 
1899 		/* If this is a non-blocking socket, don't sleep */
1900 		error = -EAGAIN;
1901 		if (!timeo)
1902 			goto out_err;
1903 
1904 		error = wait_for_connect(sk, timeo);
1905 		if (error)
1906 			goto out_err;
1907 	}
1908 
1909 	newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1910 	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1911 out:
1912 	release_sock(sk);
1913 	return newsk;
1914 out_err:
1915 	newsk = NULL;
1916 	*err = error;
1917 	goto out;
1918 }
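/* A usage sketch (userspace, not kernel code): how tcp_accept() above looks
 * from an application. With O_NONBLOCK, sock_rcvtimeo() returns a zero
 * timeout, so an empty accept queue yields EAGAIN rather than sleeping in
 * wait_for_connect(). The name "listen_fd" is an illustrative assumption.
 *
 *	int cfd = accept(listen_fd, NULL, NULL);
 *
 *	if (cfd < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
 *		// nothing queued yet: poll() the socket and retry
 *	}
 */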
1919 
1920 /*
1921  *	Socket option code for TCP.
1922  */
1923 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1924 		   int optlen)
1925 {
1926 	struct tcp_sock *tp = tcp_sk(sk);
1927 	int val;
1928 	int err = 0;
1929 
1930 	if (level != SOL_TCP)
1931 		return tp->af_specific->setsockopt(sk, level, optname,
1932 						   optval, optlen);
1933 
1934 	/* This is a string value; all the others are ints */
1935 	if (optname == TCP_CONGESTION) {
1936 		char name[TCP_CA_NAME_MAX];
1937 
1938 		if (optlen < 1)
1939 			return -EINVAL;
1940 
1941 		val = strncpy_from_user(name, optval,
1942 					min(TCP_CA_NAME_MAX-1, optlen));
1943 		if (val < 0)
1944 			return -EFAULT;
1945 		name[val] = 0;
1946 
1947 		lock_sock(sk);
1948 		err = tcp_set_congestion_control(tp, name);
1949 		release_sock(sk);
1950 		return err;
1951 	}
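	/* A usage sketch (userspace, not kernel code): TCP_CONGESTION takes
	 * the algorithm name as a string, as handled above. "reno" is used
	 * here because tcp_reno is registered in tcp_init() below; other
	 * names depend on which congestion control modules are available.
	 *
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
	 *		   "reno", strlen("reno"));
	 */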
1952 
1953 	if (optlen < sizeof(int))
1954 		return -EINVAL;
1955 
1956 	if (get_user(val, (int __user *)optval))
1957 		return -EFAULT;
1958 
1959 	lock_sock(sk);
1960 
1961 	switch (optname) {
1962 	case TCP_MAXSEG:
1963 		/* Values greater than the interface MTU won't take effect.
1964 		 * However, at the point when this call is made we typically
1965 		 * do not yet know which interface is going to be used. */
1966 		if (val < 8 || val > MAX_TCP_WINDOW) {
1967 			err = -EINVAL;
1968 			break;
1969 		}
1970 		tp->rx_opt.user_mss = val;
1971 		break;
1972 
1973 	case TCP_NODELAY:
1974 		if (val) {
1975 			/* TCP_NODELAY is weaker than TCP_CORK, so setting
1976 			 * this option on a corked socket is remembered, but
1977 			 * it is not activated until the cork is cleared.
1978 			 *
1979 			 * However, when TCP_NODELAY is set we make
1980 			 * an explicit push, which overrides even TCP_CORK
1981 			 * for currently queued segments.
1982 			 */
1983 			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1984 			tcp_push_pending_frames(sk, tp);
1985 		} else {
1986 			tp->nonagle &= ~TCP_NAGLE_OFF;
1987 		}
1988 		break;
1989 
1990 	case TCP_CORK:
1991 		/* When set, this indicates that non-full frames should always
1992 		 * be queued.  When the user later clears the option, we
1993 		 * transmit any pending partial frames in the queue.  This is
1994 		 * meant to be used alongside sendfile() to get properly
1995 		 * filled frames when the user (for example) must write
1996 		 * out headers with a write() call first and then use
1997 		 * sendfile() to send out the data parts.
1998 		 *
1999 		 * TCP_CORK can be set together with TCP_NODELAY and it is
2000 		 * stronger than TCP_NODELAY.
2001 		 */
2002 		if (val) {
2003 			tp->nonagle |= TCP_NAGLE_CORK;
2004 		} else {
2005 			tp->nonagle &= ~TCP_NAGLE_CORK;
2006 			if (tp->nonagle&TCP_NAGLE_OFF)
2007 				tp->nonagle |= TCP_NAGLE_PUSH;
2008 			tcp_push_pending_frames(sk, tp);
2009 		}
2010 		break;
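		/* A usage sketch (userspace, not kernel code): the
		 * write()-then-sendfile() pattern described above that
		 * TCP_CORK is meant for. Descriptor and length names are
		 * illustrative assumptions.
		 *
		 *	int on = 1, off = 0;
		 *
		 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
		 *	write(fd, hdr, hdr_len);		// queue headers
		 *	sendfile(fd, file_fd, NULL, body_len);	// queue body
		 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
		 *	// clearing the cork flushes any pending partial frame
		 */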
2011 
2012 	case TCP_KEEPIDLE:
2013 		if (val < 1 || val > MAX_TCP_KEEPIDLE)
2014 			err = -EINVAL;
2015 		else {
2016 			tp->keepalive_time = val * HZ;
2017 			if (sock_flag(sk, SOCK_KEEPOPEN) &&
2018 			    !((1 << sk->sk_state) &
2019 			      (TCPF_CLOSE | TCPF_LISTEN))) {
2020 				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2021 				if (tp->keepalive_time > elapsed)
2022 					elapsed = tp->keepalive_time - elapsed;
2023 				else
2024 					elapsed = 0;
2025 				tcp_reset_keepalive_timer(sk, elapsed);
2026 			}
2027 		}
2028 		break;
2029 	case TCP_KEEPINTVL:
2030 		if (val < 1 || val > MAX_TCP_KEEPINTVL)
2031 			err = -EINVAL;
2032 		else
2033 			tp->keepalive_intvl = val * HZ;
2034 		break;
2035 	case TCP_KEEPCNT:
2036 		if (val < 1 || val > MAX_TCP_KEEPCNT)
2037 			err = -EINVAL;
2038 		else
2039 			tp->keepalive_probes = val;
2040 		break;
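	/* A usage sketch (userspace, not kernel code): the keepalive knobs
	 * above only take effect once keepalive probing is enabled with
	 * SO_KEEPALIVE, as the SOCK_KEEPOPEN check in TCP_KEEPIDLE shows.
	 * The numeric values are illustrative assumptions.
	 *
	 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
	 *
	 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
	 */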
2041 	case TCP_SYNCNT:
2042 		if (val < 1 || val > MAX_TCP_SYNCNT)
2043 			err = -EINVAL;
2044 		else
2045 			tp->syn_retries = val;
2046 		break;
2047 
2048 	case TCP_LINGER2:
2049 		if (val < 0)
2050 			tp->linger2 = -1;
2051 		else if (val > sysctl_tcp_fin_timeout / HZ)
2052 			tp->linger2 = 0;
2053 		else
2054 			tp->linger2 = val * HZ;
2055 		break;
2056 
2057 	case TCP_DEFER_ACCEPT:
2058 		tp->defer_accept = 0;
2059 		if (val > 0) {
2060 			/* Translate value in seconds to number of
2061 			 * retransmits */
2062 			while (tp->defer_accept < 32 &&
2063 			       val > ((TCP_TIMEOUT_INIT / HZ) <<
2064 				       tp->defer_accept))
2065 				tp->defer_accept++;
2066 			tp->defer_accept++;
2067 		}
2068 		break;
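	/* Worked example (an illustration, assuming TCP_TIMEOUT_INIT is 3*HZ
	 * as in this kernel): a TCP_DEFER_ACCEPT value of 10 seconds runs the
	 * loop above while 10 > 3 and 10 > 6, stops at 10 <= 12 with
	 * defer_accept == 2, and the final increment stores 3.
	 * tcp_getsockopt() below reports this back as (3 << (3 - 1)) = 12
	 * seconds.
	 */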
2069 
2070 	case TCP_WINDOW_CLAMP:
2071 		if (!val) {
2072 			if (sk->sk_state != TCP_CLOSE) {
2073 				err = -EINVAL;
2074 				break;
2075 			}
2076 			tp->window_clamp = 0;
2077 		} else
2078 			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2079 						SOCK_MIN_RCVBUF / 2 : val;
2080 		break;
2081 
2082 	case TCP_QUICKACK:
2083 		if (!val) {
2084 			tp->ack.pingpong = 1;
2085 		} else {
2086 			tp->ack.pingpong = 0;
2087 			if ((1 << sk->sk_state) &
2088 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2089 			    tcp_ack_scheduled(tp)) {
2090 				tp->ack.pending |= TCP_ACK_PUSHED;
2091 				cleanup_rbuf(sk, 1);
2092 				if (!(val & 1))
2093 					tp->ack.pingpong = 1;
2094 			}
2095 		}
2096 		break;
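	/* A usage sketch (userspace, not kernel code): a receiver that wants
	 * prompt ACKs can re-arm TCP_QUICKACK around its reads; as handled
	 * above, the setting is not permanent and delayed ACKs may resume
	 * later. Buffer and descriptor names are illustrative assumptions.
	 *
	 *	int one = 1;
	 *
	 *	read(fd, buf, sizeof(buf));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &one, sizeof(one));
	 */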
2097 
2098 	default:
2099 		err = -ENOPROTOOPT;
2100 		break;
2101 	}
2102 	release_sock(sk);
2103 	return err;
2104 }
2105 
2106 /* Return information about the state of a TCP endpoint in API format. */
2107 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2108 {
2109 	struct tcp_sock *tp = tcp_sk(sk);
2110 	u32 now = tcp_time_stamp;
2111 
2112 	memset(info, 0, sizeof(*info));
2113 
2114 	info->tcpi_state = sk->sk_state;
2115 	info->tcpi_ca_state = tp->ca_state;
2116 	info->tcpi_retransmits = tp->retransmits;
2117 	info->tcpi_probes = tp->probes_out;
2118 	info->tcpi_backoff = tp->backoff;
2119 
2120 	if (tp->rx_opt.tstamp_ok)
2121 		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2122 	if (tp->rx_opt.sack_ok)
2123 		info->tcpi_options |= TCPI_OPT_SACK;
2124 	if (tp->rx_opt.wscale_ok) {
2125 		info->tcpi_options |= TCPI_OPT_WSCALE;
2126 		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2127 		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2128 	}
2129 
2130 	if (tp->ecn_flags&TCP_ECN_OK)
2131 		info->tcpi_options |= TCPI_OPT_ECN;
2132 
2133 	info->tcpi_rto = jiffies_to_usecs(tp->rto);
2134 	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2135 	info->tcpi_snd_mss = tp->mss_cache;
2136 	info->tcpi_rcv_mss = tp->ack.rcv_mss;
2137 
2138 	info->tcpi_unacked = tp->packets_out;
2139 	info->tcpi_sacked = tp->sacked_out;
2140 	info->tcpi_lost = tp->lost_out;
2141 	info->tcpi_retrans = tp->retrans_out;
2142 	info->tcpi_fackets = tp->fackets_out;
2143 
2144 	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2145 	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2146 	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2147 
2148 	info->tcpi_pmtu = tp->pmtu_cookie;
2149 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2150 	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2151 	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2152 	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2153 	info->tcpi_snd_cwnd = tp->snd_cwnd;
2154 	info->tcpi_advmss = tp->advmss;
2155 	info->tcpi_reordering = tp->reordering;
2156 
2157 	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2158 	info->tcpi_rcv_space = tp->rcvq_space.space;
2159 
2160 	info->tcpi_total_retrans = tp->total_retrans;
2161 }
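/* A usage sketch (userspace, not kernel code): tcp_get_info() backs the
 * TCP_INFO case of tcp_getsockopt() below; an application reads the
 * statistics like this. The fields printed are just an illustrative subset.
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt %uus cwnd %u retrans %u\n",
 *		       ti.tcpi_rtt, ti.tcpi_snd_cwnd, ti.tcpi_total_retrans);
 */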
2162 
2163 EXPORT_SYMBOL_GPL(tcp_get_info);
2164 
2165 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2166 		   int __user *optlen)
2167 {
2168 	struct tcp_sock *tp = tcp_sk(sk);
2169 	int val, len;
2170 
2171 	if (level != SOL_TCP)
2172 		return tp->af_specific->getsockopt(sk, level, optname,
2173 						   optval, optlen);
2174 
2175 	if (get_user(len, optlen))
2176 		return -EFAULT;
2177 
2178 	len = min_t(unsigned int, len, sizeof(int));
2179 
2180 	if (len < 0)
2181 		return -EINVAL;
2182 
2183 	switch (optname) {
2184 	case TCP_MAXSEG:
2185 		val = tp->mss_cache;
2186 		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2187 			val = tp->rx_opt.user_mss;
2188 		break;
2189 	case TCP_NODELAY:
2190 		val = !!(tp->nonagle&TCP_NAGLE_OFF);
2191 		break;
2192 	case TCP_CORK:
2193 		val = !!(tp->nonagle&TCP_NAGLE_CORK);
2194 		break;
2195 	case TCP_KEEPIDLE:
2196 		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2197 		break;
2198 	case TCP_KEEPINTVL:
2199 		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2200 		break;
2201 	case TCP_KEEPCNT:
2202 		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2203 		break;
2204 	case TCP_SYNCNT:
2205 		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2206 		break;
2207 	case TCP_LINGER2:
2208 		val = tp->linger2;
2209 		if (val >= 0)
2210 			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2211 		break;
2212 	case TCP_DEFER_ACCEPT:
2213 		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2214 					       (tp->defer_accept - 1));
2215 		break;
2216 	case TCP_WINDOW_CLAMP:
2217 		val = tp->window_clamp;
2218 		break;
2219 	case TCP_INFO: {
2220 		struct tcp_info info;
2221 
2222 		if (get_user(len, optlen))
2223 			return -EFAULT;
2224 
2225 		tcp_get_info(sk, &info);
2226 
2227 		len = min_t(unsigned int, len, sizeof(info));
2228 		if (put_user(len, optlen))
2229 			return -EFAULT;
2230 		if (copy_to_user(optval, &info, len))
2231 			return -EFAULT;
2232 		return 0;
2233 	}
2234 	case TCP_QUICKACK:
2235 		val = !tp->ack.pingpong;
2236 		break;
2237 
2238 	case TCP_CONGESTION:
2239 		if (get_user(len, optlen))
2240 			return -EFAULT;
2241 		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2242 		if (put_user(len, optlen))
2243 			return -EFAULT;
2244 		if (copy_to_user(optval, tp->ca_ops->name, len))
2245 			return -EFAULT;
2246 		return 0;
2247 	default:
2248 		return -ENOPROTOOPT;
2249 	}
2250 
2251 	if (put_user(len, optlen))
2252 		return -EFAULT;
2253 	if (copy_to_user(optval, &val, len))
2254 		return -EFAULT;
2255 	return 0;
2256 }
2257 
2258 
2259 extern void __skb_cb_too_small_for_tcp(int, int);
2260 extern struct tcp_congestion_ops tcp_reno;
2261 
2262 static __initdata unsigned long thash_entries;
2263 static int __init set_thash_entries(char *str)
2264 {
2265 	if (!str)
2266 		return 0;
2267 	thash_entries = simple_strtoul(str, &str, 0);
2268 	return 1;
2269 }
2270 __setup("thash_entries=", set_thash_entries);
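/* A usage sketch: "thash_entries" is a kernel command line parameter, e.g.
 * booting with
 *
 *	thash_entries=131072
 *
 * is parsed by set_thash_entries() above and passed to
 * alloc_large_system_hash() in tcp_init() below to size the established
 * hash table. The value 131072 is only illustrative.
 */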
2271 
2272 void __init tcp_init(void)
2273 {
2274 	struct sk_buff *skb = NULL;
2275 	int order, i;
2276 
2277 	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2278 		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2279 					   sizeof(skb->cb));
2280 
2281 	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2282 					      sizeof(struct tcp_bind_bucket),
2283 					      0, SLAB_HWCACHE_ALIGN,
2284 					      NULL, NULL);
2285 	if (!tcp_bucket_cachep)
2286 		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2287 
2288 	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2289 						sizeof(struct tcp_tw_bucket),
2290 						0, SLAB_HWCACHE_ALIGN,
2291 						NULL, NULL);
2292 	if (!tcp_timewait_cachep)
2293 		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2294 
2295 	/* Size and allocate the main established and bind bucket
2296 	 * hash tables.
2297 	 *
2298 	 * The methodology is similar to that of the buffer cache.
2299 	 */
2300 	tcp_ehash = (struct tcp_ehash_bucket *)
2301 		alloc_large_system_hash("TCP established",
2302 					sizeof(struct tcp_ehash_bucket),
2303 					thash_entries,
2304 					(num_physpages >= 128 * 1024) ?
2305 						(25 - PAGE_SHIFT) :
2306 						(27 - PAGE_SHIFT),
2307 					HASH_HIGHMEM,
2308 					&tcp_ehash_size,
2309 					NULL,
2310 					0);
2311 	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2312 	for (i = 0; i < (tcp_ehash_size << 1); i++) {
2313 		rwlock_init(&tcp_ehash[i].lock);
2314 		INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2315 	}
2316 
2317 	tcp_bhash = (struct tcp_bind_hashbucket *)
2318 		alloc_large_system_hash("TCP bind",
2319 					sizeof(struct tcp_bind_hashbucket),
2320 					tcp_ehash_size,
2321 					(num_physpages >= 128 * 1024) ?
2322 						(25 - PAGE_SHIFT) :
2323 						(27 - PAGE_SHIFT),
2324 					HASH_HIGHMEM,
2325 					&tcp_bhash_size,
2326 					NULL,
2327 					64 * 1024);
2328 	tcp_bhash_size = 1 << tcp_bhash_size;
2329 	for (i = 0; i < tcp_bhash_size; i++) {
2330 		spin_lock_init(&tcp_bhash[i].lock);
2331 		INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2332 	}
2333 
2334 	/* Try to be a bit smarter and adjust defaults depending
2335 	 * on available memory.
2336 	 */
2337 	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2338 			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2339 			order++)
2340 		;
2341 	if (order >= 4) {
2342 		sysctl_local_port_range[0] = 32768;
2343 		sysctl_local_port_range[1] = 61000;
2344 		sysctl_tcp_max_tw_buckets = 180000;
2345 		sysctl_tcp_max_orphans = 4096 << (order - 4);
2346 		sysctl_max_syn_backlog = 1024;
2347 	} else if (order < 3) {
2348 		sysctl_local_port_range[0] = 1024 * (3 - order);
2349 		sysctl_tcp_max_tw_buckets >>= (3 - order);
2350 		sysctl_tcp_max_orphans >>= (3 - order);
2351 		sysctl_max_syn_backlog = 128;
2352 	}
2353 	tcp_port_rover = sysctl_local_port_range[0] - 1;
2354 
2355 	sysctl_tcp_mem[0] =  768 << order;
2356 	sysctl_tcp_mem[1] = 1024 << order;
2357 	sysctl_tcp_mem[2] = 1536 << order;
2358 
2359 	if (order < 3) {
2360 		sysctl_tcp_wmem[2] = 64 * 1024;
2361 		sysctl_tcp_rmem[0] = PAGE_SIZE;
2362 		sysctl_tcp_rmem[1] = 43689;
2363 		sysctl_tcp_rmem[2] = 2 * 43689;
2364 	}
2365 
2366 	printk(KERN_INFO "TCP: Hash tables configured "
2367 	       "(established %d bind %d)\n",
2368 	       tcp_ehash_size << 1, tcp_bhash_size);
2369 
2370 	tcp_register_congestion_control(&tcp_reno);
2371 }
2372 
2373 EXPORT_SYMBOL(tcp_accept);
2374 EXPORT_SYMBOL(tcp_close);
2375 EXPORT_SYMBOL(tcp_destroy_sock);
2376 EXPORT_SYMBOL(tcp_disconnect);
2377 EXPORT_SYMBOL(tcp_getsockopt);
2378 EXPORT_SYMBOL(tcp_ioctl);
2379 EXPORT_SYMBOL(tcp_poll);
2380 EXPORT_SYMBOL(tcp_read_sock);
2381 EXPORT_SYMBOL(tcp_recvmsg);
2382 EXPORT_SYMBOL(tcp_sendmsg);
2383 EXPORT_SYMBOL(tcp_sendpage);
2384 EXPORT_SYMBOL(tcp_setsockopt);
2385 EXPORT_SYMBOL(tcp_shutdown);
2386 EXPORT_SYMBOL(tcp_statistics);
2387 EXPORT_SYMBOL(tcp_timewait_cachep);
2388