xref: /linux/net/ipv4/tcp.c (revision 13abf8130139c2ccd4962a7e5a8902be5e6cb5a7)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *		Florian La Roche, <flla@stud.uni-sb.de>
15  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *		Matthew Dillon, <dillon@apollo.west.oic.com>
19  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *		Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *		Alan Cox	:	Numerous verify_area() calls
24  *		Alan Cox	:	Set the ACK bit on a reset
25  *		Alan Cox	:	Stopped it crashing if it closed while
26  *					sk->inuse=1 and was trying to connect
27  *					(tcp_err()).
28  *		Alan Cox	:	All icmp error handling was broken
29  *					pointers passed were wrong and the
30  *					socket was looked up backwards. Nobody
31  *					tested any icmp error code obviously.
32  *		Alan Cox	:	tcp_err() now handled properly. It
33  *					wakes people on errors. poll
34  *					behaves and the icmp error race
35  *					has gone by moving it into sock.c
36  *		Alan Cox	:	tcp_send_reset() fixed to work for
37  *					everything not just packets for
38  *					unknown sockets.
39  *		Alan Cox	:	tcp option processing.
40  *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41  *					syn rule wrong]
42  *		Herp Rosmanith  :	More reset fixes
43  *		Alan Cox	:	No longer acks invalid rst frames.
44  *					Acking any kind of RST is right out.
45  *		Alan Cox	:	Sets an ignore me flag on an rst
46  *					receive otherwise odd bits of prattle
47  *					escape still
48  *		Alan Cox	:	Fixed another acking RST frame bug.
49  *					Should stop LAN workplace lockups.
50  *		Alan Cox	: 	Some tidyups using the new skb list
51  *					facilities
52  *		Alan Cox	:	sk->keepopen now seems to work
53  *		Alan Cox	:	Pulls options out correctly on accepts
54  *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55  *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56  *					bit to skb ops.
57  *		Alan Cox	:	Tidied tcp_data to avoid a potential
58  *					nasty.
59  *		Alan Cox	:	Added some better commenting, as the
60  *					tcp is hard to follow
61  *		Alan Cox	:	Removed incorrect check for 20 * psh
62  *	Michael O'Reilly	:	ack < copied bug fix.
63  *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64  *		Alan Cox	:	FIN with no memory -> CRASH
65  *		Alan Cox	:	Added socket option proto entries.
66  *					Also added awareness of them to accept.
67  *		Alan Cox	:	Added TCP options (SOL_TCP)
68  *		Alan Cox	:	Switched wakeup calls to callbacks,
69  *					so the kernel can layer network
70  *					sockets.
71  *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72  *		Alan Cox	:	Handle FIN (more) properly (we hope).
73  *		Alan Cox	:	RST frames sent on unsynchronised
74  *					state ack error.
75  *		Alan Cox	:	Put in missing check for SYN bit.
76  *		Alan Cox	:	Added tcp_select_window() aka NET2E
77  *					window non shrink trick.
78  *		Alan Cox	:	Added a couple of small NET2E timer
79  *					fixes
80  *		Charles Hedrick :	TCP fixes
81  *		Toomas Tamm	:	TCP window fixes
82  *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83  *		Charles Hedrick	:	Rewrote most of it to actually work
84  *		Linus		:	Rewrote tcp_read() and URG handling
85  *					completely
86  *		Gerhard Koerting:	Fixed some missing timer handling
87  *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88  *		Gerhard Koerting:	PC/TCP workarounds
89  *		Adam Caldwell	:	Assorted timer/timing errors
90  *		Matthew Dillon	:	Fixed another RST bug
91  *		Alan Cox	:	Move to kernel side addressing changes.
92  *		Alan Cox	:	Beginning work on TCP fastpathing
93  *					(not yet usable)
94  *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95  *		Alan Cox	:	TCP fast path debugging
96  *		Alan Cox	:	Window clamping
97  *		Michael Riepe	:	Bug in tcp_check()
98  *		Matt Dillon	:	More TCP improvements and RST bug fixes
99  *		Matt Dillon	:	Yet more small nasties removed from the
100  *					TCP code (Be very nice to this man if
101  *					tcp finally works 100%) 8)
102  *		Alan Cox	:	BSD accept semantics.
103  *		Alan Cox	:	Reset on closedown bug.
104  *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105  *		Michael Pall	:	Handle poll() after URG properly in
106  *					all cases.
107  *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108  *					(multi URG PUSH broke rlogin).
109  *		Michael Pall	:	Fix the multi URG PUSH problem in
110  *					tcp_readable(), poll() after URG
111  *					works now.
112  *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113  *					BSD api.
114  *		Alan Cox	:	Changed the semantics of sk->socket to
115  *					fix a race and a signal problem with
116  *					accept() and async I/O.
117  *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118  *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119  *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120  *					clients/servers which listen in on
121  *					fixed ports.
122  *		Alan Cox	:	Cleaned the above up and shrank it to
123  *					a sensible code size.
124  *		Alan Cox	:	Self connect lockup fix.
125  *		Alan Cox	:	No connect to multicast.
126  *		Ross Biro	:	Close unaccepted children on master
127  *					socket close.
128  *		Alan Cox	:	Reset tracing code.
129  *		Alan Cox	:	Spurious resets on shutdown.
130  *		Alan Cox	:	Giant 15 minute/60 second timer error
131  *		Alan Cox	:	Small whoops in polling before an
132  *					accept.
133  *		Alan Cox	:	Kept the state trace facility since
134  *					it's handy for debugging.
135  *		Alan Cox	:	More reset handler fixes.
136  *		Alan Cox	:	Started rewriting the code based on
137  *					the RFC's for other useful protocol
138  *					references see: Comer, KA9Q NOS, and
139  *					for a reference on the difference
140  *					between specifications and how BSD
141  *					works see the 4.4lite source.
142  *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143  *					close.
144  *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145  *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146  *		Alan Cox	:	Reimplemented timers as per the RFC
147  *					and using multiple timers for sanity.
148  *		Alan Cox	:	Small bug fixes, and a lot of new
149  *					comments.
150  *		Alan Cox	:	Fixed dual reader crash by locking
151  *					the buffers (much like datagram.c)
152  *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153  *					now gets fed up of retrying without
154  *					(even a no space) answer.
155  *		Alan Cox	:	Extracted closing code better
156  *		Alan Cox	:	Fixed the closing state machine to
157  *					resemble the RFC.
158  *		Alan Cox	:	More 'per spec' fixes.
159  *		Jorge Cwik	:	Even faster checksumming.
160  *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161  *					only frames. At least one pc tcp stack
162  *					generates them.
163  *		Alan Cox	:	Cache last socket.
164  *		Alan Cox	:	Per route irtt.
165  *		Matt Day	:	poll()->select() match BSD precisely on error
166  *		Alan Cox	:	New buffers
167  *		Marc Tamsky	:	Various sk->prot->retransmits and
168  *					sk->retransmits misupdating fixed.
169  *					Fixed tcp_write_timeout: stuck close,
170  *					and TCP syn retries gets used now.
171  *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172  *					ack if state is TCP_CLOSED.
173  *		Alan Cox	:	Look up device on a retransmit - routes may
174  *					change. Doesn't yet cope with MSS shrink right
175  *					but it's a start!
176  *		Marc Tamsky	:	Closing in closing fixes.
177  *		Mike Shaver	:	RFC1122 verifications.
178  *		Alan Cox	:	rcv_saddr errors.
179  *		Alan Cox	:	Block double connect().
180  *		Alan Cox	:	Small hooks for enSKIP.
181  *		Alexey Kuznetsov:	Path MTU discovery.
182  *		Alan Cox	:	Support soft errors.
183  *		Alan Cox	:	Fix MTU discovery pathological case
184  *					when the remote claims no mtu!
185  *		Marc Tamsky	:	TCP_CLOSE fix.
186  *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187  *					window but wrong (fixes NT lpd problems)
188  *		Pedro Roque	:	Better TCP window handling, delayed ack.
189  *		Joerg Reuter	:	No modification of locked buffers in
190  *					tcp_do_retransmit()
191  *		Eric Schenk	:	Changed receiver side silly window
192  *					avoidance algorithm to BSD style
193  *					algorithm. This doubles throughput
194  *					against machines running Solaris,
195  *					and seems to result in general
196  *					improvement.
197  *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198  *	Willy Konynenberg	:	Transparent proxying support.
199  *	Mike McLagan		:	Routing by source
200  *		Keith Owens	:	Do proper merging with partial SKB's in
201  *					tcp_do_sendmsg to avoid burstiness.
202  *		Eric Schenk	:	Fix fast close down bug with
203  *					shutdown() followed by close().
204  *		Andi Kleen 	:	Make poll agree with SIGIO
205  *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206  *					lingertime == 0 (RFC 793 ABORT Call)
207  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208  *					csum_and_copy_from_user() if possible.
209  *
210  *		This program is free software; you can redistribute it and/or
211  *		modify it under the terms of the GNU General Public License
212  *		as published by the Free Software Foundation; either version
213  *		2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *	TCP_SYN_SENT		sent a connection request, waiting for ack
218  *
219  *	TCP_SYN_RECV		received a connection request, sent ack,
220  *				waiting for final ack in three-way handshake.
221  *
222  *	TCP_ESTABLISHED		connection established
223  *
224  *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225  *				transmission of remaining buffered data
226  *
227  *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228  *				to shutdown
229  *
230  *	TCP_CLOSING		both sides have shutdown but we still have
231  *				data we have to finish sending
232  *
233  *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234  *				closed, can only be entered from FIN_WAIT2
235  *				or CLOSING.  Required because the other end
236  *				may not have gotten our last ACK causing it
237  *				to retransmit the data packet (which we ignore)
238  *
239  *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240  *				us to finish writing our data and to shutdown
241  *				(we have to close() to move on to LAST_ACK)
242  *
243  *	TCP_LAST_ACK		our side has shutdown after remote has
244  *				shutdown.  There may still be data in our
245  *				buffer that we have to finish sending
246  *
247  *	TCP_CLOSE		socket is finished
248  */
249 
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260 
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265 
266 
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269 
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271 
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
273 
274 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
275 
276 EXPORT_SYMBOL_GPL(tcp_orphan_count);
277 
278 int sysctl_tcp_mem[3];
279 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
280 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
281 
282 EXPORT_SYMBOL(sysctl_tcp_mem);
283 EXPORT_SYMBOL(sysctl_tcp_rmem);
284 EXPORT_SYMBOL(sysctl_tcp_wmem);
285 
286 atomic_t tcp_memory_allocated;	/* Current allocated memory. */
287 atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
288 
289 EXPORT_SYMBOL(tcp_memory_allocated);
290 EXPORT_SYMBOL(tcp_sockets_allocated);
291 
292 /*
293  * Pressure flag: try to collapse.
294  * Technical note: it is used by multiple contexts non atomically.
295  * All the sk_stream_mem_schedule() is of this nature: accounting
296  * is strict, actions are advisory and have some latency.
297  */
298 int tcp_memory_pressure;
299 
300 EXPORT_SYMBOL(tcp_memory_pressure);
301 
302 void tcp_enter_memory_pressure(void)
303 {
304 	if (!tcp_memory_pressure) {
305 		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
306 		tcp_memory_pressure = 1;
307 	}
308 }
309 
310 EXPORT_SYMBOL(tcp_enter_memory_pressure);
311 
312 /*
313  *	Wait for a TCP event.
314  *
315  *	Note that we don't need to lock the socket, as the upper poll layers
316  *	take care of normal races (between the test and the event) and we don't
317  *	go look at any of the socket buffers directly.
318  */
319 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
320 {
321 	unsigned int mask;
322 	struct sock *sk = sock->sk;
323 	struct tcp_sock *tp = tcp_sk(sk);
324 
325 	poll_wait(file, sk->sk_sleep, wait);
326 	if (sk->sk_state == TCP_LISTEN)
327 		return inet_csk_listen_poll(sk);
328 
329 	/* Socket is not locked. We are protected from async events
330 	   by poll logic and correct handling of state changes
331 	   made by other threads is impossible in any case.
332 	 */
333 
334 	mask = 0;
335 	if (sk->sk_err)
336 		mask = POLLERR;
337 
338 	/*
339 	 * POLLHUP is certainly not done right. But poll() doesn't
340 	 * have a notion of HUP in just one direction, and for a
341 	 * socket the read side is more interesting.
342 	 *
343 	 * Some poll() documentation says that POLLHUP is incompatible
344 	 * with the POLLOUT/POLLWRNORM flags, so somebody should check this
345 	 * all. But careful, it tends to be safer to return too many
346 	 * bits than too few, and you can easily break real applications
347 	 * if you don't tell them that something has hung up!
348 	 *
349 	 * Check-me.
350 	 *
351 	 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
352 	 * our fs/select.c). It means that after we have received EOF,
353 	 * poll always returns immediately, making it impossible to poll() for
354 	 * write() in state CLOSE_WAIT. One solution is evident --- set POLLHUP
355 	 * if and only if shutdown has been made in both directions.
356 	 * Actually, it is interesting to look at how Solaris and DUX
357 	 * solve this dilemma. If POLLHUP were maskable, I would prefer to
358 	 * set it on SND_SHUTDOWN. BTW the examples given in Stevens' books
359 	 * assume exactly this behaviour, which explains why POLLHUP is
360 	 * incompatible with POLLOUT.	--ANK
361 	 *
362 	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
363 	 * blocking on fresh not-connected or disconnected socket. --ANK
364 	 */
365 	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
366 		mask |= POLLHUP;
367 	if (sk->sk_shutdown & RCV_SHUTDOWN)
368 		mask |= POLLIN | POLLRDNORM;
369 
370 	/* Connected? */
371 	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
372 		/* Potential race condition. If the read of tp below is
373 		 * reordered above the read of sk->sk_state, we can be
374 		 * illegally awakened in SYN_* states. */
375 		if ((tp->rcv_nxt != tp->copied_seq) &&
376 		    (tp->urg_seq != tp->copied_seq ||
377 		     tp->rcv_nxt != tp->copied_seq + 1 ||
378 		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
379 			mask |= POLLIN | POLLRDNORM;
380 
381 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
382 			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
383 				mask |= POLLOUT | POLLWRNORM;
384 			} else {  /* send SIGIO later */
385 				set_bit(SOCK_ASYNC_NOSPACE,
386 					&sk->sk_socket->flags);
387 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
388 
389 				/* Race breaker. If space is freed after
390 				 * wspace test but before the flags are set,
391 				 * IO signal will be lost.
392 				 */
393 				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
394 					mask |= POLLOUT | POLLWRNORM;
395 			}
396 		}
397 
398 		if (tp->urg_data & TCP_URG_VALID)
399 			mask |= POLLPRI;
400 	}
401 	return mask;
402 }
403 
404 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
405 {
406 	struct tcp_sock *tp = tcp_sk(sk);
407 	int answ;
408 
409 	switch (cmd) {
410 	case SIOCINQ:
411 		if (sk->sk_state == TCP_LISTEN)
412 			return -EINVAL;
413 
414 		lock_sock(sk);
415 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
416 			answ = 0;
417 		else if (sock_flag(sk, SOCK_URGINLINE) ||
418 			 !tp->urg_data ||
419 			 before(tp->urg_seq, tp->copied_seq) ||
420 			 !before(tp->urg_seq, tp->rcv_nxt)) {
421 			answ = tp->rcv_nxt - tp->copied_seq;
422 
423 			/* Subtract 1, if FIN is in queue. */
424 			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
425 				answ -=
426 		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
427 		} else
428 			answ = tp->urg_seq - tp->copied_seq;
429 		release_sock(sk);
430 		break;
431 	case SIOCATMARK:
432 		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
433 		break;
434 	case SIOCOUTQ:
435 		if (sk->sk_state == TCP_LISTEN)
436 			return -EINVAL;
437 
438 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
439 			answ = 0;
440 		else
441 			answ = tp->write_seq - tp->snd_una;
442 		break;
443 	default:
444 		return -ENOIOCTLCMD;
445 	};
446 
447 	return put_user(answ, (int __user *)arg);
448 }
449 
450 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
451 {
452 	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
453 	tp->pushed_seq = tp->write_seq;
454 }
455 
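/* True once more than half of the peer's largest advertised window has
 * been queued since the last pushed byte; used to force a PSH so the
 * receiver does not sit on buffered data indefinitely.
 */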
456 static inline int forced_push(struct tcp_sock *tp)
457 {
458 	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
459 }
460 
461 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
462 			      struct sk_buff *skb)
463 {
464 	skb->csum = 0;
465 	TCP_SKB_CB(skb)->seq = tp->write_seq;
466 	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
467 	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
468 	TCP_SKB_CB(skb)->sacked = 0;
469 	skb_header_release(skb);
470 	__skb_queue_tail(&sk->sk_write_queue, skb);
471 	sk_charge_skb(sk, skb);
472 	if (!sk->sk_send_head)
473 		sk->sk_send_head = skb;
474 	if (tp->nonagle & TCP_NAGLE_PUSH)
475 		tp->nonagle &= ~TCP_NAGLE_PUSH;
476 }
477 
478 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
479 				struct sk_buff *skb)
480 {
481 	if (flags & MSG_OOB) {
482 		tp->urg_mode = 1;
483 		tp->snd_up = tp->write_seq;
484 		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
485 	}
486 }
487 
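/* Push queued but unsent segments to the network, honouring MSG_MORE
 * (which keeps the flow corked) and setting PSH/URG on the tail skb as
 * appropriate.
 */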
488 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
489 			    int mss_now, int nonagle)
490 {
491 	if (sk->sk_send_head) {
492 		struct sk_buff *skb = sk->sk_write_queue.prev;
493 		if (!(flags & MSG_MORE) || forced_push(tp))
494 			tcp_mark_push(tp, skb);
495 		tcp_mark_urg(tp, flags, skb);
496 		__tcp_push_pending_frames(sk, tp, mss_now,
497 					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
498 	}
499 }
500 
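/* Worker for sendpage()/sendfile() on TCP: instead of copying payload,
 * attach the caller's pages to the tail skb as page fragments (coalescing
 * with the previous fragment when possible) and rely on hardware
 * checksumming (CHECKSUM_HW), since the CPU never touches the data.
 * Waits for send-buffer space the same way tcp_sendmsg() does.
 */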
501 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
502 			 size_t psize, int flags)
503 {
504 	struct tcp_sock *tp = tcp_sk(sk);
505 	int mss_now, size_goal;
506 	int err;
507 	ssize_t copied;
508 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
509 
510 	/* Wait for a connection to finish. */
511 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
512 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
513 			goto out_err;
514 
515 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
516 
517 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
518 	size_goal = tp->xmit_size_goal;
519 	copied = 0;
520 
521 	err = -EPIPE;
522 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
523 		goto do_error;
524 
525 	while (psize > 0) {
526 		struct sk_buff *skb = sk->sk_write_queue.prev;
527 		struct page *page = pages[poffset / PAGE_SIZE];
528 		int copy, i, can_coalesce;
529 		int offset = poffset % PAGE_SIZE;
530 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
531 
532 		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
533 new_segment:
534 			if (!sk_stream_memory_free(sk))
535 				goto wait_for_sndbuf;
536 
537 			skb = sk_stream_alloc_pskb(sk, 0, 0,
538 						   sk->sk_allocation);
539 			if (!skb)
540 				goto wait_for_memory;
541 
542 			skb_entail(sk, tp, skb);
543 			copy = size_goal;
544 		}
545 
546 		if (copy > size)
547 			copy = size;
548 
549 		i = skb_shinfo(skb)->nr_frags;
550 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
551 		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
552 			tcp_mark_push(tp, skb);
553 			goto new_segment;
554 		}
555 		if (sk->sk_forward_alloc < copy &&
556 		    !sk_stream_mem_schedule(sk, copy, 0))
557 			goto wait_for_memory;
558 
559 		if (can_coalesce) {
560 			skb_shinfo(skb)->frags[i - 1].size += copy;
561 		} else {
562 			get_page(page);
563 			skb_fill_page_desc(skb, i, page, offset, copy);
564 		}
565 
566 		skb->len += copy;
567 		skb->data_len += copy;
568 		skb->truesize += copy;
569 		sk->sk_wmem_queued += copy;
570 		sk->sk_forward_alloc -= copy;
571 		skb->ip_summed = CHECKSUM_HW;
572 		tp->write_seq += copy;
573 		TCP_SKB_CB(skb)->end_seq += copy;
574 		skb_shinfo(skb)->tso_segs = 0;
575 
576 		if (!copied)
577 			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
578 
579 		copied += copy;
580 		poffset += copy;
581 		if (!(psize -= copy))
582 			goto out;
583 
584 		if (skb->len < mss_now || (flags & MSG_OOB))
585 			continue;
586 
587 		if (forced_push(tp)) {
588 			tcp_mark_push(tp, skb);
589 			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
590 		} else if (skb == sk->sk_send_head)
591 			tcp_push_one(sk, mss_now);
592 		continue;
593 
594 wait_for_sndbuf:
595 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
596 wait_for_memory:
597 		if (copied)
598 			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
599 
600 		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
601 			goto do_error;
602 
603 		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
604 		size_goal = tp->xmit_size_goal;
605 	}
606 
607 out:
608 	if (copied)
609 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
610 	return copied;
611 
612 do_error:
613 	if (copied)
614 		goto out;
615 out_err:
616 	return sk_stream_error(sk, flags, err);
617 }
618 
619 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
620 		     size_t size, int flags)
621 {
622 	ssize_t res;
623 	struct sock *sk = sock->sk;
624 
625 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
626 
627 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
628 	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
629 		return sock_no_sendpage(sock, page, offset, size, flags);
630 
631 #undef TCP_ZC_CSUM_FLAGS
632 
633 	lock_sock(sk);
634 	TCP_CHECK_TIMER(sk);
635 	res = do_tcp_sendpages(sk, &page, offset, size, flags);
636 	TCP_CHECK_TIMER(sk);
637 	release_sock(sk);
638 	return res;
639 }
640 
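/* Per-socket one-page staging cache used by tcp_sendmsg() when payload is
 * copied into page fragments: TCP_PAGE(sk) is the partially filled page,
 * if any, and TCP_OFF(sk) is the first free byte within it.
 */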
641 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
642 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
643 
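/* How much linear (header) space to allocate for a new skb in
 * tcp_sendmsg(): 0 for TSO-capable SG devices (all payload goes into page
 * fragments), the MSS capped at SKB_MAX_HEAD(MAX_TCP_HEADER) for plain SG
 * devices when the remainder still fits into the page fragments, and a
 * full MSS of head for non-SG devices.
 */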
644 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
645 {
646 	int tmp = tp->mss_cache;
647 
648 	if (sk->sk_route_caps & NETIF_F_SG) {
649 		if (sk->sk_route_caps & NETIF_F_TSO)
650 			tmp = 0;
651 		else {
652 			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
653 
654 			if (tmp >= pgbreak &&
655 			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
656 				tmp = pgbreak;
657 		}
658 	}
659 
660 	return tmp;
661 }
662 
663 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
664 		size_t size)
665 {
666 	struct iovec *iov;
667 	struct tcp_sock *tp = tcp_sk(sk);
668 	struct sk_buff *skb;
669 	int iovlen, flags;
670 	int mss_now, size_goal;
671 	int err, copied;
672 	long timeo;
673 
674 	lock_sock(sk);
675 	TCP_CHECK_TIMER(sk);
676 
677 	flags = msg->msg_flags;
678 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
679 
680 	/* Wait for a connection to finish. */
681 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
682 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
683 			goto out_err;
684 
685 	/* This should be in poll */
686 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
687 
688 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
689 	size_goal = tp->xmit_size_goal;
690 
691 	/* Ok commence sending. */
692 	iovlen = msg->msg_iovlen;
693 	iov = msg->msg_iov;
694 	copied = 0;
695 
696 	err = -EPIPE;
697 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
698 		goto do_error;
699 
700 	while (--iovlen >= 0) {
701 		int seglen = iov->iov_len;
702 		unsigned char __user *from = iov->iov_base;
703 
704 		iov++;
705 
706 		while (seglen > 0) {
707 			int copy;
708 
709 			skb = sk->sk_write_queue.prev;
710 
711 			if (!sk->sk_send_head ||
712 			    (copy = size_goal - skb->len) <= 0) {
713 
714 new_segment:
715 				/* Allocate new segment. If the interface is SG,
716 				 * allocate skb fitting to single page.
717 				 */
718 				if (!sk_stream_memory_free(sk))
719 					goto wait_for_sndbuf;
720 
721 				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
722 							   0, sk->sk_allocation);
723 				if (!skb)
724 					goto wait_for_memory;
725 
726 				/*
727 				 * Check whether we can use HW checksum.
728 				 */
729 				if (sk->sk_route_caps &
730 				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
731 				     NETIF_F_HW_CSUM))
732 					skb->ip_summed = CHECKSUM_HW;
733 
734 				skb_entail(sk, tp, skb);
735 				copy = size_goal;
736 			}
737 
738 			/* Try to append data to the end of skb. */
739 			if (copy > seglen)
740 				copy = seglen;
741 
742 			/* Where to copy to? */
743 			if (skb_tailroom(skb) > 0) {
744 				/* We have some space in skb head. Superb! */
745 				if (copy > skb_tailroom(skb))
746 					copy = skb_tailroom(skb);
747 				if ((err = skb_add_data(skb, from, copy)) != 0)
748 					goto do_fault;
749 			} else {
750 				int merge = 0;
751 				int i = skb_shinfo(skb)->nr_frags;
752 				struct page *page = TCP_PAGE(sk);
753 				int off = TCP_OFF(sk);
754 
755 				if (skb_can_coalesce(skb, i, page, off) &&
756 				    off != PAGE_SIZE) {
757 					/* We can extend the last page
758 					 * fragment. */
759 					merge = 1;
760 				} else if (i == MAX_SKB_FRAGS ||
761 					   (!i &&
762 					   !(sk->sk_route_caps & NETIF_F_SG))) {
763 					/* Need to add new fragment and cannot
764 					 * do this because interface is non-SG,
765 					 * or because all the page slots are
766 					 * busy. */
767 					tcp_mark_push(tp, skb);
768 					goto new_segment;
769 				} else if (page) {
770 					if (off == PAGE_SIZE) {
771 						put_page(page);
772 						TCP_PAGE(sk) = page = NULL;
773 					}
774 				}
775 
776 				if (!page) {
777 					/* Allocate new cache page. */
778 					if (!(page = sk_stream_alloc_page(sk)))
779 						goto wait_for_memory;
780 					off = 0;
781 				}
782 
783 				if (copy > PAGE_SIZE - off)
784 					copy = PAGE_SIZE - off;
785 
786 				/* Time to copy data. We are close to
787 				 * the end! */
788 				err = skb_copy_to_page(sk, from, skb, page,
789 						       off, copy);
790 				if (err) {
791 					/* If this page was new, give it to the
792 					 * socket so it does not get leaked.
793 					 */
794 					if (!TCP_PAGE(sk)) {
795 						TCP_PAGE(sk) = page;
796 						TCP_OFF(sk) = 0;
797 					}
798 					goto do_error;
799 				}
800 
801 				/* Update the skb. */
802 				if (merge) {
803 					skb_shinfo(skb)->frags[i - 1].size +=
804 									copy;
805 				} else {
806 					skb_fill_page_desc(skb, i, page, off, copy);
807 					if (TCP_PAGE(sk)) {
808 						get_page(page);
809 					} else if (off + copy < PAGE_SIZE) {
810 						get_page(page);
811 						TCP_PAGE(sk) = page;
812 					}
813 				}
814 
815 				TCP_OFF(sk) = off + copy;
816 			}
817 
818 			if (!copied)
819 				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
820 
821 			tp->write_seq += copy;
822 			TCP_SKB_CB(skb)->end_seq += copy;
823 			skb_shinfo(skb)->tso_segs = 0;
824 
825 			from += copy;
826 			copied += copy;
827 			if ((seglen -= copy) == 0 && iovlen == 0)
828 				goto out;
829 
830 			if (skb->len < mss_now || (flags & MSG_OOB))
831 				continue;
832 
833 			if (forced_push(tp)) {
834 				tcp_mark_push(tp, skb);
835 				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
836 			} else if (skb == sk->sk_send_head)
837 				tcp_push_one(sk, mss_now);
838 			continue;
839 
840 wait_for_sndbuf:
841 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
842 wait_for_memory:
843 			if (copied)
844 				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
845 
846 			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
847 				goto do_error;
848 
849 			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
850 			size_goal = tp->xmit_size_goal;
851 		}
852 	}
853 
854 out:
855 	if (copied)
856 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
857 	TCP_CHECK_TIMER(sk);
858 	release_sock(sk);
859 	return copied;
860 
861 do_fault:
862 	if (!skb->len) {
863 		if (sk->sk_send_head == skb)
864 			sk->sk_send_head = NULL;
865 		__skb_unlink(skb, &sk->sk_write_queue);
866 		sk_stream_free_skb(sk, skb);
867 	}
868 
869 do_error:
870 	if (copied)
871 		goto out;
872 out_err:
873 	err = sk_stream_error(sk, flags, err);
874 	TCP_CHECK_TIMER(sk);
875 	release_sock(sk);
876 	return err;
877 }
878 
879 /*
880  *	Handle reading urgent data. BSD has very simple semantics for
881  *	this, no blocking and very strange errors 8)
882  */
883 
884 static int tcp_recv_urg(struct sock *sk, long timeo,
885 			struct msghdr *msg, int len, int flags,
886 			int *addr_len)
887 {
888 	struct tcp_sock *tp = tcp_sk(sk);
889 
890 	/* No URG data to read. */
891 	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
892 	    tp->urg_data == TCP_URG_READ)
893 		return -EINVAL;	/* Yes this is right ! */
894 
895 	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
896 		return -ENOTCONN;
897 
898 	if (tp->urg_data & TCP_URG_VALID) {
899 		int err = 0;
900 		char c = tp->urg_data;
901 
902 		if (!(flags & MSG_PEEK))
903 			tp->urg_data = TCP_URG_READ;
904 
905 		/* Read urgent data. */
906 		msg->msg_flags |= MSG_OOB;
907 
908 		if (len > 0) {
909 			if (!(flags & MSG_TRUNC))
910 				err = memcpy_toiovec(msg->msg_iov, &c, 1);
911 			len = 1;
912 		} else
913 			msg->msg_flags |= MSG_TRUNC;
914 
915 		return err ? -EFAULT : len;
916 	}
917 
918 	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
919 		return 0;
920 
921 	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
922 	 * the available implementations agree in this case:
923 	 * this call should never block, independent of the
924 	 * blocking state of the socket.
925 	 * Mike <pall@rz.uni-karlsruhe.de>
926 	 */
927 	return -EAGAIN;
928 }
929 
930 /* Clean up the receive buffer for full frames taken by the user,
931  * then send an ACK if necessary.  COPIED is the number of bytes
932  * tcp_recvmsg has given to the user so far, it speeds up the
933  * calculation of whether or not we must ACK for the sake of
934  * a window update.
935  */
936 static void cleanup_rbuf(struct sock *sk, int copied)
937 {
938 	struct tcp_sock *tp = tcp_sk(sk);
939 	int time_to_ack = 0;
940 
941 #if TCP_DEBUG
942 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
943 
944 	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
945 #endif
946 
947 	if (inet_csk_ack_scheduled(sk)) {
948 		const struct inet_connection_sock *icsk = inet_csk(sk);
949 		   /* Delayed ACKs frequently hit locked sockets during bulk
950 		    * receive. */
951 		if (icsk->icsk_ack.blocked ||
952 		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
953 		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
954 		    /*
955 		     * If this read emptied the read buffer, we send an ACK when
956 		     * the connection is not bidirectional, the user has drained
957 		     * the receive buffer, and there was a small segment
958 		     * in the queue.
959 		     */
960 		    (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
961 		     !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
962 			time_to_ack = 1;
963 	}
964 
965 	/* We send an ACK if we can now advertise a non-zero window
966 	 * which has been raised "significantly".
967 	 *
968 	 * Even if the window is raised up to infinity, do not send a
969 	 * window-open ACK in states where we will not receive more. It is useless.
970 	 */
971 	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
972 		__u32 rcv_window_now = tcp_receive_window(tp);
973 
974 		/* Optimize, __tcp_select_window() is not cheap. */
975 		if (2*rcv_window_now <= tp->window_clamp) {
976 			__u32 new_window = __tcp_select_window(sk);
977 
978 			/* Send ACK now, if this read freed lots of space
979 			 * in our buffer. Certainly, new_window is the new window;
980 			 * we can advertise it now if it is not less than the current one.
981 			 * "Lots" means "at least twice" here.
982 			 */
983 			if (new_window && new_window >= 2 * rcv_window_now)
984 				time_to_ack = 1;
985 		}
986 	}
987 	if (time_to_ack)
988 		tcp_send_ack(sk);
989 }
990 
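/* Drain the prequeue through the normal receive path (sk_backlog_rcv) in
 * the context of the user task that owns the socket, then reset the
 * prequeue memory accounting.
 */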
991 static void tcp_prequeue_process(struct sock *sk)
992 {
993 	struct sk_buff *skb;
994 	struct tcp_sock *tp = tcp_sk(sk);
995 
996 	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
997 
998 	/* RX process wants to run with disabled BHs, though it is not
999 	 * necessary */
1000 	local_bh_disable();
1001 	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1002 		sk->sk_backlog_rcv(sk, skb);
1003 	local_bh_enable();
1004 
1005 	/* Clear memory counter. */
1006 	tp->ucopy.memory = 0;
1007 }
1008 
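/* Find the skb in the receive queue that contains sequence number 'seq'
 * and report the offset of 'seq' within it via *off.  A SYN consumes one
 * sequence number, and an skb carrying a FIN is returned even when no
 * payload bytes remain, so callers can notice the end of the stream.
 */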
1009 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1010 {
1011 	struct sk_buff *skb;
1012 	u32 offset;
1013 
1014 	skb_queue_walk(&sk->sk_receive_queue, skb) {
1015 		offset = seq - TCP_SKB_CB(skb)->seq;
1016 		if (skb->h.th->syn)
1017 			offset--;
1018 		if (offset < skb->len || skb->h.th->fin) {
1019 			*off = offset;
1020 			return skb;
1021 		}
1022 	}
1023 	return NULL;
1024 }
1025 
1026 /*
1027  * This routine provides an alternative to tcp_recvmsg() for routines
1028  * that would like to handle copying from skbuffs directly in 'sendfile'
1029  * fashion.
1030  * Note:
1031  *	- It is assumed that the socket was locked by the caller.
1032  *	- The routine does not block.
1033  *	- At present, there is no support for reading OOB data
1034  *	  or for 'peeking' the socket using this routine
1035  *	  (although both would be easy to implement).
1036  */
1037 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1038 		  sk_read_actor_t recv_actor)
1039 {
1040 	struct sk_buff *skb;
1041 	struct tcp_sock *tp = tcp_sk(sk);
1042 	u32 seq = tp->copied_seq;
1043 	u32 offset;
1044 	int copied = 0;
1045 
1046 	if (sk->sk_state == TCP_LISTEN)
1047 		return -ENOTCONN;
1048 	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1049 		if (offset < skb->len) {
1050 			size_t used, len;
1051 
1052 			len = skb->len - offset;
1053 			/* Stop reading if we hit a patch of urgent data */
1054 			if (tp->urg_data) {
1055 				u32 urg_offset = tp->urg_seq - seq;
1056 				if (urg_offset < len)
1057 					len = urg_offset;
1058 				if (!len)
1059 					break;
1060 			}
1061 			used = recv_actor(desc, skb, offset, len);
1062 			if (used <= len) {
1063 				seq += used;
1064 				copied += used;
1065 				offset += used;
1066 			}
1067 			if (offset != skb->len)
1068 				break;
1069 		}
1070 		if (skb->h.th->fin) {
1071 			sk_eat_skb(sk, skb);
1072 			++seq;
1073 			break;
1074 		}
1075 		sk_eat_skb(sk, skb);
1076 		if (!desc->count)
1077 			break;
1078 	}
1079 	tp->copied_seq = seq;
1080 
1081 	tcp_rcv_space_adjust(sk);
1082 
1083 	/* Clean up data we have read: This will do ACK frames. */
1084 	if (copied)
1085 		cleanup_rbuf(sk, copied);
1086 	return copied;
1087 }
1088 
1089 /*
1090  *	This routine copies from a sock struct into the user buffer.
1091  *
1092  *	Technical note: in 2.3 we work on _locked_ socket, so that
1093  *	tricks with *seq access order and skb->users are not required.
1094  *	Probably, code can be easily improved even more.
1095  */
1096 
1097 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1098 		size_t len, int nonblock, int flags, int *addr_len)
1099 {
1100 	struct tcp_sock *tp = tcp_sk(sk);
1101 	int copied = 0;
1102 	u32 peek_seq;
1103 	u32 *seq;
1104 	unsigned long used;
1105 	int err;
1106 	int target;		/* Read at least this many bytes */
1107 	long timeo;
1108 	struct task_struct *user_recv = NULL;
1109 
1110 	lock_sock(sk);
1111 
1112 	TCP_CHECK_TIMER(sk);
1113 
1114 	err = -ENOTCONN;
1115 	if (sk->sk_state == TCP_LISTEN)
1116 		goto out;
1117 
1118 	timeo = sock_rcvtimeo(sk, nonblock);
1119 
1120 	/* Urgent data needs to be handled specially. */
1121 	if (flags & MSG_OOB)
1122 		goto recv_urg;
1123 
1124 	seq = &tp->copied_seq;
1125 	if (flags & MSG_PEEK) {
1126 		peek_seq = tp->copied_seq;
1127 		seq = &peek_seq;
1128 	}
1129 
1130 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1131 
1132 	do {
1133 		struct sk_buff *skb;
1134 		u32 offset;
1135 
1136 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1137 		if (tp->urg_data && tp->urg_seq == *seq) {
1138 			if (copied)
1139 				break;
1140 			if (signal_pending(current)) {
1141 				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1142 				break;
1143 			}
1144 		}
1145 
1146 		/* Next get a buffer. */
1147 
1148 		skb = skb_peek(&sk->sk_receive_queue);
1149 		do {
1150 			if (!skb)
1151 				break;
1152 
1153 			/* Now that we have two receive queues this
1154 			 * shouldn't happen.
1155 			 */
1156 			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1157 				printk(KERN_INFO "recvmsg bug: copied %X "
1158 				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1159 				break;
1160 			}
1161 			offset = *seq - TCP_SKB_CB(skb)->seq;
1162 			if (skb->h.th->syn)
1163 				offset--;
1164 			if (offset < skb->len)
1165 				goto found_ok_skb;
1166 			if (skb->h.th->fin)
1167 				goto found_fin_ok;
1168 			BUG_TRAP(flags & MSG_PEEK);
1169 			skb = skb->next;
1170 		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1171 
1172 		/* Well, if we have backlog, try to process it now. */
1173 
1174 		if (copied >= target && !sk->sk_backlog.tail)
1175 			break;
1176 
1177 		if (copied) {
1178 			if (sk->sk_err ||
1179 			    sk->sk_state == TCP_CLOSE ||
1180 			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1181 			    !timeo ||
1182 			    signal_pending(current) ||
1183 			    (flags & MSG_PEEK))
1184 				break;
1185 		} else {
1186 			if (sock_flag(sk, SOCK_DONE))
1187 				break;
1188 
1189 			if (sk->sk_err) {
1190 				copied = sock_error(sk);
1191 				break;
1192 			}
1193 
1194 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1195 				break;
1196 
1197 			if (sk->sk_state == TCP_CLOSE) {
1198 				if (!sock_flag(sk, SOCK_DONE)) {
1199 					/* This occurs when user tries to read
1200 					 * from a never-connected socket.
1201 					 */
1202 					copied = -ENOTCONN;
1203 					break;
1204 				}
1205 				break;
1206 			}
1207 
1208 			if (!timeo) {
1209 				copied = -EAGAIN;
1210 				break;
1211 			}
1212 
1213 			if (signal_pending(current)) {
1214 				copied = sock_intr_errno(timeo);
1215 				break;
1216 			}
1217 		}
1218 
1219 		cleanup_rbuf(sk, copied);
1220 
1221 		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1222 			/* Install new reader */
1223 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1224 				user_recv = current;
1225 				tp->ucopy.task = user_recv;
1226 				tp->ucopy.iov = msg->msg_iov;
1227 			}
1228 
1229 			tp->ucopy.len = len;
1230 
1231 			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1232 				 (flags & (MSG_PEEK | MSG_TRUNC)));
1233 
1234 			/* Ugly... If prequeue is not empty, we have to
1235 			 * process it before releasing socket, otherwise
1236 			 * order will be broken at second iteration.
1237 			 * More elegant solution is required!!!
1238 			 *
1239 			 * Look: we have the following (pseudo)queues:
1240 			 *
1241 			 * 1. packets in flight
1242 			 * 2. backlog
1243 			 * 3. prequeue
1244 			 * 4. receive_queue
1245 			 *
1246 			 * Each queue can be processed only if the next ones
1247 			 * are empty. At this point we have empty receive_queue.
1248 			 * But prequeue _can_ be not empty after 2nd iteration,
1249 			 * when we jumped to start of loop because backlog
1250 			 * processing added something to receive_queue.
1251 			 * We cannot release_sock(), because backlog contains
1252 			 * packets arrived _after_ prequeued ones.
1253 			 *
1254 			 * Shortly, algorithm is clear --- to process all
1255 			 * the queues in order. We could make it more directly,
1256 			 * requeueing packets from backlog to prequeue, if it
1257 			 * is not empty. It is more elegant, but eats cycles,
1258 			 * unfortunately.
1259 			 */
1260 			if (!skb_queue_empty(&tp->ucopy.prequeue))
1261 				goto do_prequeue;
1262 
1263 			/* __ Set realtime policy in scheduler __ */
1264 		}
1265 
1266 		if (copied >= target) {
1267 			/* Do not sleep, just process backlog. */
1268 			release_sock(sk);
1269 			lock_sock(sk);
1270 		} else
1271 			sk_wait_data(sk, &timeo);
1272 
1273 		if (user_recv) {
1274 			int chunk;
1275 
1276 			/* __ Restore normal policy in scheduler __ */
1277 
1278 			if ((chunk = len - tp->ucopy.len) != 0) {
1279 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1280 				len -= chunk;
1281 				copied += chunk;
1282 			}
1283 
1284 			if (tp->rcv_nxt == tp->copied_seq &&
1285 			    !skb_queue_empty(&tp->ucopy.prequeue)) {
1286 do_prequeue:
1287 				tcp_prequeue_process(sk);
1288 
1289 				if ((chunk = len - tp->ucopy.len) != 0) {
1290 					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1291 					len -= chunk;
1292 					copied += chunk;
1293 				}
1294 			}
1295 		}
1296 		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1297 			if (net_ratelimit())
1298 				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1299 				       current->comm, current->pid);
1300 			peek_seq = tp->copied_seq;
1301 		}
1302 		continue;
1303 
1304 	found_ok_skb:
1305 		/* Ok so how much can we use? */
1306 		used = skb->len - offset;
1307 		if (len < used)
1308 			used = len;
1309 
1310 		/* Do we have urgent data here? */
1311 		if (tp->urg_data) {
1312 			u32 urg_offset = tp->urg_seq - *seq;
1313 			if (urg_offset < used) {
1314 				if (!urg_offset) {
1315 					if (!sock_flag(sk, SOCK_URGINLINE)) {
1316 						++*seq;
1317 						offset++;
1318 						used--;
1319 						if (!used)
1320 							goto skip_copy;
1321 					}
1322 				} else
1323 					used = urg_offset;
1324 			}
1325 		}
1326 
1327 		if (!(flags & MSG_TRUNC)) {
1328 			err = skb_copy_datagram_iovec(skb, offset,
1329 						      msg->msg_iov, used);
1330 			if (err) {
1331 				/* Exception. Bailout! */
1332 				if (!copied)
1333 					copied = -EFAULT;
1334 				break;
1335 			}
1336 		}
1337 
1338 		*seq += used;
1339 		copied += used;
1340 		len -= used;
1341 
1342 		tcp_rcv_space_adjust(sk);
1343 
1344 skip_copy:
1345 		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1346 			tp->urg_data = 0;
1347 			tcp_fast_path_check(sk, tp);
1348 		}
1349 		if (used + offset < skb->len)
1350 			continue;
1351 
1352 		if (skb->h.th->fin)
1353 			goto found_fin_ok;
1354 		if (!(flags & MSG_PEEK))
1355 			sk_eat_skb(sk, skb);
1356 		continue;
1357 
1358 	found_fin_ok:
1359 		/* Process the FIN. */
1360 		++*seq;
1361 		if (!(flags & MSG_PEEK))
1362 			sk_eat_skb(sk, skb);
1363 		break;
1364 	} while (len > 0);
1365 
1366 	if (user_recv) {
1367 		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1368 			int chunk;
1369 
1370 			tp->ucopy.len = copied > 0 ? len : 0;
1371 
1372 			tcp_prequeue_process(sk);
1373 
1374 			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1375 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1376 				len -= chunk;
1377 				copied += chunk;
1378 			}
1379 		}
1380 
1381 		tp->ucopy.task = NULL;
1382 		tp->ucopy.len = 0;
1383 	}
1384 
1385 	/* According to UNIX98, msg_name/msg_namelen are ignored
1386 	 * on connected socket. I was just happy when found this 8) --ANK
1387 	 */
1388 
1389 	/* Clean up data we have read: This will do ACK frames. */
1390 	cleanup_rbuf(sk, copied);
1391 
1392 	TCP_CHECK_TIMER(sk);
1393 	release_sock(sk);
1394 	return copied;
1395 
1396 out:
1397 	TCP_CHECK_TIMER(sk);
1398 	release_sock(sk);
1399 	return err;
1400 
1401 recv_urg:
1402 	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1403 	goto out;
1404 }
1405 
1406 /*
1407  *	State processing on a close. This implements the state shift for
1408  *	sending our FIN frame. Note that we only send a FIN for some
1409  *	states. A shutdown() may have already sent the FIN, or we may be
1410  *	closed.
1411  */
1412 
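/* For each current state this table encodes the state to move to on
 * close(), with TCP_ACTION_FIN or'ed in when the transition also requires
 * sending a FIN; tcp_close_state() separates the two with TCP_STATE_MASK.
 */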
1413 static unsigned char new_state[16] = {
1414   /* current state:        new state:      action:	*/
1415   /* (Invalid)		*/ TCP_CLOSE,
1416   /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1417   /* TCP_SYN_SENT	*/ TCP_CLOSE,
1418   /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1419   /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1420   /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1421   /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1422   /* TCP_CLOSE		*/ TCP_CLOSE,
1423   /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1424   /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1425   /* TCP_LISTEN		*/ TCP_CLOSE,
1426   /* TCP_CLOSING	*/ TCP_CLOSING,
1427 };
1428 
1429 static int tcp_close_state(struct sock *sk)
1430 {
1431 	int next = (int)new_state[sk->sk_state];
1432 	int ns = next & TCP_STATE_MASK;
1433 
1434 	tcp_set_state(sk, ns);
1435 
1436 	return next & TCP_ACTION_FIN;
1437 }
1438 
1439 /*
1440  *	Shutdown the sending side of a connection. Much like close except
1441  *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
1442  */
1443 
1444 void tcp_shutdown(struct sock *sk, int how)
1445 {
1446 	/*	We need to grab some memory, and put together a FIN,
1447 	 *	and then put it into the queue to be sent.
1448 	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1449 	 */
1450 	if (!(how & SEND_SHUTDOWN))
1451 		return;
1452 
1453 	/* If we've already sent a FIN, or it's a closed state, skip this. */
1454 	if ((1 << sk->sk_state) &
1455 	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1456 	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1457 		/* Clear out any half completed packets.  FIN if needed. */
1458 		if (tcp_close_state(sk))
1459 			tcp_send_fin(sk);
1460 	}
1461 }
1462 
1463 void tcp_close(struct sock *sk, long timeout)
1464 {
1465 	struct sk_buff *skb;
1466 	int data_was_unread = 0;
1467 
1468 	lock_sock(sk);
1469 	sk->sk_shutdown = SHUTDOWN_MASK;
1470 
1471 	if (sk->sk_state == TCP_LISTEN) {
1472 		tcp_set_state(sk, TCP_CLOSE);
1473 
1474 		/* Special case. */
1475 		inet_csk_listen_stop(sk);
1476 
1477 		goto adjudge_to_death;
1478 	}
1479 
1480 	/*  We need to flush the recv. buffs.  We do this only on the
1481 	 *  descriptor close, not protocol-sourced closes, because the
1482 	 *  reader process may not have drained the data yet!
1483 	 */
1484 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1485 		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1486 			  skb->h.th->fin;
1487 		data_was_unread += len;
1488 		__kfree_skb(skb);
1489 	}
1490 
1491 	sk_stream_mem_reclaim(sk);
1492 
1493 	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1494 	 * 3.10, we send a RST here because data was lost.  To
1495 	 * witness the awful effects of the old behavior of always
1496 	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1497 	 * a bulk GET in an FTP client, suspend the process, wait
1498 	 * for the client to advertise a zero window, then kill -9
1499 	 * the FTP client, wheee...  Note: timeout is always zero
1500 	 * in such a case.
1501 	 */
1502 	if (data_was_unread) {
1503 		/* Unread data was tossed, zap the connection. */
1504 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1505 		tcp_set_state(sk, TCP_CLOSE);
1506 		tcp_send_active_reset(sk, GFP_KERNEL);
1507 	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1508 		/* Check zero linger _after_ checking for unread data. */
1509 		sk->sk_prot->disconnect(sk, 0);
1510 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1511 	} else if (tcp_close_state(sk)) {
1512 		/* We FIN if the application ate all the data before
1513 		 * zapping the connection.
1514 		 */
1515 
1516 		/* RED-PEN. Formally speaking, we have broken TCP state
1517 		 * machine. State transitions:
1518 		 *
1519 		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1520 		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1521 		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1522 		 *
1523 		 * are legal only when FIN has been sent (i.e. in window),
1524 		 * rather than queued out of window. Purists blame.
1525 		 *
1526 		 * F.e. "RFC state" is ESTABLISHED,
1527 		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1528 		 *
1529 		 * The visible deviations are that sometimes
1530 		 * we enter the time-wait state when it is not really required
1531 		 * (harmless), and do not send active resets when they are
1532 		 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1533 		 * they look like CLOSING or LAST_ACK to Linux).
1534 		 * Probably, I missed some more holelets.
1535 		 * 						--ANK
1536 		 */
1537 		tcp_send_fin(sk);
1538 	}
1539 
1540 	sk_stream_wait_close(sk, timeout);
1541 
1542 adjudge_to_death:
1543 	/* It is the last release_sock in its life. It will remove backlog. */
1544 	release_sock(sk);
1545 
1546 
1547 	/* Now socket is owned by kernel and we acquire BH lock
1548 	   to finish close. No need to check for user refs.
1549 	 */
1550 	local_bh_disable();
1551 	bh_lock_sock(sk);
1552 	BUG_TRAP(!sock_owned_by_user(sk));
1553 
1554 	sock_hold(sk);
1555 	sock_orphan(sk);
1556 
1557 	/*	This is a (useful) BSD violation of the RFC. There is a
1558 	 *	problem with TCP as specified, in that the other end could
1559 	 *	keep a socket open forever with no application left at this end.
1560 	 *	We use a 3 minute timeout (about the same as BSD), then kill
1561 	 *	our end. If they send after that then tough - BUT it is long
1562 	 *	enough that we won't repeat the old "4*rto = almost no time -
1563 	 *	whoops, reset" mistake.
1564 	 *
1565 	 *	Nope, it was not a mistake. It is really desired behaviour
1566 	 *	f.e. on http servers, when such sockets are useless, but
1567 	 *	consume significant resources. Let's do it with special
1568 	 *	linger2	option.					--ANK
1569 	 */
1570 
1571 	if (sk->sk_state == TCP_FIN_WAIT2) {
1572 		struct tcp_sock *tp = tcp_sk(sk);
1573 		if (tp->linger2 < 0) {
1574 			tcp_set_state(sk, TCP_CLOSE);
1575 			tcp_send_active_reset(sk, GFP_ATOMIC);
1576 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1577 		} else {
1578 			const int tmo = tcp_fin_time(sk);
1579 
1580 			if (tmo > TCP_TIMEWAIT_LEN) {
1581 				inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
1582 			} else {
1583 				atomic_inc(sk->sk_prot->orphan_count);
1584 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1585 				goto out;
1586 			}
1587 		}
1588 	}
1589 	if (sk->sk_state != TCP_CLOSE) {
1590 		sk_stream_mem_reclaim(sk);
1591 		if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1592 		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1593 		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1594 			if (net_ratelimit())
1595 				printk(KERN_INFO "TCP: too many orphaned "
1596 				       "sockets\n");
1597 			tcp_set_state(sk, TCP_CLOSE);
1598 			tcp_send_active_reset(sk, GFP_ATOMIC);
1599 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1600 		}
1601 	}
1602 	atomic_inc(sk->sk_prot->orphan_count);
1603 
1604 	if (sk->sk_state == TCP_CLOSE)
1605 		inet_csk_destroy_sock(sk);
1606 	/* Otherwise, socket is reprieved until protocol close. */
1607 
1608 out:
1609 	bh_unlock_sock(sk);
1610 	local_bh_enable();
1611 	sock_put(sk);
1612 }
1613 
1614 /* These states need RST on ABORT according to RFC793 */
1615 
1616 static inline int tcp_need_reset(int state)
1617 {
1618 	return (1 << state) &
1619 	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1620 		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1621 }
1622 
1623 int tcp_disconnect(struct sock *sk, int flags)
1624 {
1625 	struct inet_sock *inet = inet_sk(sk);
1626 	struct inet_connection_sock *icsk = inet_csk(sk);
1627 	struct tcp_sock *tp = tcp_sk(sk);
1628 	int err = 0;
1629 	int old_state = sk->sk_state;
1630 
1631 	if (old_state != TCP_CLOSE)
1632 		tcp_set_state(sk, TCP_CLOSE);
1633 
1634 	/* ABORT function of RFC793 */
1635 	if (old_state == TCP_LISTEN) {
1636 		inet_csk_listen_stop(sk);
1637 	} else if (tcp_need_reset(old_state) ||
1638 		   (tp->snd_nxt != tp->write_seq &&
1639 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1640 		/* The last check adjusts for the discrepancy of Linux wrt. RFC
1641 		 * states
1642 		 */
1643 		tcp_send_active_reset(sk, gfp_any());
1644 		sk->sk_err = ECONNRESET;
1645 	} else if (old_state == TCP_SYN_SENT)
1646 		sk->sk_err = ECONNRESET;
1647 
1648 	tcp_clear_xmit_timers(sk);
1649 	__skb_queue_purge(&sk->sk_receive_queue);
1650 	sk_stream_writequeue_purge(sk);
1651 	__skb_queue_purge(&tp->out_of_order_queue);
1652 
1653 	inet->dport = 0;
1654 
1655 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1656 		inet_reset_saddr(sk);
1657 
1658 	sk->sk_shutdown = 0;
1659 	sock_reset_flag(sk, SOCK_DONE);
1660 	tp->srtt = 0;
1661 	if ((tp->write_seq += tp->max_window + 2) == 0)
1662 		tp->write_seq = 1;
1663 	icsk->icsk_backoff = 0;
1664 	tp->snd_cwnd = 2;
1665 	icsk->icsk_probes_out = 0;
1666 	tp->packets_out = 0;
1667 	tp->snd_ssthresh = 0x7fffffff;
1668 	tp->snd_cwnd_cnt = 0;
1669 	tcp_set_ca_state(sk, TCP_CA_Open);
1670 	tcp_clear_retrans(tp);
1671 	inet_csk_delack_init(sk);
1672 	sk->sk_send_head = NULL;
1673 	tp->rx_opt.saw_tstamp = 0;
1674 	tcp_sack_reset(&tp->rx_opt);
1675 	__sk_dst_reset(sk);
1676 
1677 	BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1678 
1679 	sk->sk_error_report(sk);
1680 	return err;
1681 }
1682 
1683 /*
1684  *	Socket option code for TCP.
1685  */
1686 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1687 		   int optlen)
1688 {
1689 	struct tcp_sock *tp = tcp_sk(sk);
1690 	struct inet_connection_sock *icsk = inet_csk(sk);
1691 	int val;
1692 	int err = 0;
1693 
1694 	if (level != SOL_TCP)
1695 		return tp->af_specific->setsockopt(sk, level, optname,
1696 						   optval, optlen);
1697 
1698 	/* This is a string value; all the others are ints. */
1699 	if (optname == TCP_CONGESTION) {
1700 		char name[TCP_CA_NAME_MAX];
1701 
1702 		if (optlen < 1)
1703 			return -EINVAL;
1704 
1705 		val = strncpy_from_user(name, optval,
1706 					min(TCP_CA_NAME_MAX-1, optlen));
1707 		if (val < 0)
1708 			return -EFAULT;
1709 		name[val] = 0;
1710 
1711 		lock_sock(sk);
1712 		err = tcp_set_congestion_control(sk, name);
1713 		release_sock(sk);
1714 		return err;
1715 	}
1716 
1717 	if (optlen < sizeof(int))
1718 		return -EINVAL;
1719 
1720 	if (get_user(val, (int __user *)optval))
1721 		return -EFAULT;
1722 
1723 	lock_sock(sk);
1724 
1725 	switch (optname) {
1726 	case TCP_MAXSEG:
1727 		/* Values greater than interface MTU won't take effect. However
1728 		 * at the point when this call is done we typically don't yet
1729 		 * know which interface is going to be used. */
1730 		if (val < 8 || val > MAX_TCP_WINDOW) {
1731 			err = -EINVAL;
1732 			break;
1733 		}
1734 		tp->rx_opt.user_mss = val;
1735 		break;
1736 
1737 	case TCP_NODELAY:
1738 		if (val) {
1739 			/* TCP_NODELAY is weaker than TCP_CORK, so that
1740 			 * this option on corked socket is remembered, but
1741 			 * it is not activated until cork is cleared.
1742 			 *
1743 			 * However, when TCP_NODELAY is set we make
1744 			 * an explicit push, which overrides even TCP_CORK
1745 			 * for currently queued segments.
1746 			 */
1747 			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1748 			tcp_push_pending_frames(sk, tp);
1749 		} else {
1750 			tp->nonagle &= ~TCP_NAGLE_OFF;
1751 		}
1752 		break;
1753 
1754 	case TCP_CORK:
1755 		/* When set, this indicates that non-full frames should always
1756 		 * be queued.  Later the user clears this option and we transmit
1757 		 * any pending partial frames in the queue.  This is meant to
1758 		 * be used alongside sendfile() to get properly filled frames
1759 		 * when the user (for example) must write out headers with a
1760 		 * write() call first and then use sendfile() to send out the
1761 		 * data parts.
1762 		 *
1763 		 * TCP_CORK can be set together with TCP_NODELAY and it is
1764 		 * stronger than TCP_NODELAY.
1765 		 */
1766 		if (val) {
1767 			tp->nonagle |= TCP_NAGLE_CORK;
1768 		} else {
1769 			tp->nonagle &= ~TCP_NAGLE_CORK;
1770 			if (tp->nonagle&TCP_NAGLE_OFF)
1771 				tp->nonagle |= TCP_NAGLE_PUSH;
1772 			tcp_push_pending_frames(sk, tp);
1773 		}
1774 		break;
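	/* Editor's note: illustrative userspace sketch, not part of the
	 * original file, of the header-plus-sendfile() pattern the comment
	 * above describes.  fd, filefd, hdr, hdr_len and file_len are
	 * assumptions.
	 *
	 *	#include <netinet/in.h>
	 *	#include <netinet/tcp.h>
	 *	#include <sys/sendfile.h>
	 *	#include <sys/socket.h>
	 *	#include <unistd.h>
	 *
	 *	int on = 1, off = 0;
	 *
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	 *	write(fd, hdr, hdr_len);
	 *	sendfile(fd, filefd, NULL, file_len);
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
	 *
	 * The header write is held back rather than sent as a runt frame,
	 * sendfile() fills full-sized frames behind it, and clearing the
	 * cork transmits whatever partial frame remains at the tail.
	 */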
1775 
1776 	case TCP_KEEPIDLE:
1777 		if (val < 1 || val > MAX_TCP_KEEPIDLE)
1778 			err = -EINVAL;
1779 		else {
1780 			tp->keepalive_time = val * HZ;
1781 			if (sock_flag(sk, SOCK_KEEPOPEN) &&
1782 			    !((1 << sk->sk_state) &
1783 			      (TCPF_CLOSE | TCPF_LISTEN))) {
1784 				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1785 				if (tp->keepalive_time > elapsed)
1786 					elapsed = tp->keepalive_time - elapsed;
1787 				else
1788 					elapsed = 0;
1789 				inet_csk_reset_keepalive_timer(sk, elapsed);
1790 			}
1791 		}
1792 		break;
1793 	case TCP_KEEPINTVL:
1794 		if (val < 1 || val > MAX_TCP_KEEPINTVL)
1795 			err = -EINVAL;
1796 		else
1797 			tp->keepalive_intvl = val * HZ;
1798 		break;
1799 	case TCP_KEEPCNT:
1800 		if (val < 1 || val > MAX_TCP_KEEPCNT)
1801 			err = -EINVAL;
1802 		else
1803 			tp->keepalive_probes = val;
1804 		break;
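	/* Editor's note: illustrative userspace sketch, not part of the
	 * original file, combining SO_KEEPALIVE with the three per-socket
	 * knobs handled above (idle time and probe interval in seconds,
	 * probe count as a plain count).  The concrete values are
	 * assumptions and must pass the range checks above.
	 *
	 *	#include <netinet/in.h>
	 *	#include <netinet/tcp.h>
	 *	#include <sys/socket.h>
	 *
	 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
	 *
	 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
	 */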
1805 	case TCP_SYNCNT:
1806 		if (val < 1 || val > MAX_TCP_SYNCNT)
1807 			err = -EINVAL;
1808 		else
1809 			icsk->icsk_syn_retries = val;
1810 		break;
1811 
1812 	case TCP_LINGER2:
1813 		if (val < 0)
1814 			tp->linger2 = -1;
1815 		else if (val > sysctl_tcp_fin_timeout / HZ)
1816 			tp->linger2 = 0;
1817 		else
1818 			tp->linger2 = val * HZ;
1819 		break;
1820 
1821 	case TCP_DEFER_ACCEPT:
1822 		icsk->icsk_accept_queue.rskq_defer_accept = 0;
1823 		if (val > 0) {
1824 			/* Translate the value in seconds into a number
1825 			 * of retransmits. */
1826 			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
1827 			       val > ((TCP_TIMEOUT_INIT / HZ) <<
1828 				       icsk->icsk_accept_queue.rskq_defer_accept))
1829 				icsk->icsk_accept_queue.rskq_defer_accept++;
1830 			icsk->icsk_accept_queue.rskq_defer_accept++;
1831 		}
1832 		break;
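	/* Editor's note: worked example of the seconds-to-retransmits
	 * translation above, assuming TCP_TIMEOUT_INIT / HZ == 3 (the
	 * usual 3 second initial RTO).  For val = 20 seconds:
	 *
	 *	20 > 3 << 0 (3)    ->  rskq_defer_accept = 1
	 *	20 > 3 << 1 (6)    ->  rskq_defer_accept = 2
	 *	20 > 3 << 2 (12)   ->  rskq_defer_accept = 3
	 *	20 > 3 << 3 (24)   is false, so the loop stops
	 *	final increment    ->  rskq_defer_accept = 4
	 *
	 * tcp_getsockopt(TCP_DEFER_ACCEPT) below reports this back as
	 * 3 << (4 - 1) = 24 seconds, i.e. the requested time is rounded
	 * up to the next retransmission boundary.
	 */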
1833 
1834 	case TCP_WINDOW_CLAMP:
1835 		if (!val) {
1836 			if (sk->sk_state != TCP_CLOSE) {
1837 				err = -EINVAL;
1838 				break;
1839 			}
1840 			tp->window_clamp = 0;
1841 		} else
1842 			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
1843 						SOCK_MIN_RCVBUF / 2 : val;
1844 		break;
1845 
1846 	case TCP_QUICKACK:
1847 		if (!val) {
1848 			icsk->icsk_ack.pingpong = 1;
1849 		} else {
1850 			icsk->icsk_ack.pingpong = 0;
1851 			if ((1 << sk->sk_state) &
1852 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1853 			    inet_csk_ack_scheduled(sk)) {
1854 				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1855 				cleanup_rbuf(sk, 1);
1856 				if (!(val & 1))
1857 					icsk->icsk_ack.pingpong = 1;
1858 			}
1859 		}
1860 		break;
1861 
1862 	default:
1863 		err = -ENOPROTOOPT;
1864 		break;
1865 	}
1866 	release_sock(sk);
1867 	return err;
1868 }
1869 
1870 /* Return information about the state of a TCP endpoint in API format. */
1871 void tcp_get_info(struct sock *sk, struct tcp_info *info)
1872 {
1873 	struct tcp_sock *tp = tcp_sk(sk);
1874 	const struct inet_connection_sock *icsk = inet_csk(sk);
1875 	u32 now = tcp_time_stamp;
1876 
1877 	memset(info, 0, sizeof(*info));
1878 
1879 	info->tcpi_state = sk->sk_state;
1880 	info->tcpi_ca_state = icsk->icsk_ca_state;
1881 	info->tcpi_retransmits = icsk->icsk_retransmits;
1882 	info->tcpi_probes = icsk->icsk_probes_out;
1883 	info->tcpi_backoff = icsk->icsk_backoff;
1884 
1885 	if (tp->rx_opt.tstamp_ok)
1886 		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1887 	if (tp->rx_opt.sack_ok)
1888 		info->tcpi_options |= TCPI_OPT_SACK;
1889 	if (tp->rx_opt.wscale_ok) {
1890 		info->tcpi_options |= TCPI_OPT_WSCALE;
1891 		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
1892 		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
1893 	}
1894 
1895 	if (tp->ecn_flags&TCP_ECN_OK)
1896 		info->tcpi_options |= TCPI_OPT_ECN;
1897 
1898 	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
1899 	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
1900 	info->tcpi_snd_mss = tp->mss_cache;
1901 	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
1902 
1903 	info->tcpi_unacked = tp->packets_out;
1904 	info->tcpi_sacked = tp->sacked_out;
1905 	info->tcpi_lost = tp->lost_out;
1906 	info->tcpi_retrans = tp->retrans_out;
1907 	info->tcpi_fackets = tp->fackets_out;
1908 
1909 	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
1910 	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
1911 	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
1912 
1913 	info->tcpi_pmtu = tp->pmtu_cookie;
1914 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
1915 	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
1916 	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
1917 	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
1918 	info->tcpi_snd_cwnd = tp->snd_cwnd;
1919 	info->tcpi_advmss = tp->advmss;
1920 	info->tcpi_reordering = tp->reordering;
1921 
1922 	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
1923 	info->tcpi_rcv_space = tp->rcvq_space.space;
1924 
1925 	info->tcpi_total_retrans = tp->total_retrans;
1926 }
1927 
1928 EXPORT_SYMBOL_GPL(tcp_get_info);
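/* Editor's note: illustrative userspace sketch, not part of the original
 * file, showing how the snapshot filled in by tcp_get_info() is usually
 * read back through getsockopt(TCP_INFO).  fd is assumed to be a TCP
 * socket.
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt %u us, cwnd %u, total retrans %u\n",
 *		       ti.tcpi_rtt, ti.tcpi_snd_cwnd, ti.tcpi_total_retrans);
 */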
1929 
1930 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
1931 		   int __user *optlen)
1932 {
1933 	struct inet_connection_sock *icsk = inet_csk(sk);
1934 	struct tcp_sock *tp = tcp_sk(sk);
1935 	int val, len;
1936 
1937 	if (level != SOL_TCP)
1938 		return tp->af_specific->getsockopt(sk, level, optname,
1939 						   optval, optlen);
1940 
1941 	if (get_user(len, optlen))
1942 		return -EFAULT;
1943 
1944 	len = min_t(unsigned int, len, sizeof(int));
1945 
1946 	if (len < 0)
1947 		return -EINVAL;
1948 
1949 	switch (optname) {
1950 	case TCP_MAXSEG:
1951 		val = tp->mss_cache;
1952 		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
1953 			val = tp->rx_opt.user_mss;
1954 		break;
1955 	case TCP_NODELAY:
1956 		val = !!(tp->nonagle&TCP_NAGLE_OFF);
1957 		break;
1958 	case TCP_CORK:
1959 		val = !!(tp->nonagle&TCP_NAGLE_CORK);
1960 		break;
1961 	case TCP_KEEPIDLE:
1962 		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
1963 		break;
1964 	case TCP_KEEPINTVL:
1965 		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
1966 		break;
1967 	case TCP_KEEPCNT:
1968 		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
1969 		break;
1970 	case TCP_SYNCNT:
1971 		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
1972 		break;
1973 	case TCP_LINGER2:
1974 		val = tp->linger2;
1975 		if (val >= 0)
1976 			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
1977 		break;
1978 	case TCP_DEFER_ACCEPT:
1979 		val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
1980 			((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
1981 		break;
1982 	case TCP_WINDOW_CLAMP:
1983 		val = tp->window_clamp;
1984 		break;
1985 	case TCP_INFO: {
1986 		struct tcp_info info;
1987 
1988 		if (get_user(len, optlen))
1989 			return -EFAULT;
1990 
1991 		tcp_get_info(sk, &info);
1992 
1993 		len = min_t(unsigned int, len, sizeof(info));
1994 		if (put_user(len, optlen))
1995 			return -EFAULT;
1996 		if (copy_to_user(optval, &info, len))
1997 			return -EFAULT;
1998 		return 0;
1999 	}
2000 	case TCP_QUICKACK:
2001 		val = !icsk->icsk_ack.pingpong;
2002 		break;
2003 
2004 	case TCP_CONGESTION:
2005 		if (get_user(len, optlen))
2006 			return -EFAULT;
2007 		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2008 		if (put_user(len, optlen))
2009 			return -EFAULT;
2010 		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2011 			return -EFAULT;
2012 		return 0;
2013 	default:
2014 		return -ENOPROTOOPT;
2015 	}
2016 
2017 	if (put_user(len, optlen))
2018 		return -EFAULT;
2019 	if (copy_to_user(optval, &val, len))
2020 		return -EFAULT;
2021 	return 0;
2022 }
2023 
2024 
2025 extern void __skb_cb_too_small_for_tcp(int, int);
2026 extern struct tcp_congestion_ops tcp_reno;
2027 
2028 static __initdata unsigned long thash_entries;
2029 static int __init set_thash_entries(char *str)
2030 {
2031 	if (!str)
2032 		return 0;
2033 	thash_entries = simple_strtoul(str, &str, 0);
2034 	return 1;
2035 }
2036 __setup("thash_entries=", set_thash_entries);
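/* Editor's note: usage sketch, not part of the original file.  The
 * __setup() hook above lets the size of the established hash table be
 * requested explicitly on the kernel command line, for example (the
 * value is arbitrary):
 *
 *	thash_entries=16384
 *
 * When the parameter is absent, thash_entries stays 0 and
 * alloc_large_system_hash() below sizes the table from available memory.
 */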
2037 
2038 void __init tcp_init(void)
2039 {
2040 	struct sk_buff *skb = NULL;
2041 	int order, i;
2042 
2043 	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2044 		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2045 					   sizeof(skb->cb));
2046 
2047 	tcp_hashinfo.bind_bucket_cachep =
2048 		kmem_cache_create("tcp_bind_bucket",
2049 				  sizeof(struct inet_bind_bucket), 0,
2050 				  SLAB_HWCACHE_ALIGN, NULL, NULL);
2051 	if (!tcp_hashinfo.bind_bucket_cachep)
2052 		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2053 
2054 	/* Size and allocate the main established and bind bucket
2055 	 * hash tables.
2056 	 *
2057 	 * The methodology is similar to that of the buffer cache.
2058 	 */
2059 	tcp_hashinfo.ehash =
2060 		alloc_large_system_hash("TCP established",
2061 					sizeof(struct inet_ehash_bucket),
2062 					thash_entries,
2063 					(num_physpages >= 128 * 1024) ?
2064 						(25 - PAGE_SHIFT) :
2065 						(27 - PAGE_SHIFT),
2066 					HASH_HIGHMEM,
2067 					&tcp_hashinfo.ehash_size,
2068 					NULL,
2069 					0);
2070 	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2071 	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2072 		rwlock_init(&tcp_hashinfo.ehash[i].lock);
2073 		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2074 	}
2075 
2076 	tcp_hashinfo.bhash =
2077 		alloc_large_system_hash("TCP bind",
2078 					sizeof(struct inet_bind_hashbucket),
2079 					tcp_hashinfo.ehash_size,
2080 					(num_physpages >= 128 * 1024) ?
2081 						(25 - PAGE_SHIFT) :
2082 						(27 - PAGE_SHIFT),
2083 					HASH_HIGHMEM,
2084 					&tcp_hashinfo.bhash_size,
2085 					NULL,
2086 					64 * 1024);
2087 	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2088 	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2089 		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2090 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2091 	}
2092 
2093 	/* Try to be a bit smarter and adjust defaults depending
2094 	 * on available memory.
2095 	 */
2096 	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2097 			(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2098 			order++)
2099 		;
2100 	if (order >= 4) {
2101 		sysctl_local_port_range[0] = 32768;
2102 		sysctl_local_port_range[1] = 61000;
2103 		tcp_death_row.sysctl_max_tw_buckets = 180000;
2104 		sysctl_tcp_max_orphans = 4096 << (order - 4);
2105 		sysctl_max_syn_backlog = 1024;
2106 	} else if (order < 3) {
2107 		sysctl_local_port_range[0] = 1024 * (3 - order);
2108 		tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2109 		sysctl_tcp_max_orphans >>= (3 - order);
2110 		sysctl_max_syn_backlog = 128;
2111 	}
2112 	tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
2113 
2114 	sysctl_tcp_mem[0] =  768 << order;
2115 	sysctl_tcp_mem[1] = 1024 << order;
2116 	sysctl_tcp_mem[2] = 1536 << order;
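	/* Editor's note: worked example of the sizing above, under the
	 * assumptions PAGE_SHIFT == 12 (4 KiB pages), an 8-byte
	 * struct inet_bind_hashbucket and bhash_size == 65536:
	 *
	 *	bind table = 65536 * 8 = 512 KiB = 128 pages, so order = 7
	 *	order >= 4: ports 32768..61000, 180000 timewait buckets,
	 *		    max_orphans = 4096 << 3 = 32768, SYN backlog 1024
	 *	tcp_mem    = { 768 << 7, 1024 << 7, 1536 << 7 }
	 *		   = { 98304, 131072, 196608 } pages
	 *		   = 384 MiB / 512 MiB / 768 MiB
	 */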
2117 
2118 	if (order < 3) {
2119 		sysctl_tcp_wmem[2] = 64 * 1024;
2120 		sysctl_tcp_rmem[0] = PAGE_SIZE;
2121 		sysctl_tcp_rmem[1] = 43689;
2122 		sysctl_tcp_rmem[2] = 2 * 43689;
2123 	}
2124 
2125 	printk(KERN_INFO "TCP: Hash tables configured "
2126 	       "(established %d bind %d)\n",
2127 	       tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2128 
2129 	tcp_register_congestion_control(&tcp_reno);
2130 }
2131 
2132 EXPORT_SYMBOL(tcp_close);
2133 EXPORT_SYMBOL(tcp_disconnect);
2134 EXPORT_SYMBOL(tcp_getsockopt);
2135 EXPORT_SYMBOL(tcp_ioctl);
2136 EXPORT_SYMBOL(tcp_poll);
2137 EXPORT_SYMBOL(tcp_read_sock);
2138 EXPORT_SYMBOL(tcp_recvmsg);
2139 EXPORT_SYMBOL(tcp_sendmsg);
2140 EXPORT_SYMBOL(tcp_sendpage);
2141 EXPORT_SYMBOL(tcp_setsockopt);
2142 EXPORT_SYMBOL(tcp_shutdown);
2143 EXPORT_SYMBOL(tcp_statistics);
2144