xref: /linux/net/ipv4/tcp.c (revision 6e8331ac6973435b1e7604c30f2ad394035b46e1)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *		Florian La Roche, <flla@stud.uni-sb.de>
15  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *		Matthew Dillon, <dillon@apollo.west.oic.com>
19  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *		Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *		Alan Cox	:	Numerous verify_area() calls
24  *		Alan Cox	:	Set the ACK bit on a reset
25  *		Alan Cox	:	Stopped it crashing if it closed while
26  *					sk->inuse=1 and was trying to connect
27  *					(tcp_err()).
28  *		Alan Cox	:	All icmp error handling was broken
29  *					pointers passed where wrong and the
30  *					socket was looked up backwards. Nobody
31  *					tested any icmp error code obviously.
32  *		Alan Cox	:	tcp_err() now handled properly. It
33  *					wakes people on errors. poll
34  *					behaves and the icmp error race
35  *					has gone by moving it into sock.c
36  *		Alan Cox	:	tcp_send_reset() fixed to work for
37  *					everything not just packets for
38  *					unknown sockets.
39  *		Alan Cox	:	tcp option processing.
40  *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41  *					syn rule wrong]
42  *		Herp Rosmanith  :	More reset fixes
43  *		Alan Cox	:	No longer acks invalid rst frames.
44  *					Acking any kind of RST is right out.
45  *		Alan Cox	:	Sets an ignore me flag on an rst
46  *					receive otherwise odd bits of prattle
47  *					escape still
48  *		Alan Cox	:	Fixed another acking RST frame bug.
49  *					Should stop LAN workplace lockups.
50  *		Alan Cox	: 	Some tidyups using the new skb list
51  *					facilities
52  *		Alan Cox	:	sk->keepopen now seems to work
53  *		Alan Cox	:	Pulls options out correctly on accepts
54  *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55  *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56  *					bit to skb ops.
57  *		Alan Cox	:	Tidied tcp_data to avoid a potential
58  *					nasty.
59  *		Alan Cox	:	Added some better commenting, as the
60  *					tcp is hard to follow
61  *		Alan Cox	:	Removed incorrect check for 20 * psh
62  *	Michael O'Reilly	:	ack < copied bug fix.
63  *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64  *		Alan Cox	:	FIN with no memory -> CRASH
65  *		Alan Cox	:	Added socket option proto entries.
66  *					Also added awareness of them to accept.
67  *		Alan Cox	:	Added TCP options (SOL_TCP)
68  *		Alan Cox	:	Switched wakeup calls to callbacks,
69  *					so the kernel can layer network
70  *					sockets.
71  *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72  *		Alan Cox	:	Handle FIN (more) properly (we hope).
73  *		Alan Cox	:	RST frames sent on unsynchronised
74  *					state ack error.
75  *		Alan Cox	:	Put in missing check for SYN bit.
76  *		Alan Cox	:	Added tcp_select_window() aka NET2E
77  *					window non shrink trick.
78  *		Alan Cox	:	Added a couple of small NET2E timer
79  *					fixes
80  *		Charles Hedrick :	TCP fixes
81  *		Toomas Tamm	:	TCP window fixes
82  *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83  *		Charles Hedrick	:	Rewrote most of it to actually work
84  *		Linus		:	Rewrote tcp_read() and URG handling
85  *					completely
86  *		Gerhard Koerting:	Fixed some missing timer handling
87  *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88  *		Gerhard Koerting:	PC/TCP workarounds
89  *		Adam Caldwell	:	Assorted timer/timing errors
90  *		Matthew Dillon	:	Fixed another RST bug
91  *		Alan Cox	:	Move to kernel side addressing changes.
92  *		Alan Cox	:	Beginning work on TCP fastpathing
93  *					(not yet usable)
94  *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95  *		Alan Cox	:	TCP fast path debugging
96  *		Alan Cox	:	Window clamping
97  *		Michael Riepe	:	Bug in tcp_check()
98  *		Matt Dillon	:	More TCP improvements and RST bug fixes
99  *		Matt Dillon	:	Yet more small nasties removed from the
100  *					TCP code (Be very nice to this man if
101  *					tcp finally works 100%) 8)
102  *		Alan Cox	:	BSD accept semantics.
103  *		Alan Cox	:	Reset on closedown bug.
104  *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105  *		Michael Pall	:	Handle poll() after URG properly in
106  *					all cases.
107  *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108  *					(multi URG PUSH broke rlogin).
109  *		Michael Pall	:	Fix the multi URG PUSH problem in
110  *					tcp_readable(), poll() after URG
111  *					works now.
112  *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113  *					BSD api.
114  *		Alan Cox	:	Changed the semantics of sk->socket to
115  *					fix a race and a signal problem with
116  *					accept() and async I/O.
117  *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118  *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119  *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120  *					clients/servers which listen in on
121  *					fixed ports.
122  *		Alan Cox	:	Cleaned the above up and shrank it to
123  *					a sensible code size.
124  *		Alan Cox	:	Self connect lockup fix.
125  *		Alan Cox	:	No connect to multicast.
126  *		Ross Biro	:	Close unaccepted children on master
127  *					socket close.
128  *		Alan Cox	:	Reset tracing code.
129  *		Alan Cox	:	Spurious resets on shutdown.
130  *		Alan Cox	:	Giant 15 minute/60 second timer error
131  *		Alan Cox	:	Small whoops in polling before an
132  *					accept.
133  *		Alan Cox	:	Kept the state trace facility since
134  *					it's handy for debugging.
135  *		Alan Cox	:	More reset handler fixes.
136  *		Alan Cox	:	Started rewriting the code based on
137  *					the RFC's for other useful protocol
138  *					references see: Comer, KA9Q NOS, and
139  *					for a reference on the difference
140  *					between specifications and how BSD
141  *					works see the 4.4lite source.
142  *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143  *					close.
144  *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145  *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146  *		Alan Cox	:	Reimplemented timers as per the RFC
147  *					and using multiple timers for sanity.
148  *		Alan Cox	:	Small bug fixes, and a lot of new
149  *					comments.
150  *		Alan Cox	:	Fixed dual reader crash by locking
151  *					the buffers (much like datagram.c)
152  *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153  *					now gets fed up of retrying without
154  *					(even a no space) answer.
155  *		Alan Cox	:	Extracted closing code better
156  *		Alan Cox	:	Fixed the closing state machine to
157  *					resemble the RFC.
158  *		Alan Cox	:	More 'per spec' fixes.
159  *		Jorge Cwik	:	Even faster checksumming.
160  *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161  *					only frames. At least one pc tcp stack
162  *					generates them.
163  *		Alan Cox	:	Cache last socket.
164  *		Alan Cox	:	Per route irtt.
165  *		Matt Day	:	poll()->select() match BSD precisely on error
166  *		Alan Cox	:	New buffers
167  *		Marc Tamsky	:	Various sk->prot->retransmits and
168  *					sk->retransmits misupdating fixed.
169  *					Fixed tcp_write_timeout: stuck close,
170  *					and TCP syn retries gets used now.
171  *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172  *					ack if state is TCP_CLOSED.
173  *		Alan Cox	:	Look up device on a retransmit - routes may
174  *					change. Doesn't yet cope with MSS shrink right
175  *					but it's a start!
176  *		Marc Tamsky	:	Closing in closing fixes.
177  *		Mike Shaver	:	RFC1122 verifications.
178  *		Alan Cox	:	rcv_saddr errors.
179  *		Alan Cox	:	Block double connect().
180  *		Alan Cox	:	Small hooks for enSKIP.
181  *		Alexey Kuznetsov:	Path MTU discovery.
182  *		Alan Cox	:	Support soft errors.
183  *		Alan Cox	:	Fix MTU discovery pathological case
184  *					when the remote claims no mtu!
185  *		Marc Tamsky	:	TCP_CLOSE fix.
186  *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187  *					window but wrong (fixes NT lpd problems)
188  *		Pedro Roque	:	Better TCP window handling, delayed ack.
189  *		Joerg Reuter	:	No modification of locked buffers in
190  *					tcp_do_retransmit()
191  *		Eric Schenk	:	Changed receiver side silly window
192  *					avoidance algorithm to BSD style
193  *					algorithm. This doubles throughput
194  *					against machines running Solaris,
195  *					and seems to result in general
196  *					improvement.
197  *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198  *	Willy Konynenberg	:	Transparent proxying support.
199  *	Mike McLagan		:	Routing by source
200  *		Keith Owens	:	Do proper merging with partial SKB's in
201  *					tcp_do_sendmsg to avoid burstiness.
202  *		Eric Schenk	:	Fix fast close down bug with
203  *					shutdown() followed by close().
204  *		Andi Kleen 	:	Make poll agree with SIGIO
205  *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206  *					lingertime == 0 (RFC 793 ABORT Call)
207  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208  *					csum_and_copy_from_user() if possible.
209  *
210  *		This program is free software; you can redistribute it and/or
211  *		modify it under the terms of the GNU General Public License
212  *		as published by the Free Software Foundation; either version
213  *		2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *	TCP_SYN_SENT		sent a connection request, waiting for ack
218  *
219  *	TCP_SYN_RECV		received a connection request, sent ack,
220  *				waiting for final ack in three-way handshake.
221  *
222  *	TCP_ESTABLISHED		connection established
223  *
224  *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225  *				transmission of remaining buffered data
226  *
227  *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228  *				to shutdown
229  *
230  *	TCP_CLOSING		both sides have shutdown but we still have
231  *				data we have to finish sending
232  *
233  *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234  *				closed, can only be entered from FIN_WAIT2
235  *				or CLOSING.  Required because the other end
236  *				may not have gotten our last ACK causing it
237  *				to retransmit the data packet (which we ignore)
238  *
239  *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240  *				us to finish writing our data and to shutdown
241  *				(we have to close() to move on to LAST_ACK)
242  *
243  *	TCP_LAST_ACK		our side has shutdown after the remote has
244  *				shutdown.  There may still be data in our
245  *				buffer that we have to finish sending
246  *
247  *	TCP_CLOSE		socket is finished
248  */
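
/*
 * Example: the states listed above can be observed from userspace via
 * getsockopt(TCP_INFO); the tcpi_state field carries the TCP_* value.
 * A minimal illustrative sketch (not part of this file):
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	static int tcp_state_of(int fd)
 *	{
 *		struct tcp_info info;
 *		socklen_t len = sizeof(info);
 *
 *		if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
 *			return -1;
 *		return info.tcpi_state;	// e.g. TCP_ESTABLISHED, TCP_FIN_WAIT1
 *	}
 */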
249 
250 #include <linux/module.h>
251 #include <linux/types.h>
252 #include <linux/fcntl.h>
253 #include <linux/poll.h>
254 #include <linux/init.h>
255 #include <linux/smp_lock.h>
256 #include <linux/fs.h>
257 #include <linux/random.h>
258 #include <linux/bootmem.h>
259 #include <linux/cache.h>
260 #include <linux/err.h>
261 
262 #include <net/icmp.h>
263 #include <net/tcp.h>
264 #include <net/xfrm.h>
265 #include <net/ip.h>
266 #include <net/netdma.h>
267 
268 #include <asm/uaccess.h>
269 #include <asm/ioctls.h>
270 
271 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
272 
273 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
274 
275 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
276 
277 EXPORT_SYMBOL_GPL(tcp_orphan_count);
278 
279 int sysctl_tcp_mem[3] __read_mostly;
280 int sysctl_tcp_wmem[3] __read_mostly;
281 int sysctl_tcp_rmem[3] __read_mostly;
282 
283 EXPORT_SYMBOL(sysctl_tcp_mem);
284 EXPORT_SYMBOL(sysctl_tcp_rmem);
285 EXPORT_SYMBOL(sysctl_tcp_wmem);
286 
287 atomic_t tcp_memory_allocated;	/* Current allocated memory. */
288 atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
289 
290 EXPORT_SYMBOL(tcp_memory_allocated);
291 EXPORT_SYMBOL(tcp_sockets_allocated);
292 
293 /*
294  * Pressure flag: try to collapse.
295  * Technical note: it is used by multiple contexts non atomically.
296  * All the sk_stream_mem_schedule() is of this nature: accounting
297  * is strict, actions are advisory and have some latency.
298  */
299 int tcp_memory_pressure;
300 
301 EXPORT_SYMBOL(tcp_memory_pressure);
302 
303 void tcp_enter_memory_pressure(void)
304 {
305 	if (!tcp_memory_pressure) {
306 		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307 		tcp_memory_pressure = 1;
308 	}
309 }
310 
311 EXPORT_SYMBOL(tcp_enter_memory_pressure);
312 
313 /*
314  *	Wait for a TCP event.
315  *
316  *	Note that we don't need to lock the socket, as the upper poll layers
317  *	take care of normal races (between the test and the event) and we don't
318  *	go look at any of the socket buffers directly.
319  */
320 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
321 {
322 	unsigned int mask;
323 	struct sock *sk = sock->sk;
324 	struct tcp_sock *tp = tcp_sk(sk);
325 
326 	poll_wait(file, sk->sk_sleep, wait);
327 	if (sk->sk_state == TCP_LISTEN)
328 		return inet_csk_listen_poll(sk);
329 
330 	/* Socket is not locked. We are protected from async events
331 	   by poll logic and correct handling of state changes
332 	   made by another threads is impossible in any case.
333 	 */
334 
335 	mask = 0;
336 	if (sk->sk_err)
337 		mask = POLLERR;
338 
339 	/*
340 	 * POLLHUP is certainly not done right. But poll() doesn't
341 	 * have a notion of HUP in just one direction, and for a
342 	 * socket the read side is more interesting.
343 	 *
344 	 * Some poll() documentation says that POLLHUP is incompatible
345 	 * with the POLLOUT/POLLWR flags, so somebody should check this
346 	 * all. But careful, it tends to be safer to return too many
347 	 * bits than too few, and you can easily break real applications
348 	 * if you don't tell them that something has hung up!
349 	 *
350 	 * Check-me.
351 	 *
352 	 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
353 	 * our fs/select.c). It means that after we received EOF,
354 	 * poll always returns immediately, making it impossible to poll() for
355 	 * write() in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
356 	 * if and only if shutdown has been made in both directions.
357 	 * Actually, it is interesting to look at how Solaris and DUX
358 	 * solve this dilemma. I would prefer it if POLLHUP were maskable,
359 	 * then we could set it on SND_SHUTDOWN. BTW the examples given
360 	 * in Stevens' books assume exactly this behaviour, which explains
361 	 * why POLLHUP is incompatible with POLLOUT.	--ANK
362 	 *
363 	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
364 	 * blocking on fresh not-connected or disconnected socket. --ANK
365 	 */
366 	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
367 		mask |= POLLHUP;
368 	if (sk->sk_shutdown & RCV_SHUTDOWN)
369 		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
370 
371 	/* Connected? */
372 	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
373 		/* Potential race condition. If the read of tp below is
374 		 * reordered above the read of sk->sk_state, we can be
375 		 * spuriously woken up in SYN_* states. */
376 		if ((tp->rcv_nxt != tp->copied_seq) &&
377 		    (tp->urg_seq != tp->copied_seq ||
378 		     tp->rcv_nxt != tp->copied_seq + 1 ||
379 		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
380 			mask |= POLLIN | POLLRDNORM;
381 
382 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
383 			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
384 				mask |= POLLOUT | POLLWRNORM;
385 			} else {  /* send SIGIO later */
386 				set_bit(SOCK_ASYNC_NOSPACE,
387 					&sk->sk_socket->flags);
388 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
389 
390 				/* Race breaker. If space is freed after
391 				 * wspace test but before the flags are set,
392 				 * IO signal will be lost.
393 				 */
394 				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
395 					mask |= POLLOUT | POLLWRNORM;
396 			}
397 		}
398 
399 		if (tp->urg_data & TCP_URG_VALID)
400 			mask |= POLLPRI;
401 	}
402 	return mask;
403 }
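
/*
 * Example: a userspace sketch (illustrative only) of consuming the mask
 * computed above. POLLRDHUP needs _GNU_SOURCE; POLLERR and POLLHUP are
 * reported even when not requested in .events.
 *
 *	#define _GNU_SOURCE
 *	#include <poll.h>
 *
 *	// 1 = readable, 0 = timeout, -1 = hangup or pending socket error.
 *	static int tcp_wait_readable(int fd, int timeout_ms)
 *	{
 *		struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDHUP };
 *		int n = poll(&pfd, 1, timeout_ms);
 *
 *		if (n <= 0)
 *			return n;
 *		return (pfd.revents & POLLIN) ? 1 : -1;
 *	}
 */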
404 
405 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
406 {
407 	struct tcp_sock *tp = tcp_sk(sk);
408 	int answ;
409 
410 	switch (cmd) {
411 	case SIOCINQ:
412 		if (sk->sk_state == TCP_LISTEN)
413 			return -EINVAL;
414 
415 		lock_sock(sk);
416 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
417 			answ = 0;
418 		else if (sock_flag(sk, SOCK_URGINLINE) ||
419 			 !tp->urg_data ||
420 			 before(tp->urg_seq, tp->copied_seq) ||
421 			 !before(tp->urg_seq, tp->rcv_nxt)) {
422 			answ = tp->rcv_nxt - tp->copied_seq;
423 
424 			/* Subtract 1, if FIN is in queue. */
425 			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
426 				answ -=
427 		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
428 		} else
429 			answ = tp->urg_seq - tp->copied_seq;
430 		release_sock(sk);
431 		break;
432 	case SIOCATMARK:
433 		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
434 		break;
435 	case SIOCOUTQ:
436 		if (sk->sk_state == TCP_LISTEN)
437 			return -EINVAL;
438 
439 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
440 			answ = 0;
441 		else
442 			answ = tp->write_seq - tp->snd_una;
443 		break;
444 	default:
445 		return -ENOIOCTLCMD;
446 	}
447 
448 	return put_user(answ, (int __user *)arg);
449 }
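
/*
 * Example: reading the same queue depths from userspace (illustrative
 * sketch). SIOCINQ reports the unread bytes in the receive queue and
 * SIOCOUTQ the bytes not yet acknowledged by the peer, matching the
 * calculations above.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	static int tcp_queue_depths(int fd, int *inq, int *outq)
 *	{
 *		if (ioctl(fd, SIOCINQ, inq) < 0)
 *			return -1;
 *		return ioctl(fd, SIOCOUTQ, outq);
 *	}
 */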
450 
451 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
452 {
453 	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
454 	tp->pushed_seq = tp->write_seq;
455 }
456 
457 static inline int forced_push(struct tcp_sock *tp)
458 {
459 	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
460 }
461 
462 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
463 			      struct sk_buff *skb)
464 {
465 	skb->csum = 0;
466 	TCP_SKB_CB(skb)->seq = tp->write_seq;
467 	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
468 	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
469 	TCP_SKB_CB(skb)->sacked = 0;
470 	skb_header_release(skb);
471 	__skb_queue_tail(&sk->sk_write_queue, skb);
472 	sk_charge_skb(sk, skb);
473 	if (!sk->sk_send_head)
474 		sk->sk_send_head = skb;
475 	if (tp->nonagle & TCP_NAGLE_PUSH)
476 		tp->nonagle &= ~TCP_NAGLE_PUSH;
477 }
478 
479 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
480 				struct sk_buff *skb)
481 {
482 	if (flags & MSG_OOB) {
483 		tp->urg_mode = 1;
484 		tp->snd_up = tp->write_seq;
485 		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
486 	}
487 }
488 
489 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
490 			    int mss_now, int nonagle)
491 {
492 	if (sk->sk_send_head) {
493 		struct sk_buff *skb = sk->sk_write_queue.prev;
494 		if (!(flags & MSG_MORE) || forced_push(tp))
495 			tcp_mark_push(tp, skb);
496 		tcp_mark_urg(tp, flags, skb);
497 		__tcp_push_pending_frames(sk, tp, mss_now,
498 					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
499 	}
500 }
501 
502 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
503 			 size_t psize, int flags)
504 {
505 	struct tcp_sock *tp = tcp_sk(sk);
506 	int mss_now, size_goal;
507 	int err;
508 	ssize_t copied;
509 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
510 
511 	/* Wait for a connection to finish. */
512 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
513 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
514 			goto out_err;
515 
516 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
517 
518 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
519 	size_goal = tp->xmit_size_goal;
520 	copied = 0;
521 
522 	err = -EPIPE;
523 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
524 		goto do_error;
525 
526 	while (psize > 0) {
527 		struct sk_buff *skb = sk->sk_write_queue.prev;
528 		struct page *page = pages[poffset / PAGE_SIZE];
529 		int copy, i, can_coalesce;
530 		int offset = poffset % PAGE_SIZE;
531 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
532 
533 		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
534 new_segment:
535 			if (!sk_stream_memory_free(sk))
536 				goto wait_for_sndbuf;
537 
538 			skb = sk_stream_alloc_pskb(sk, 0, 0,
539 						   sk->sk_allocation);
540 			if (!skb)
541 				goto wait_for_memory;
542 
543 			skb_entail(sk, tp, skb);
544 			copy = size_goal;
545 		}
546 
547 		if (copy > size)
548 			copy = size;
549 
550 		i = skb_shinfo(skb)->nr_frags;
551 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
552 		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
553 			tcp_mark_push(tp, skb);
554 			goto new_segment;
555 		}
556 		if (!sk_stream_wmem_schedule(sk, copy))
557 			goto wait_for_memory;
558 
559 		if (can_coalesce) {
560 			skb_shinfo(skb)->frags[i - 1].size += copy;
561 		} else {
562 			get_page(page);
563 			skb_fill_page_desc(skb, i, page, offset, copy);
564 		}
565 
566 		skb->len += copy;
567 		skb->data_len += copy;
568 		skb->truesize += copy;
569 		sk->sk_wmem_queued += copy;
570 		sk->sk_forward_alloc -= copy;
571 		skb->ip_summed = CHECKSUM_HW;
572 		tp->write_seq += copy;
573 		TCP_SKB_CB(skb)->end_seq += copy;
574 		skb_shinfo(skb)->gso_segs = 0;
575 
576 		if (!copied)
577 			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
578 
579 		copied += copy;
580 		poffset += copy;
581 		if (!(psize -= copy))
582 			goto out;
583 
584 		if (skb->len < mss_now || (flags & MSG_OOB))
585 			continue;
586 
587 		if (forced_push(tp)) {
588 			tcp_mark_push(tp, skb);
589 			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
590 		} else if (skb == sk->sk_send_head)
591 			tcp_push_one(sk, mss_now);
592 		continue;
593 
594 wait_for_sndbuf:
595 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
596 wait_for_memory:
597 		if (copied)
598 			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
599 
600 		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
601 			goto do_error;
602 
603 		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
604 		size_goal = tp->xmit_size_goal;
605 	}
606 
607 out:
608 	if (copied)
609 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
610 	return copied;
611 
612 do_error:
613 	if (copied)
614 		goto out;
615 out_err:
616 	return sk_stream_error(sk, flags, err);
617 }
618 
619 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
620 		     size_t size, int flags)
621 {
622 	ssize_t res;
623 	struct sock *sk = sock->sk;
624 
625 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
626 	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
627 		return sock_no_sendpage(sock, page, offset, size, flags);
628 
629 	lock_sock(sk);
630 	TCP_CHECK_TIMER(sk);
631 	res = do_tcp_sendpages(sk, &page, offset, size, flags);
632 	TCP_CHECK_TIMER(sk);
633 	release_sock(sk);
634 	return res;
635 }
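
/*
 * Example: tcp_sendpage() is what backs sendfile() on a TCP socket when the
 * route supports scatter-gather and hardware checksums; otherwise the
 * sock_no_sendpage() fallback above copies the data. An illustrative
 * userspace sketch:
 *
 *	#include <sys/sendfile.h>
 *	#include <sys/types.h>
 *
 *	static ssize_t send_whole_file(int sock_fd, int file_fd, size_t count)
 *	{
 *		off_t off = 0;
 *		size_t done = 0;
 *
 *		while (done < count) {
 *			ssize_t n = sendfile(sock_fd, file_fd, &off, count - done);
 *			if (n <= 0)
 *				return done ? (ssize_t)done : n;
 *			done += n;
 *		}
 *		return done;
 *	}
 */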
636 
637 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
638 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
639 
640 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
641 {
642 	int tmp = tp->mss_cache;
643 
644 	if (sk->sk_route_caps & NETIF_F_SG) {
645 		if (sk_can_gso(sk))
646 			tmp = 0;
647 		else {
648 			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
649 
650 			if (tmp >= pgbreak &&
651 			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
652 				tmp = pgbreak;
653 		}
654 	}
655 
656 	return tmp;
657 }
658 
659 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
660 		size_t size)
661 {
662 	struct iovec *iov;
663 	struct tcp_sock *tp = tcp_sk(sk);
664 	struct sk_buff *skb;
665 	int iovlen, flags;
666 	int mss_now, size_goal;
667 	int err, copied;
668 	long timeo;
669 
670 	lock_sock(sk);
671 	TCP_CHECK_TIMER(sk);
672 
673 	flags = msg->msg_flags;
674 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
675 
676 	/* Wait for a connection to finish. */
677 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
678 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
679 			goto out_err;
680 
681 	/* This should be in poll */
682 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
683 
684 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
685 	size_goal = tp->xmit_size_goal;
686 
687 	/* Ok commence sending. */
688 	iovlen = msg->msg_iovlen;
689 	iov = msg->msg_iov;
690 	copied = 0;
691 
692 	err = -EPIPE;
693 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
694 		goto do_error;
695 
696 	while (--iovlen >= 0) {
697 		int seglen = iov->iov_len;
698 		unsigned char __user *from = iov->iov_base;
699 
700 		iov++;
701 
702 		while (seglen > 0) {
703 			int copy;
704 
705 			skb = sk->sk_write_queue.prev;
706 
707 			if (!sk->sk_send_head ||
708 			    (copy = size_goal - skb->len) <= 0) {
709 
710 new_segment:
711 				/* Allocate new segment. If the interface is SG,
712 				 * allocate skb fitting to single page.
713 				 */
714 				if (!sk_stream_memory_free(sk))
715 					goto wait_for_sndbuf;
716 
717 				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
718 							   0, sk->sk_allocation);
719 				if (!skb)
720 					goto wait_for_memory;
721 
722 				/*
723 				 * Check whether we can use HW checksum.
724 				 */
725 				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
726 					skb->ip_summed = CHECKSUM_HW;
727 
728 				skb_entail(sk, tp, skb);
729 				copy = size_goal;
730 			}
731 
732 			/* Try to append data to the end of skb. */
733 			if (copy > seglen)
734 				copy = seglen;
735 
736 			/* Where to copy to? */
737 			if (skb_tailroom(skb) > 0) {
738 				/* We have some space in skb head. Superb! */
739 				if (copy > skb_tailroom(skb))
740 					copy = skb_tailroom(skb);
741 				if ((err = skb_add_data(skb, from, copy)) != 0)
742 					goto do_fault;
743 			} else {
744 				int merge = 0;
745 				int i = skb_shinfo(skb)->nr_frags;
746 				struct page *page = TCP_PAGE(sk);
747 				int off = TCP_OFF(sk);
748 
749 				if (skb_can_coalesce(skb, i, page, off) &&
750 				    off != PAGE_SIZE) {
751 					/* We can extend the last page
752 					 * fragment. */
753 					merge = 1;
754 				} else if (i == MAX_SKB_FRAGS ||
755 					   (!i &&
756 					   !(sk->sk_route_caps & NETIF_F_SG))) {
757 					/* Need to add new fragment and cannot
758 					 * do this because interface is non-SG,
759 					 * or because all the page slots are
760 					 * busy. */
761 					tcp_mark_push(tp, skb);
762 					goto new_segment;
763 				} else if (page) {
764 					if (off == PAGE_SIZE) {
765 						put_page(page);
766 						TCP_PAGE(sk) = page = NULL;
767 						off = 0;
768 					}
769 				} else
770 					off = 0;
771 
772 				if (copy > PAGE_SIZE - off)
773 					copy = PAGE_SIZE - off;
774 
775 				if (!sk_stream_wmem_schedule(sk, copy))
776 					goto wait_for_memory;
777 
778 				if (!page) {
779 					/* Allocate new cache page. */
780 					if (!(page = sk_stream_alloc_page(sk)))
781 						goto wait_for_memory;
782 				}
783 
784 				/* Time to copy data. We are close to
785 				 * the end! */
786 				err = skb_copy_to_page(sk, from, skb, page,
787 						       off, copy);
788 				if (err) {
789 					/* If this page was new, give it to the
790 					 * socket so it does not get leaked.
791 					 */
792 					if (!TCP_PAGE(sk)) {
793 						TCP_PAGE(sk) = page;
794 						TCP_OFF(sk) = 0;
795 					}
796 					goto do_error;
797 				}
798 
799 				/* Update the skb. */
800 				if (merge) {
801 					skb_shinfo(skb)->frags[i - 1].size +=
802 									copy;
803 				} else {
804 					skb_fill_page_desc(skb, i, page, off, copy);
805 					if (TCP_PAGE(sk)) {
806 						get_page(page);
807 					} else if (off + copy < PAGE_SIZE) {
808 						get_page(page);
809 						TCP_PAGE(sk) = page;
810 					}
811 				}
812 
813 				TCP_OFF(sk) = off + copy;
814 			}
815 
816 			if (!copied)
817 				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
818 
819 			tp->write_seq += copy;
820 			TCP_SKB_CB(skb)->end_seq += copy;
821 			skb_shinfo(skb)->gso_segs = 0;
822 
823 			from += copy;
824 			copied += copy;
825 			if ((seglen -= copy) == 0 && iovlen == 0)
826 				goto out;
827 
828 			if (skb->len < mss_now || (flags & MSG_OOB))
829 				continue;
830 
831 			if (forced_push(tp)) {
832 				tcp_mark_push(tp, skb);
833 				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
834 			} else if (skb == sk->sk_send_head)
835 				tcp_push_one(sk, mss_now);
836 			continue;
837 
838 wait_for_sndbuf:
839 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
840 wait_for_memory:
841 			if (copied)
842 				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
843 
844 			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
845 				goto do_error;
846 
847 			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
848 			size_goal = tp->xmit_size_goal;
849 		}
850 	}
851 
852 out:
853 	if (copied)
854 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
855 	TCP_CHECK_TIMER(sk);
856 	release_sock(sk);
857 	return copied;
858 
859 do_fault:
860 	if (!skb->len) {
861 		if (sk->sk_send_head == skb)
862 			sk->sk_send_head = NULL;
863 		__skb_unlink(skb, &sk->sk_write_queue);
864 		sk_stream_free_skb(sk, skb);
865 	}
866 
867 do_error:
868 	if (copied)
869 		goto out;
870 out_err:
871 	err = sk_stream_error(sk, flags, err);
872 	TCP_CHECK_TIMER(sk);
873 	release_sock(sk);
874 	return err;
875 }
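
/*
 * Example: the MSG_MORE handling above (it is treated like a temporary
 * TCP_CORK) lets an application glue a small header to its payload without
 * producing a separate segment for the header. Illustrative userspace
 * sketch:
 *
 *	#include <sys/socket.h>
 *
 *	static ssize_t send_framed(int fd, const void *hdr, size_t hlen,
 *				   const void *body, size_t blen)
 *	{
 *		if (send(fd, hdr, hlen, MSG_MORE) != (ssize_t)hlen)
 *			return -1;
 *		return send(fd, body, blen, 0);	// no MSG_MORE: frame is pushed
 *	}
 */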
876 
877 /*
878  *	Handle reading urgent data. BSD has very simple semantics for
879  *	this, no blocking and very strange errors 8)
880  */
881 
882 static int tcp_recv_urg(struct sock *sk, long timeo,
883 			struct msghdr *msg, int len, int flags,
884 			int *addr_len)
885 {
886 	struct tcp_sock *tp = tcp_sk(sk);
887 
888 	/* No URG data to read. */
889 	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
890 	    tp->urg_data == TCP_URG_READ)
891 		return -EINVAL;	/* Yes this is right ! */
892 
893 	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
894 		return -ENOTCONN;
895 
896 	if (tp->urg_data & TCP_URG_VALID) {
897 		int err = 0;
898 		char c = tp->urg_data;
899 
900 		if (!(flags & MSG_PEEK))
901 			tp->urg_data = TCP_URG_READ;
902 
903 		/* Read urgent data. */
904 		msg->msg_flags |= MSG_OOB;
905 
906 		if (len > 0) {
907 			if (!(flags & MSG_TRUNC))
908 				err = memcpy_toiovec(msg->msg_iov, &c, 1);
909 			len = 1;
910 		} else
911 			msg->msg_flags |= MSG_TRUNC;
912 
913 		return err ? -EFAULT : len;
914 	}
915 
916 	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
917 		return 0;
918 
919 	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
920 	 * the available implementations agree in this case:
921 	 * this call should never block, independent of the
922 	 * blocking state of the socket.
923 	 * Mike <pall@rz.uni-karlsruhe.de>
924 	 */
925 	return -EAGAIN;
926 }
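
/*
 * Example: the semantics above as seen from userspace (illustrative
 * sketch). recv(..., MSG_OOB) never blocks: it returns the urgent byte,
 * fails with EAGAIN when none is pending, and with EINVAL when
 * SO_OOBINLINE is set or the byte was already consumed.
 *
 *	#include <errno.h>
 *	#include <sys/socket.h>
 *
 *	static int read_urgent_byte(int fd, char *out)
 *	{
 *		ssize_t n = recv(fd, out, 1, MSG_OOB);
 *
 *		if (n == 1)
 *			return 1;	// got the urgent byte
 *		if (n < 0 && errno == EAGAIN)
 *			return 0;	// no urgent data pending
 *		return -1;
 *	}
 */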
927 
928 /* Clean up the receive buffer for full frames taken by the user,
929  * then send an ACK if necessary.  COPIED is the number of bytes
930  * tcp_recvmsg has given to the user so far, it speeds up the
931  * calculation of whether or not we must ACK for the sake of
932  * a window update.
933  */
934 void tcp_cleanup_rbuf(struct sock *sk, int copied)
935 {
936 	struct tcp_sock *tp = tcp_sk(sk);
937 	int time_to_ack = 0;
938 
939 #if TCP_DEBUG
940 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
941 
942 	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
943 #endif
944 
945 	if (inet_csk_ack_scheduled(sk)) {
946 		const struct inet_connection_sock *icsk = inet_csk(sk);
947 		   /* Delayed ACKs frequently hit locked sockets during bulk
948 		    * receive. */
949 		if (icsk->icsk_ack.blocked ||
950 		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
951 		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
952 		    /*
953 		     * If this read emptied the read buffer, we send an ACK if
954 		     * the connection is not bidirectional, the user has drained
955 		     * the receive buffer, and there was a small segment
956 		     * in the queue.
957 		     */
958 		    (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
959 		     !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
960 			time_to_ack = 1;
961 	}
962 
963 	/* We send an ACK if we can now advertise a non-zero window
964 	 * which has been raised "significantly".
965 	 *
966 	 * Even if window raised up to infinity, do not send window open ACK
967 	 * in states, where we will not receive more. It is useless.
968 	 */
969 	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
970 		__u32 rcv_window_now = tcp_receive_window(tp);
971 
972 		/* Optimize, __tcp_select_window() is not cheap. */
973 		if (2*rcv_window_now <= tp->window_clamp) {
974 			__u32 new_window = __tcp_select_window(sk);
975 
976 			/* Send ACK now, if this read freed lots of space
977 			 * in our buffer. Certainly, new_window is the new window.
978 			 * We can advertise it now, if it is not less than the current one.
979 			 * "Lots" means "at least twice" here.
980 			 */
981 			if (new_window && new_window >= 2 * rcv_window_now)
982 				time_to_ack = 1;
983 		}
984 	}
985 	if (time_to_ack)
986 		tcp_send_ack(sk);
987 }
988 
989 static void tcp_prequeue_process(struct sock *sk)
990 {
991 	struct sk_buff *skb;
992 	struct tcp_sock *tp = tcp_sk(sk);
993 
994 	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
995 
996 	/* RX process wants to run with disabled BHs, though it is not
997 	 * necessary */
998 	local_bh_disable();
999 	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1000 		sk->sk_backlog_rcv(sk, skb);
1001 	local_bh_enable();
1002 
1003 	/* Clear memory counter. */
1004 	tp->ucopy.memory = 0;
1005 }
1006 
1007 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1008 {
1009 	struct sk_buff *skb;
1010 	u32 offset;
1011 
1012 	skb_queue_walk(&sk->sk_receive_queue, skb) {
1013 		offset = seq - TCP_SKB_CB(skb)->seq;
1014 		if (skb->h.th->syn)
1015 			offset--;
1016 		if (offset < skb->len || skb->h.th->fin) {
1017 			*off = offset;
1018 			return skb;
1019 		}
1020 	}
1021 	return NULL;
1022 }
1023 
1024 /*
1025  * This routine provides an alternative to tcp_recvmsg() for routines
1026  * that would like to handle copying from skbuffs directly in 'sendfile'
1027  * fashion.
1028  * Note:
1029  *	- It is assumed that the socket was locked by the caller.
1030  *	- The routine does not block.
1031  *	- At present, there is no support for reading OOB data
1032  *	  or for 'peeking' the socket using this routine
1033  *	  (although both would be easy to implement).
1034  */
1035 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1036 		  sk_read_actor_t recv_actor)
1037 {
1038 	struct sk_buff *skb;
1039 	struct tcp_sock *tp = tcp_sk(sk);
1040 	u32 seq = tp->copied_seq;
1041 	u32 offset;
1042 	int copied = 0;
1043 
1044 	if (sk->sk_state == TCP_LISTEN)
1045 		return -ENOTCONN;
1046 	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1047 		if (offset < skb->len) {
1048 			size_t used, len;
1049 
1050 			len = skb->len - offset;
1051 			/* Stop reading if we hit a patch of urgent data */
1052 			if (tp->urg_data) {
1053 				u32 urg_offset = tp->urg_seq - seq;
1054 				if (urg_offset < len)
1055 					len = urg_offset;
1056 				if (!len)
1057 					break;
1058 			}
1059 			used = recv_actor(desc, skb, offset, len);
1060 			if (used <= len) {
1061 				seq += used;
1062 				copied += used;
1063 				offset += used;
1064 			}
1065 			if (offset != skb->len)
1066 				break;
1067 		}
1068 		if (skb->h.th->fin) {
1069 			sk_eat_skb(sk, skb, 0);
1070 			++seq;
1071 			break;
1072 		}
1073 		sk_eat_skb(sk, skb, 0);
1074 		if (!desc->count)
1075 			break;
1076 	}
1077 	tp->copied_seq = seq;
1078 
1079 	tcp_rcv_space_adjust(sk);
1080 
1081 	/* Clean up data we have read: This will do ACK frames. */
1082 	if (copied)
1083 		tcp_cleanup_rbuf(sk, copied);
1084 	return copied;
1085 }
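
/*
 * Example: a hedged sketch of a read actor usable with tcp_read_sock().
 * The callback name and bookkeeping are illustrative; only the
 * sk_read_actor_t signature and the desc->count/desc->written fields are
 * taken from the kernel API.
 *
 *	static int count_bytes_actor(read_descriptor_t *desc, struct sk_buff *skb,
 *				     unsigned int offset, size_t len)
 *	{
 *		size_t used = min_t(size_t, len, desc->count);
 *
 *		desc->count -= used;
 *		desc->written += used;
 *		return used;	// bytes consumed; the loop above advances seq
 *	}
 */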
1086 
1087 /*
1088  *	This routine copies from a sock struct into the user buffer.
1089  *
1090  *	Technical note: in 2.3 we work on _locked_ socket, so that
1091  *	tricks with *seq access order and skb->users are not required.
1092  *	Probably, code can be easily improved even more.
1093  */
1094 
1095 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1096 		size_t len, int nonblock, int flags, int *addr_len)
1097 {
1098 	struct tcp_sock *tp = tcp_sk(sk);
1099 	int copied = 0;
1100 	u32 peek_seq;
1101 	u32 *seq;
1102 	unsigned long used;
1103 	int err;
1104 	int target;		/* Read at least this many bytes */
1105 	long timeo;
1106 	struct task_struct *user_recv = NULL;
1107 	int copied_early = 0;
1108 
1109 	lock_sock(sk);
1110 
1111 	TCP_CHECK_TIMER(sk);
1112 
1113 	err = -ENOTCONN;
1114 	if (sk->sk_state == TCP_LISTEN)
1115 		goto out;
1116 
1117 	timeo = sock_rcvtimeo(sk, nonblock);
1118 
1119 	/* Urgent data needs to be handled specially. */
1120 	if (flags & MSG_OOB)
1121 		goto recv_urg;
1122 
1123 	seq = &tp->copied_seq;
1124 	if (flags & MSG_PEEK) {
1125 		peek_seq = tp->copied_seq;
1126 		seq = &peek_seq;
1127 	}
1128 
1129 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1130 
1131 #ifdef CONFIG_NET_DMA
1132 	tp->ucopy.dma_chan = NULL;
1133 	preempt_disable();
1134 	if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1135 	    !sysctl_tcp_low_latency && __get_cpu_var(softnet_data).net_dma) {
1136 		preempt_enable_no_resched();
1137 		tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
1138 	} else
1139 		preempt_enable_no_resched();
1140 #endif
1141 
1142 	do {
1143 		struct sk_buff *skb;
1144 		u32 offset;
1145 
1146 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1147 		if (tp->urg_data && tp->urg_seq == *seq) {
1148 			if (copied)
1149 				break;
1150 			if (signal_pending(current)) {
1151 				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1152 				break;
1153 			}
1154 		}
1155 
1156 		/* Next get a buffer. */
1157 
1158 		skb = skb_peek(&sk->sk_receive_queue);
1159 		do {
1160 			if (!skb)
1161 				break;
1162 
1163 			/* Now that we have two receive queues this
1164 			 * shouldn't happen.
1165 			 */
1166 			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1167 				printk(KERN_INFO "recvmsg bug: copied %X "
1168 				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1169 				break;
1170 			}
1171 			offset = *seq - TCP_SKB_CB(skb)->seq;
1172 			if (skb->h.th->syn)
1173 				offset--;
1174 			if (offset < skb->len)
1175 				goto found_ok_skb;
1176 			if (skb->h.th->fin)
1177 				goto found_fin_ok;
1178 			BUG_TRAP(flags & MSG_PEEK);
1179 			skb = skb->next;
1180 		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1181 
1182 		/* Well, if we have backlog, try to process it now. */
1183 
1184 		if (copied >= target && !sk->sk_backlog.tail)
1185 			break;
1186 
1187 		if (copied) {
1188 			if (sk->sk_err ||
1189 			    sk->sk_state == TCP_CLOSE ||
1190 			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1191 			    !timeo ||
1192 			    signal_pending(current) ||
1193 			    (flags & MSG_PEEK))
1194 				break;
1195 		} else {
1196 			if (sock_flag(sk, SOCK_DONE))
1197 				break;
1198 
1199 			if (sk->sk_err) {
1200 				copied = sock_error(sk);
1201 				break;
1202 			}
1203 
1204 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1205 				break;
1206 
1207 			if (sk->sk_state == TCP_CLOSE) {
1208 				if (!sock_flag(sk, SOCK_DONE)) {
1209 					/* This occurs when the user tries to read
1210 					 * from a socket that was never connected.
1211 					 */
1212 					copied = -ENOTCONN;
1213 					break;
1214 				}
1215 				break;
1216 			}
1217 
1218 			if (!timeo) {
1219 				copied = -EAGAIN;
1220 				break;
1221 			}
1222 
1223 			if (signal_pending(current)) {
1224 				copied = sock_intr_errno(timeo);
1225 				break;
1226 			}
1227 		}
1228 
1229 		tcp_cleanup_rbuf(sk, copied);
1230 
1231 		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1232 			/* Install new reader */
1233 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1234 				user_recv = current;
1235 				tp->ucopy.task = user_recv;
1236 				tp->ucopy.iov = msg->msg_iov;
1237 			}
1238 
1239 			tp->ucopy.len = len;
1240 
1241 			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1242 				 (flags & (MSG_PEEK | MSG_TRUNC)));
1243 
1244 			/* Ugly... If prequeue is not empty, we have to
1245 			 * process it before releasing socket, otherwise
1246 			 * order will be broken at second iteration.
1247 			 * More elegant solution is required!!!
1248 			 *
1249 			 * Look: we have the following (pseudo)queues:
1250 			 *
1251 			 * 1. packets in flight
1252 			 * 2. backlog
1253 			 * 3. prequeue
1254 			 * 4. receive_queue
1255 			 *
1256 			 * Each queue can be processed only if the next ones
1257 			 * are empty. At this point we have empty receive_queue.
1258 			 * But prequeue _can_ be not empty after 2nd iteration,
1259 			 * when we jumped to start of loop because backlog
1260 			 * processing added something to receive_queue.
1261 			 * We cannot release_sock(), because backlog contains
1262 			 * packets arrived _after_ prequeued ones.
1263 			 *
1264 			 * In short, the algorithm is clear --- process all
1265 			 * the queues in order. We could do it more directly,
1266 			 * requeueing packets from the backlog to the prequeue if
1267 			 * it is not empty. That is more elegant, but eats cycles,
1268 			 * unfortunately.
1269 			 */
1270 			if (!skb_queue_empty(&tp->ucopy.prequeue))
1271 				goto do_prequeue;
1272 
1273 			/* __ Set realtime policy in scheduler __ */
1274 		}
1275 
1276 		if (copied >= target) {
1277 			/* Do not sleep, just process backlog. */
1278 			release_sock(sk);
1279 			lock_sock(sk);
1280 		} else
1281 			sk_wait_data(sk, &timeo);
1282 
1283 #ifdef CONFIG_NET_DMA
1284 		tp->ucopy.wakeup = 0;
1285 #endif
1286 
1287 		if (user_recv) {
1288 			int chunk;
1289 
1290 			/* __ Restore normal policy in scheduler __ */
1291 
1292 			if ((chunk = len - tp->ucopy.len) != 0) {
1293 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1294 				len -= chunk;
1295 				copied += chunk;
1296 			}
1297 
1298 			if (tp->rcv_nxt == tp->copied_seq &&
1299 			    !skb_queue_empty(&tp->ucopy.prequeue)) {
1300 do_prequeue:
1301 				tcp_prequeue_process(sk);
1302 
1303 				if ((chunk = len - tp->ucopy.len) != 0) {
1304 					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1305 					len -= chunk;
1306 					copied += chunk;
1307 				}
1308 			}
1309 		}
1310 		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1311 			if (net_ratelimit())
1312 				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1313 				       current->comm, current->pid);
1314 			peek_seq = tp->copied_seq;
1315 		}
1316 		continue;
1317 
1318 	found_ok_skb:
1319 		/* Ok so how much can we use? */
1320 		used = skb->len - offset;
1321 		if (len < used)
1322 			used = len;
1323 
1324 		/* Do we have urgent data here? */
1325 		if (tp->urg_data) {
1326 			u32 urg_offset = tp->urg_seq - *seq;
1327 			if (urg_offset < used) {
1328 				if (!urg_offset) {
1329 					if (!sock_flag(sk, SOCK_URGINLINE)) {
1330 						++*seq;
1331 						offset++;
1332 						used--;
1333 						if (!used)
1334 							goto skip_copy;
1335 					}
1336 				} else
1337 					used = urg_offset;
1338 			}
1339 		}
1340 
1341 		if (!(flags & MSG_TRUNC)) {
1342 #ifdef CONFIG_NET_DMA
1343 			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1344 				tp->ucopy.dma_chan = get_softnet_dma();
1345 
1346 			if (tp->ucopy.dma_chan) {
1347 				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1348 					tp->ucopy.dma_chan, skb, offset,
1349 					msg->msg_iov, used,
1350 					tp->ucopy.pinned_list);
1351 
1352 				if (tp->ucopy.dma_cookie < 0) {
1353 
1354 					printk(KERN_ALERT "dma_cookie < 0\n");
1355 
1356 					/* Exception. Bailout! */
1357 					if (!copied)
1358 						copied = -EFAULT;
1359 					break;
1360 				}
1361 				if ((offset + used) == skb->len)
1362 					copied_early = 1;
1363 
1364 			} else
1365 #endif
1366 			{
1367 				err = skb_copy_datagram_iovec(skb, offset,
1368 						msg->msg_iov, used);
1369 				if (err) {
1370 					/* Exception. Bailout! */
1371 					if (!copied)
1372 						copied = -EFAULT;
1373 					break;
1374 				}
1375 			}
1376 		}
1377 
1378 		*seq += used;
1379 		copied += used;
1380 		len -= used;
1381 
1382 		tcp_rcv_space_adjust(sk);
1383 
1384 skip_copy:
1385 		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1386 			tp->urg_data = 0;
1387 			tcp_fast_path_check(sk, tp);
1388 		}
1389 		if (used + offset < skb->len)
1390 			continue;
1391 
1392 		if (skb->h.th->fin)
1393 			goto found_fin_ok;
1394 		if (!(flags & MSG_PEEK)) {
1395 			sk_eat_skb(sk, skb, copied_early);
1396 			copied_early = 0;
1397 		}
1398 		continue;
1399 
1400 	found_fin_ok:
1401 		/* Process the FIN. */
1402 		++*seq;
1403 		if (!(flags & MSG_PEEK)) {
1404 			sk_eat_skb(sk, skb, copied_early);
1405 			copied_early = 0;
1406 		}
1407 		break;
1408 	} while (len > 0);
1409 
1410 	if (user_recv) {
1411 		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1412 			int chunk;
1413 
1414 			tp->ucopy.len = copied > 0 ? len : 0;
1415 
1416 			tcp_prequeue_process(sk);
1417 
1418 			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1419 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1420 				len -= chunk;
1421 				copied += chunk;
1422 			}
1423 		}
1424 
1425 		tp->ucopy.task = NULL;
1426 		tp->ucopy.len = 0;
1427 	}
1428 
1429 #ifdef CONFIG_NET_DMA
1430 	if (tp->ucopy.dma_chan) {
1431 		struct sk_buff *skb;
1432 		dma_cookie_t done, used;
1433 
1434 		dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1435 
1436 		while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1437 		                                 tp->ucopy.dma_cookie, &done,
1438 		                                 &used) == DMA_IN_PROGRESS) {
1439 			/* do partial cleanup of sk_async_wait_queue */
1440 			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1441 			       (dma_async_is_complete(skb->dma_cookie, done,
1442 			                              used) == DMA_SUCCESS)) {
1443 				__skb_dequeue(&sk->sk_async_wait_queue);
1444 				kfree_skb(skb);
1445 			}
1446 		}
1447 
1448 		/* Safe to free early-copied skbs now */
1449 		__skb_queue_purge(&sk->sk_async_wait_queue);
1450 		dma_chan_put(tp->ucopy.dma_chan);
1451 		tp->ucopy.dma_chan = NULL;
1452 	}
1453 	if (tp->ucopy.pinned_list) {
1454 		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1455 		tp->ucopy.pinned_list = NULL;
1456 	}
1457 #endif
1458 
1459 	/* According to UNIX98, msg_name/msg_namelen are ignored
1460 	 * on a connected socket. I was just happy when I found this 8) --ANK
1461 	 */
1462 
1463 	/* Clean up data we have read: This will do ACK frames. */
1464 	tcp_cleanup_rbuf(sk, copied);
1465 
1466 	TCP_CHECK_TIMER(sk);
1467 	release_sock(sk);
1468 	return copied;
1469 
1470 out:
1471 	TCP_CHECK_TIMER(sk);
1472 	release_sock(sk);
1473 	return err;
1474 
1475 recv_urg:
1476 	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1477 	goto out;
1478 }
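
/*
 * Example: two common receive patterns that map onto the flags handled
 * above (illustrative userspace sketch). MSG_WAITALL raises the wakeup
 * target to the full length; MSG_PEEK copies without advancing copied_seq,
 * so the same bytes come back on the next ordinary recv().
 *
 *	#include <sys/socket.h>
 *
 *	static ssize_t recv_exact(int fd, void *buf, size_t len)
 *	{
 *		return recv(fd, buf, len, MSG_WAITALL);
 *	}
 *
 *	static ssize_t peek_header(int fd, void *buf, size_t len)
 *	{
 *		return recv(fd, buf, len, MSG_PEEK | MSG_DONTWAIT);
 *	}
 */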
1479 
1480 /*
1481  *	State processing on a close. This implements the state shift for
1482  *	sending our FIN frame. Note that we only send a FIN for some
1483  *	states. A shutdown() may have already sent the FIN, or we may be
1484  *	closed.
1485  */
1486 
1487 static const unsigned char new_state[16] = {
1488   /* current state:        new state:      action:	*/
1489   /* (Invalid)		*/ TCP_CLOSE,
1490   /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1491   /* TCP_SYN_SENT	*/ TCP_CLOSE,
1492   /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1493   /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1494   /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1495   /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1496   /* TCP_CLOSE		*/ TCP_CLOSE,
1497   /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1498   /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1499   /* TCP_LISTEN		*/ TCP_CLOSE,
1500   /* TCP_CLOSING	*/ TCP_CLOSING,
1501 };
1502 
1503 static int tcp_close_state(struct sock *sk)
1504 {
1505 	int next = (int)new_state[sk->sk_state];
1506 	int ns = next & TCP_STATE_MASK;
1507 
1508 	tcp_set_state(sk, ns);
1509 
1510 	return next & TCP_ACTION_FIN;
1511 }
1512 
1513 /*
1514  *	Shutdown the sending side of a connection. Much like close except
1515  *	that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1516  */
1517 
1518 void tcp_shutdown(struct sock *sk, int how)
1519 {
1520 	/*	We need to grab some memory, and put together a FIN,
1521 	 *	and then put it into the queue to be sent.
1522 	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1523 	 */
1524 	if (!(how & SEND_SHUTDOWN))
1525 		return;
1526 
1527 	/* If we've already sent a FIN, or it's a closed state, skip this. */
1528 	if ((1 << sk->sk_state) &
1529 	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1530 	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1531 		/* Clear out any half completed packets.  FIN if needed. */
1532 		if (tcp_close_state(sk))
1533 			tcp_send_fin(sk);
1534 	}
1535 }
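
/*
 * Example: the classic half-close built on the above (illustrative
 * userspace sketch): shutdown(SHUT_WR) queues our FIN, and reading
 * continues until the peer closes its side.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static ssize_t drain_after_half_close(int fd, char *buf, size_t len)
 *	{
 *		ssize_t n, total = 0;
 *
 *		if (shutdown(fd, SHUT_WR) < 0)
 *			return -1;
 *		while ((n = read(fd, buf, len)) > 0)
 *			total += n;	// process or discard the remaining data
 *		return n < 0 ? n : total;
 *	}
 */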
1536 
1537 void tcp_close(struct sock *sk, long timeout)
1538 {
1539 	struct sk_buff *skb;
1540 	int data_was_unread = 0;
1541 	int state;
1542 
1543 	lock_sock(sk);
1544 	sk->sk_shutdown = SHUTDOWN_MASK;
1545 
1546 	if (sk->sk_state == TCP_LISTEN) {
1547 		tcp_set_state(sk, TCP_CLOSE);
1548 
1549 		/* Special case. */
1550 		inet_csk_listen_stop(sk);
1551 
1552 		goto adjudge_to_death;
1553 	}
1554 
1555 	/*  We need to flush the recv. buffs.  We do this only on the
1556 	 *  descriptor close, not protocol-sourced closes, because the
1557 	 *  reader process may not have drained the data yet!
1558 	 */
1559 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1560 		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1561 			  skb->h.th->fin;
1562 		data_was_unread += len;
1563 		__kfree_skb(skb);
1564 	}
1565 
1566 	sk_stream_mem_reclaim(sk);
1567 
1568 	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1569 	 * 3.10, we send a RST here because data was lost.  To
1570 	 * witness the awful effects of the old behavior of always
1571 	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1572 	 * a bulk GET in an FTP client, suspend the process, wait
1573 	 * for the client to advertise a zero window, then kill -9
1574 	 * the FTP client, wheee...  Note: timeout is always zero
1575 	 * in such a case.
1576 	 */
1577 	if (data_was_unread) {
1578 		/* Unread data was tossed, zap the connection. */
1579 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1580 		tcp_set_state(sk, TCP_CLOSE);
1581 		tcp_send_active_reset(sk, GFP_KERNEL);
1582 	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1583 		/* Check zero linger _after_ checking for unread data. */
1584 		sk->sk_prot->disconnect(sk, 0);
1585 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1586 	} else if (tcp_close_state(sk)) {
1587 		/* We FIN if the application ate all the data before
1588 		 * zapping the connection.
1589 		 */
1590 
1591 		/* RED-PEN. Formally speaking, we have broken TCP state
1592 		 * machine. State transitions:
1593 		 *
1594 		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1595 		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1596 		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1597 		 *
1598 		 * are legal only when FIN has been sent (i.e. in window),
1599 		 * rather than queued out of window. Purists blame.
1600 		 *
1601 		 * F.e. "RFC state" is ESTABLISHED,
1602 		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1603 		 *
1604 		 * The visible deviations are that sometimes we enter the
1605 		 * time-wait state when it is not really required (harmless),
1606 		 * and do not send active resets when they are required by the
1607 		 * specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when they look
1608 		 * like CLOSING or LAST_ACK to Linux).
1609 		 * Probably, I missed some more holelets.
1610 		 * 						--ANK
1611 		 */
1612 		tcp_send_fin(sk);
1613 	}
1614 
1615 	sk_stream_wait_close(sk, timeout);
1616 
1617 adjudge_to_death:
1618 	state = sk->sk_state;
1619 	sock_hold(sk);
1620 	sock_orphan(sk);
1621 	atomic_inc(sk->sk_prot->orphan_count);
1622 
1623 	/* It is the last release_sock in its life. It will remove backlog. */
1624 	release_sock(sk);
1625 
1626 
1627 	/* Now socket is owned by kernel and we acquire BH lock
1628 	   to finish close. No need to check for user refs.
1629 	 */
1630 	local_bh_disable();
1631 	bh_lock_sock(sk);
1632 	BUG_TRAP(!sock_owned_by_user(sk));
1633 
1634 	/* Have we already been destroyed by a softirq or backlog? */
1635 	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
1636 		goto out;
1637 
1638 	/*	This is a (useful) BSD violation of the RFC. There is a
1639 	 *	problem with TCP as specified, in that the other end could
1640 	 *	keep a socket open forever with no application left at this end.
1641 	 *	We use a 3 minute timeout (about the same as BSD) then kill
1642 	 *	our end. If they send after that then tough - BUT: long enough
1643 	 *	that we won't make the old 4*rto = almost no time - whoops
1644 	 *	reset mistake.
1645 	 *
1646 	 *	Nope, it was not a mistake. It is really the desired behaviour
1647 	 *	f.e. on http servers, when such sockets are useless, but
1648 	 *	consume significant resources. Let's do it with special
1649 	 *	linger2	option.					--ANK
1650 	 */
1651 
1652 	if (sk->sk_state == TCP_FIN_WAIT2) {
1653 		struct tcp_sock *tp = tcp_sk(sk);
1654 		if (tp->linger2 < 0) {
1655 			tcp_set_state(sk, TCP_CLOSE);
1656 			tcp_send_active_reset(sk, GFP_ATOMIC);
1657 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1658 		} else {
1659 			const int tmo = tcp_fin_time(sk);
1660 
1661 			if (tmo > TCP_TIMEWAIT_LEN) {
1662 				inet_csk_reset_keepalive_timer(sk,
1663 						tmo - TCP_TIMEWAIT_LEN);
1664 			} else {
1665 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1666 				goto out;
1667 			}
1668 		}
1669 	}
1670 	if (sk->sk_state != TCP_CLOSE) {
1671 		sk_stream_mem_reclaim(sk);
1672 		if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1673 		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1674 		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1675 			if (net_ratelimit())
1676 				printk(KERN_INFO "TCP: too many orphaned "
1677 				       "sockets\n");
1678 			tcp_set_state(sk, TCP_CLOSE);
1679 			tcp_send_active_reset(sk, GFP_ATOMIC);
1680 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1681 		}
1682 	}
1683 
1684 	if (sk->sk_state == TCP_CLOSE)
1685 		inet_csk_destroy_sock(sk);
1686 	/* Otherwise, socket is reprieved until protocol close. */
1687 
1688 out:
1689 	bh_unlock_sock(sk);
1690 	local_bh_enable();
1691 	sock_put(sk);
1692 }
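
/*
 * Example: the zero-linger branch above (SO_LINGER with l_onoff=1,
 * l_linger=0) is how an application requests an abortive close, i.e. a RST
 * instead of the normal FIN handshake. Illustrative userspace sketch:
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static int abortive_close(int fd)
 *	{
 *		struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *
 *		if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg)) < 0)
 *			return -1;
 *		return close(fd);
 *	}
 */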
1693 
1694 /* These states need RST on ABORT according to RFC793 */
1695 
1696 static inline int tcp_need_reset(int state)
1697 {
1698 	return (1 << state) &
1699 	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1700 		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1701 }
1702 
1703 int tcp_disconnect(struct sock *sk, int flags)
1704 {
1705 	struct inet_sock *inet = inet_sk(sk);
1706 	struct inet_connection_sock *icsk = inet_csk(sk);
1707 	struct tcp_sock *tp = tcp_sk(sk);
1708 	int err = 0;
1709 	int old_state = sk->sk_state;
1710 
1711 	if (old_state != TCP_CLOSE)
1712 		tcp_set_state(sk, TCP_CLOSE);
1713 
1714 	/* ABORT function of RFC793 */
1715 	if (old_state == TCP_LISTEN) {
1716 		inet_csk_listen_stop(sk);
1717 	} else if (tcp_need_reset(old_state) ||
1718 		   (tp->snd_nxt != tp->write_seq &&
1719 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1720 		/* The last check adjusts for discrepancy of Linux wrt. RFC
1721 		 * states
1722 		 */
1723 		tcp_send_active_reset(sk, gfp_any());
1724 		sk->sk_err = ECONNRESET;
1725 	} else if (old_state == TCP_SYN_SENT)
1726 		sk->sk_err = ECONNRESET;
1727 
1728 	tcp_clear_xmit_timers(sk);
1729 	__skb_queue_purge(&sk->sk_receive_queue);
1730 	sk_stream_writequeue_purge(sk);
1731 	__skb_queue_purge(&tp->out_of_order_queue);
1732 #ifdef CONFIG_NET_DMA
1733 	__skb_queue_purge(&sk->sk_async_wait_queue);
1734 #endif
1735 
1736 	inet->dport = 0;
1737 
1738 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1739 		inet_reset_saddr(sk);
1740 
1741 	sk->sk_shutdown = 0;
1742 	sock_reset_flag(sk, SOCK_DONE);
1743 	tp->srtt = 0;
1744 	if ((tp->write_seq += tp->max_window + 2) == 0)
1745 		tp->write_seq = 1;
1746 	icsk->icsk_backoff = 0;
1747 	tp->snd_cwnd = 2;
1748 	icsk->icsk_probes_out = 0;
1749 	tp->packets_out = 0;
1750 	tp->snd_ssthresh = 0x7fffffff;
1751 	tp->snd_cwnd_cnt = 0;
1752 	tp->bytes_acked = 0;
1753 	tcp_set_ca_state(sk, TCP_CA_Open);
1754 	tcp_clear_retrans(tp);
1755 	inet_csk_delack_init(sk);
1756 	sk->sk_send_head = NULL;
1757 	tp->rx_opt.saw_tstamp = 0;
1758 	tcp_sack_reset(&tp->rx_opt);
1759 	__sk_dst_reset(sk);
1760 
1761 	BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1762 
1763 	sk->sk_error_report(sk);
1764 	return err;
1765 }
1766 
1767 /*
1768  *	Socket option code for TCP.
1769  */
1770 static int do_tcp_setsockopt(struct sock *sk, int level,
1771 		int optname, char __user *optval, int optlen)
1772 {
1773 	struct tcp_sock *tp = tcp_sk(sk);
1774 	struct inet_connection_sock *icsk = inet_csk(sk);
1775 	int val;
1776 	int err = 0;
1777 
1778 	/* This is a string value; all the others are ints. */
1779 	if (optname == TCP_CONGESTION) {
1780 		char name[TCP_CA_NAME_MAX];
1781 
1782 		if (optlen < 1)
1783 			return -EINVAL;
1784 
1785 		val = strncpy_from_user(name, optval,
1786 					min(TCP_CA_NAME_MAX-1, optlen));
1787 		if (val < 0)
1788 			return -EFAULT;
1789 		name[val] = 0;
1790 
1791 		lock_sock(sk);
1792 		err = tcp_set_congestion_control(sk, name);
1793 		release_sock(sk);
1794 		return err;
1795 	}
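	/* A minimal user-space sketch of the TCP_CONGESTION string option
	 * handled above; "cubic" is only an example and must name a
	 * congestion control algorithm actually available on the running
	 * kernel:
	 *
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
	 *		   "cubic", strlen("cubic"));
	 */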
1796 
1797 	if (optlen < sizeof(int))
1798 		return -EINVAL;
1799 
1800 	if (get_user(val, (int __user *)optval))
1801 		return -EFAULT;
1802 
1803 	lock_sock(sk);
1804 
1805 	switch (optname) {
1806 	case TCP_MAXSEG:
1807 		/* Values greater than interface MTU won't take effect. However,
1808 		 * at the point when this call is made we typically don't yet
1809 		 * know which interface is going to be used. */
1810 		if (val < 8 || val > MAX_TCP_WINDOW) {
1811 			err = -EINVAL;
1812 			break;
1813 		}
1814 		tp->rx_opt.user_mss = val;
1815 		break;
1816 
1817 	case TCP_NODELAY:
1818 		if (val) {
1819 			/* TCP_NODELAY is weaker than TCP_CORK, so that
1820 			 * this option on corked socket is remembered, but
1821 			 * this option on a corked socket is remembered, but
1822 			 *
1823 			 * However, when TCP_NODELAY is set we make
1824 			 * an explicit push, which overrides even TCP_CORK
1825 			 * for currently queued segments.
1826 			 */
1827 			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1828 			tcp_push_pending_frames(sk, tp);
1829 		} else {
1830 			tp->nonagle &= ~TCP_NAGLE_OFF;
1831 		}
1832 		break;
1833 
1834 	case TCP_CORK:
1835 		/* When set, non-full frames are always queued rather than sent.
1836 		 * Later the user clears this option and we transmit
1837 		 * any pending partial frames in the queue.  This is
1838 		 * meant to be used alongside sendfile() to get properly
1839 		 * filled frames when the user (for example) must write
1840 		 * out headers with a write() call first and then use
1841 		 * sendfile to send out the data parts.
1842 		 *
1843 		 * TCP_CORK can be set together with TCP_NODELAY and it is
1844 		 * stronger than TCP_NODELAY.
1845 		 */
1846 		if (val) {
1847 			tp->nonagle |= TCP_NAGLE_CORK;
1848 		} else {
1849 			tp->nonagle &= ~TCP_NAGLE_CORK;
1850 			if (tp->nonagle&TCP_NAGLE_OFF)
1851 				tp->nonagle |= TCP_NAGLE_PUSH;
1852 			tcp_push_pending_frames(sk, tp);
1853 		}
1854 		break;
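	/* A minimal user-space sketch of the cork/uncork pattern described
	 * above; identifiers such as fd, hdr and file_fd are illustrative:
	 *
	 *	int on = 1, off = 0;
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	 *	write(fd, hdr, hdr_len);
	 *	sendfile(fd, file_fd, NULL, file_len);
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
	 *
	 * Clearing TCP_CORK at the end pushes out the final partial frame.
	 */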
1855 
1856 	case TCP_KEEPIDLE:
1857 		if (val < 1 || val > MAX_TCP_KEEPIDLE)
1858 			err = -EINVAL;
1859 		else {
1860 			tp->keepalive_time = val * HZ;
1861 			if (sock_flag(sk, SOCK_KEEPOPEN) &&
1862 			    !((1 << sk->sk_state) &
1863 			      (TCPF_CLOSE | TCPF_LISTEN))) {
1864 				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1865 				if (tp->keepalive_time > elapsed)
1866 					elapsed = tp->keepalive_time - elapsed;
1867 				else
1868 					elapsed = 0;
1869 				inet_csk_reset_keepalive_timer(sk, elapsed);
1870 			}
1871 		}
1872 		break;
1873 	case TCP_KEEPINTVL:
1874 		if (val < 1 || val > MAX_TCP_KEEPINTVL)
1875 			err = -EINVAL;
1876 		else
1877 			tp->keepalive_intvl = val * HZ;
1878 		break;
1879 	case TCP_KEEPCNT:
1880 		if (val < 1 || val > MAX_TCP_KEEPCNT)
1881 			err = -EINVAL;
1882 		else
1883 			tp->keepalive_probes = val;
1884 		break;
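	/* A minimal user-space sketch of tuning keepalive with the three
	 * options above (the numbers are illustrative; SO_KEEPALIVE must be
	 * enabled for the probing to happen at all):
	 *
	 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
	 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
	 */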
1885 	case TCP_SYNCNT:
1886 		if (val < 1 || val > MAX_TCP_SYNCNT)
1887 			err = -EINVAL;
1888 		else
1889 			icsk->icsk_syn_retries = val;
1890 		break;
1891 
1892 	case TCP_LINGER2:
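		/* A negative value makes tcp_close() reset the connection
		 * instead of lingering in FIN_WAIT2; values above
		 * tcp_fin_timeout are stored as 0, which means "use the
		 * sysctl default".
		 */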
1893 		if (val < 0)
1894 			tp->linger2 = -1;
1895 		else if (val > sysctl_tcp_fin_timeout / HZ)
1896 			tp->linger2 = 0;
1897 		else
1898 			tp->linger2 = val * HZ;
1899 		break;
1900 
1901 	case TCP_DEFER_ACCEPT:
1902 		icsk->icsk_accept_queue.rskq_defer_accept = 0;
1903 		if (val > 0) {
1904 			/* Translate value in seconds to number of
1905 			 * retransmits */
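			/* The stored count is the smallest number of SYN-ACK
			 * retransmit intervals (TCP_TIMEOUT_INIT, doubled on
			 * each round) whose span covers roughly val seconds;
			 * do_tcp_getsockopt() converts it back the same way.
			 */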
1906 			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
1907 			       val > ((TCP_TIMEOUT_INIT / HZ) <<
1908 				       icsk->icsk_accept_queue.rskq_defer_accept))
1909 				icsk->icsk_accept_queue.rskq_defer_accept++;
1910 			icsk->icsk_accept_queue.rskq_defer_accept++;
1911 		}
1912 		break;
1913 
1914 	case TCP_WINDOW_CLAMP:
1915 		if (!val) {
1916 			if (sk->sk_state != TCP_CLOSE) {
1917 				err = -EINVAL;
1918 				break;
1919 			}
1920 			tp->window_clamp = 0;
1921 		} else
1922 			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
1923 						SOCK_MIN_RCVBUF / 2 : val;
1924 		break;
1925 
1926 	case TCP_QUICKACK:
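		/* Zero re-enables delayed ("pingpong") ACKs.  A non-zero
		 * value switches to quick ACKs and, if an ACK is already
		 * scheduled on an established connection, pushes it out
		 * immediately; when that happens and val is even, delayed
		 * ACKs are re-enabled afterwards, so only odd values keep
		 * quickack mode sticky.
		 */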
1927 		if (!val) {
1928 			icsk->icsk_ack.pingpong = 1;
1929 		} else {
1930 			icsk->icsk_ack.pingpong = 0;
1931 			if ((1 << sk->sk_state) &
1932 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1933 			    inet_csk_ack_scheduled(sk)) {
1934 				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1935 				tcp_cleanup_rbuf(sk, 1);
1936 				if (!(val & 1))
1937 					icsk->icsk_ack.pingpong = 1;
1938 			}
1939 		}
1940 		break;
1941 
1942 	default:
1943 		err = -ENOPROTOOPT;
1944 		break;
1945 	}
1946 	release_sock(sk);
1947 	return err;
1948 }
1949 
1950 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1951 		   int optlen)
1952 {
1953 	struct inet_connection_sock *icsk = inet_csk(sk);
1954 
1955 	if (level != SOL_TCP)
1956 		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
1957 						     optval, optlen);
1958 	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
1959 }
1960 
1961 #ifdef CONFIG_COMPAT
1962 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
1963 			  char __user *optval, int optlen)
1964 {
1965 	if (level != SOL_TCP)
1966 		return inet_csk_compat_setsockopt(sk, level, optname,
1967 						  optval, optlen);
1968 	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
1969 }
1970 
1971 EXPORT_SYMBOL(compat_tcp_setsockopt);
1972 #endif
1973 
1974 /* Return information about state of tcp endpoint in API format. */
1975 void tcp_get_info(struct sock *sk, struct tcp_info *info)
1976 {
1977 	struct tcp_sock *tp = tcp_sk(sk);
1978 	const struct inet_connection_sock *icsk = inet_csk(sk);
1979 	u32 now = tcp_time_stamp;
1980 
1981 	memset(info, 0, sizeof(*info));
1982 
1983 	info->tcpi_state = sk->sk_state;
1984 	info->tcpi_ca_state = icsk->icsk_ca_state;
1985 	info->tcpi_retransmits = icsk->icsk_retransmits;
1986 	info->tcpi_probes = icsk->icsk_probes_out;
1987 	info->tcpi_backoff = icsk->icsk_backoff;
1988 
1989 	if (tp->rx_opt.tstamp_ok)
1990 		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1991 	if (tp->rx_opt.sack_ok)
1992 		info->tcpi_options |= TCPI_OPT_SACK;
1993 	if (tp->rx_opt.wscale_ok) {
1994 		info->tcpi_options |= TCPI_OPT_WSCALE;
1995 		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
1996 		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
1997 	}
1998 
1999 	if (tp->ecn_flags&TCP_ECN_OK)
2000 		info->tcpi_options |= TCPI_OPT_ECN;
2001 
2002 	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2003 	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2004 	info->tcpi_snd_mss = tp->mss_cache;
2005 	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2006 
2007 	info->tcpi_unacked = tp->packets_out;
2008 	info->tcpi_sacked = tp->sacked_out;
2009 	info->tcpi_lost = tp->lost_out;
2010 	info->tcpi_retrans = tp->retrans_out;
2011 	info->tcpi_fackets = tp->fackets_out;
2012 
2013 	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2014 	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2015 	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2016 
2017 	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2018 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2019 	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2020 	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2021 	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2022 	info->tcpi_snd_cwnd = tp->snd_cwnd;
2023 	info->tcpi_advmss = tp->advmss;
2024 	info->tcpi_reordering = tp->reordering;
2025 
2026 	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2027 	info->tcpi_rcv_space = tp->rcvq_space.space;
2028 
2029 	info->tcpi_total_retrans = tp->total_retrans;
2030 }
2031 
2032 EXPORT_SYMBOL_GPL(tcp_get_info);
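/* A minimal user-space sketch of reading this structure through the TCP_INFO
 * getsockopt() handled below; error handling is omitted and the printed
 * fields are just examples:
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt %u us cwnd %u\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 */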
2033 
2034 static int do_tcp_getsockopt(struct sock *sk, int level,
2035 		int optname, char __user *optval, int __user *optlen)
2036 {
2037 	struct inet_connection_sock *icsk = inet_csk(sk);
2038 	struct tcp_sock *tp = tcp_sk(sk);
2039 	int val, len;
2040 
2041 	if (get_user(len, optlen))
2042 		return -EFAULT;
2043 
2044 	len = min_t(unsigned int, len, sizeof(int));
2045 
2046 	if (len < 0)
2047 		return -EINVAL;
2048 
2049 	switch (optname) {
2050 	case TCP_MAXSEG:
2051 		val = tp->mss_cache;
2052 		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2053 			val = tp->rx_opt.user_mss;
2054 		break;
2055 	case TCP_NODELAY:
2056 		val = !!(tp->nonagle&TCP_NAGLE_OFF);
2057 		break;
2058 	case TCP_CORK:
2059 		val = !!(tp->nonagle&TCP_NAGLE_CORK);
2060 		break;
2061 	case TCP_KEEPIDLE:
2062 		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2063 		break;
2064 	case TCP_KEEPINTVL:
2065 		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2066 		break;
2067 	case TCP_KEEPCNT:
2068 		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2069 		break;
2070 	case TCP_SYNCNT:
2071 		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2072 		break;
2073 	case TCP_LINGER2:
2074 		val = tp->linger2;
2075 		if (val >= 0)
2076 			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2077 		break;
2078 	case TCP_DEFER_ACCEPT:
2079 		val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
2080 			((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
2081 		break;
2082 	case TCP_WINDOW_CLAMP:
2083 		val = tp->window_clamp;
2084 		break;
2085 	case TCP_INFO: {
2086 		struct tcp_info info;
2087 
2088 		if (get_user(len, optlen))
2089 			return -EFAULT;
2090 
2091 		tcp_get_info(sk, &info);
2092 
2093 		len = min_t(unsigned int, len, sizeof(info));
2094 		if (put_user(len, optlen))
2095 			return -EFAULT;
2096 		if (copy_to_user(optval, &info, len))
2097 			return -EFAULT;
2098 		return 0;
2099 	}
2100 	case TCP_QUICKACK:
2101 		val = !icsk->icsk_ack.pingpong;
2102 		break;
2103 
2104 	case TCP_CONGESTION:
2105 		if (get_user(len, optlen))
2106 			return -EFAULT;
2107 		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2108 		if (put_user(len, optlen))
2109 			return -EFAULT;
2110 		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2111 			return -EFAULT;
2112 		return 0;
2113 	default:
2114 		return -ENOPROTOOPT;
2115 	}
2116 
2117 	if (put_user(len, optlen))
2118 		return -EFAULT;
2119 	if (copy_to_user(optval, &val, len))
2120 		return -EFAULT;
2121 	return 0;
2122 }
2123 
2124 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2125 		   int __user *optlen)
2126 {
2127 	struct inet_connection_sock *icsk = inet_csk(sk);
2128 
2129 	if (level != SOL_TCP)
2130 		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2131 						     optval, optlen);
2132 	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2133 }
2134 
2135 #ifdef CONFIG_COMPAT
2136 int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2137 			  char __user *optval, int __user *optlen)
2138 {
2139 	if (level != SOL_TCP)
2140 		return inet_csk_compat_getsockopt(sk, level, optname,
2141 						  optval, optlen);
2142 	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2143 }
2144 
2145 EXPORT_SYMBOL(compat_tcp_getsockopt);
2146 #endif
2147 
2148 struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
2149 {
2150 	struct sk_buff *segs = ERR_PTR(-EINVAL);
2151 	struct tcphdr *th;
2152 	unsigned int thlen;
2153 	unsigned int seq;
2154 	unsigned int delta;
2155 	unsigned int oldlen;
2156 	unsigned int len;
2157 
2158 	if (!pskb_may_pull(skb, sizeof(*th)))
2159 		goto out;
2160 
2161 	th = skb->h.th;
2162 	thlen = th->doff * 4;
2163 	if (thlen < sizeof(*th))
2164 		goto out;
2165 
2166 	if (!pskb_may_pull(skb, thlen))
2167 		goto out;
2168 
2169 	oldlen = (u16)~skb->len;
2170 	__skb_pull(skb, thlen);
2171 
2172 	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
2173 		/* Packet is from an untrusted source; reset gso_segs. */
2174 		int type = skb_shinfo(skb)->gso_type;
2175 		int mss;
2176 
2177 		if (unlikely(type &
2178 			     ~(SKB_GSO_TCPV4 |
2179 			       SKB_GSO_DODGY |
2180 			       SKB_GSO_TCP_ECN |
2181 			       SKB_GSO_TCPV6 |
2182 			       0) ||
2183 			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
2184 			goto out;
2185 
2186 		mss = skb_shinfo(skb)->gso_size;
2187 		skb_shinfo(skb)->gso_segs = (skb->len + mss - 1) / mss;
2188 
2189 		segs = NULL;
2190 		goto out;
2191 	}
2192 
2193 	segs = skb_segment(skb, features);
2194 	if (IS_ERR(segs))
2195 		goto out;
2196 
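	/* The GSO super-packet's checksum covers a pseudo-header built with
	 * the original TCP length (header plus payload).  oldlen is the
	 * ones' complement of that old length, so adding "delta" swaps the
	 * old length for the per-segment length (thlen + len), letting each
	 * segment's checksum be patched in place; a full recompute is only
	 * needed when the hardware cannot finish the checksum (ip_summed !=
	 * CHECKSUM_HW).
	 */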
2197 	len = skb_shinfo(skb)->gso_size;
2198 	delta = htonl(oldlen + (thlen + len));
2199 
2200 	skb = segs;
2201 	th = skb->h.th;
2202 	seq = ntohl(th->seq);
2203 
2204 	do {
2205 		th->fin = th->psh = 0;
2206 
2207 		th->check = ~csum_fold(th->check + delta);
2208 		if (skb->ip_summed != CHECKSUM_HW)
2209 			th->check = csum_fold(csum_partial(skb->h.raw, thlen,
2210 							   skb->csum));
2211 
2212 		seq += len;
2213 		skb = skb->next;
2214 		th = skb->h.th;
2215 
2216 		th->seq = htonl(seq);
2217 		th->cwr = 0;
2218 	} while (skb->next);
2219 
2220 	delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len);
2221 	th->check = ~csum_fold(th->check + delta);
2222 	if (skb->ip_summed != CHECKSUM_HW)
2223 		th->check = csum_fold(csum_partial(skb->h.raw, thlen,
2224 						   skb->csum));
2225 
2226 out:
2227 	return segs;
2228 }
2229 EXPORT_SYMBOL(tcp_tso_segment);
2230 
2231 extern void __skb_cb_too_small_for_tcp(int, int);
2232 extern struct tcp_congestion_ops tcp_reno;
2233 
2234 static __initdata unsigned long thash_entries;
2235 static int __init set_thash_entries(char *str)
2236 {
2237 	if (!str)
2238 		return 0;
2239 	thash_entries = simple_strtoul(str, &str, 0);
2240 	return 1;
2241 }
2242 __setup("thash_entries=", set_thash_entries);
2243 
2244 void __init tcp_init(void)
2245 {
2246 	struct sk_buff *skb = NULL;
2247 	unsigned long limit;
2248 	int order, i, max_share;
2249 
2250 	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2251 		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2252 					   sizeof(skb->cb));
2253 
2254 	tcp_hashinfo.bind_bucket_cachep =
2255 		kmem_cache_create("tcp_bind_bucket",
2256 				  sizeof(struct inet_bind_bucket), 0,
2257 				  SLAB_HWCACHE_ALIGN, NULL, NULL);
2258 	if (!tcp_hashinfo.bind_bucket_cachep)
2259 		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2260 
2261 	/* Size and allocate the main established and bind bucket
2262 	 * hash tables.
2263 	 *
2264 	 * The methodology is similar to that of the buffer cache.
2265 	 */
2266 	tcp_hashinfo.ehash =
2267 		alloc_large_system_hash("TCP established",
2268 					sizeof(struct inet_ehash_bucket),
2269 					thash_entries,
2270 					(num_physpages >= 128 * 1024) ?
2271 					13 : 15,
2272 					HASH_HIGHMEM,
2273 					&tcp_hashinfo.ehash_size,
2274 					NULL,
2275 					0);
2276 	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2277 	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2278 		rwlock_init(&tcp_hashinfo.ehash[i].lock);
2279 		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2280 	}
2281 
2282 	tcp_hashinfo.bhash =
2283 		alloc_large_system_hash("TCP bind",
2284 					sizeof(struct inet_bind_hashbucket),
2285 					tcp_hashinfo.ehash_size,
2286 					(num_physpages >= 128 * 1024) ?
2287 					13 : 15,
2288 					HASH_HIGHMEM,
2289 					&tcp_hashinfo.bhash_size,
2290 					NULL,
2291 					64 * 1024);
2292 	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2293 	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2294 		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2295 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2296 	}
2297 
2298 	/* Try to be a bit smarter and adjust defaults depending
2299 	 * on available memory.
2300 	 */
2301 	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2302 			(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2303 			order++)
2304 		;
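	/* "order" is now the smallest page order whose size covers the bind
	 * hash table; below it doubles as a crude measure of how much memory
	 * the machine has when picking default limits.
	 */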
2305 	if (order >= 4) {
2306 		sysctl_local_port_range[0] = 32768;
2307 		sysctl_local_port_range[1] = 61000;
2308 		tcp_death_row.sysctl_max_tw_buckets = 180000;
2309 		sysctl_tcp_max_orphans = 4096 << (order - 4);
2310 		sysctl_max_syn_backlog = 1024;
2311 	} else if (order < 3) {
2312 		sysctl_local_port_range[0] = 1024 * (3 - order);
2313 		tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2314 		sysctl_tcp_max_orphans >>= (3 - order);
2315 		sysctl_max_syn_backlog = 128;
2316 	}
2317 
2318 	sysctl_tcp_mem[0] =  768 << order;
2319 	sysctl_tcp_mem[1] = 1024 << order;
2320 	sysctl_tcp_mem[2] = 1536 << order;
2321 
2322 	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
2323 	max_share = min(4UL*1024*1024, limit);
2324 
2325 	sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
2326 	sysctl_tcp_wmem[1] = 16*1024;
2327 	sysctl_tcp_wmem[2] = max(64*1024, max_share);
2328 
2329 	sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
2330 	sysctl_tcp_rmem[1] = 87380;
2331 	sysctl_tcp_rmem[2] = max(87380, max_share);
2332 
2333 	printk(KERN_INFO "TCP: Hash tables configured "
2334 	       "(established %d bind %d)\n",
2335 	       tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2336 
2337 	tcp_register_congestion_control(&tcp_reno);
2338 }
2339 
2340 EXPORT_SYMBOL(tcp_close);
2341 EXPORT_SYMBOL(tcp_disconnect);
2342 EXPORT_SYMBOL(tcp_getsockopt);
2343 EXPORT_SYMBOL(tcp_ioctl);
2344 EXPORT_SYMBOL(tcp_poll);
2345 EXPORT_SYMBOL(tcp_read_sock);
2346 EXPORT_SYMBOL(tcp_recvmsg);
2347 EXPORT_SYMBOL(tcp_sendmsg);
2348 EXPORT_SYMBOL(tcp_sendpage);
2349 EXPORT_SYMBOL(tcp_setsockopt);
2350 EXPORT_SYMBOL(tcp_shutdown);
2351 EXPORT_SYMBOL(tcp_statistics);
2352