xref: /linux/net/ipv4/tcp.c (revision 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2)
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() calls
 *		Alan Cox	:	Set the ACK bit on a reset
 *		Alan Cox	:	Stopped it crashing if it closed while
 *					sk->inuse=1 and was trying to connect
 *					(tcp_err()).
 *		Alan Cox	:	All icmp error handling was broken;
 *					pointers passed were wrong and the
 *					socket was looked up backwards. Nobody
 *					tested any icmp error code, obviously.
 *		Alan Cox	:	tcp_err() now handled properly. It
 *					wakes people on errors. poll
 *					behaves and the icmp error race
 *					has gone by moving it into sock.c
 *		Alan Cox	:	tcp_send_reset() fixed to work for
 *					everything not just packets for
 *					unknown sockets.
 *		Alan Cox	:	tcp option processing.
 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
 *					syn rule wrong]
 *		Herp Rosmanith  :	More reset fixes
 *		Alan Cox	:	No longer acks invalid rst frames.
 *					Acking any kind of RST is right out.
 *		Alan Cox	:	Sets an ignore me flag on an rst
 *					receive, otherwise odd bits of prattle
 *					still escape
 *		Alan Cox	:	Fixed another acking RST frame bug.
 *					Should stop LAN workplace lockups.
 *		Alan Cox	: 	Some tidyups using the new skb list
 *					facilities
 *		Alan Cox	:	sk->keepopen now seems to work
 *		Alan Cox	:	Pulls options out correctly on accepts
 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
 *					bit to skb ops.
 *		Alan Cox	:	Tidied tcp_data to avoid a potential
 *					nasty.
 *		Alan Cox	:	Added some better commenting, as the
 *					tcp is hard to follow
 *		Alan Cox	:	Removed incorrect check for 20 * psh
 *	Michael O'Reilly	:	ack < copied bug fix.
 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
 *		Alan Cox	:	FIN with no memory -> CRASH
 *		Alan Cox	:	Added socket option proto entries.
 *					Also added awareness of them to accept.
 *		Alan Cox	:	Added TCP options (SOL_TCP)
 *		Alan Cox	:	Switched wakeup calls to callbacks,
 *					so the kernel can layer network
 *					sockets.
 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
 *		Alan Cox	:	Handle FIN (more) properly (we hope).
 *		Alan Cox	:	RST frames sent on unsynchronised
 *					state ack error.
 *		Alan Cox	:	Put in missing check for SYN bit.
 *		Alan Cox	:	Added tcp_select_window() aka NET2E
 *					window non-shrink trick.
 *		Alan Cox	:	Added a couple of small NET2E timer
 *					fixes
 *		Charles Hedrick :	TCP fixes
 *		Toomas Tamm	:	TCP window fixes
 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
 *		Charles Hedrick	:	Rewrote most of it to actually work
 *		Linus		:	Rewrote tcp_read() and URG handling
 *					completely
 *		Gerhard Koerting:	Fixed some missing timer handling
 *		Matthew Dillon  :	Reworked TCP machine states as per RFC
 *		Gerhard Koerting:	PC/TCP workarounds
 *		Adam Caldwell	:	Assorted timer/timing errors
 *		Matthew Dillon	:	Fixed another RST bug
 *		Alan Cox	:	Move to kernel side addressing changes.
 *		Alan Cox	:	Beginning work on TCP fastpathing
 *					(not yet usable)
 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
 *		Alan Cox	:	TCP fast path debugging
 *		Alan Cox	:	Window clamping
 *		Michael Riepe	:	Bug in tcp_check()
 *		Matt Dillon	:	More TCP improvements and RST bug fixes
 *		Matt Dillon	:	Yet more small nasties removed from the
 *					TCP code (Be very nice to this man if
 *					tcp finally works 100%) 8)
 *		Alan Cox	:	BSD accept semantics.
 *		Alan Cox	:	Reset on closedown bug.
 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
 *		Michael Pall	:	Handle poll() after URG properly in
 *					all cases.
 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
 *					(multi URG PUSH broke rlogin).
 *		Michael Pall	:	Fix the multi URG PUSH problem in
 *					tcp_readable(), poll() after URG
 *					works now.
 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
 *					BSD api.
 *		Alan Cox	:	Changed the semantics of sk->socket to
 *					fix a race and a signal problem with
 *					accept() and async I/O.
 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
 *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
 *					clients/servers which listen in on
 *					fixed ports.
 *		Alan Cox	:	Cleaned the above up and shrank it to
 *					a sensible code size.
 *		Alan Cox	:	Self connect lockup fix.
 *		Alan Cox	:	No connect to multicast.
 *		Ross Biro	:	Close unaccepted children on master
 *					socket close.
 *		Alan Cox	:	Reset tracing code.
 *		Alan Cox	:	Spurious resets on shutdown.
 *		Alan Cox	:	Giant 15 minute/60 second timer error
 *		Alan Cox	:	Small whoops in polling before an
 *					accept.
 *		Alan Cox	:	Kept the state trace facility since
 *					it's handy for debugging.
 *		Alan Cox	:	More reset handler fixes.
 *		Alan Cox	:	Started rewriting the code based on
 *					the RFCs. For other useful protocol
 *					references see: Comer, KA9Q NOS; and
 *					for a reference on the difference
 *					between specifications and how BSD
 *					works, see the 4.4lite source.
 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
 *					close.
 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *		Linus Torvalds	:	Fixed BSD port reuse to work on the
 *					first syn
 *		Alan Cox	:	Reimplemented timers as per the RFC
 *					and using multiple timers for sanity.
 *		Alan Cox	:	Small bug fixes, and a lot of new
 *					comments.
 *		Alan Cox	:	Fixed dual reader crash by locking
 *					the buffers (much like datagram.c)
 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
 *					now gets fed up with retrying without
 *					(even a no space) answer.
 *		Alan Cox	:	Extracted closing code better
 *		Alan Cox	:	Fixed the closing state machine to
 *					resemble the RFC.
 *		Alan Cox	:	More 'per spec' fixes.
 *		Jorge Cwik	:	Even faster checksumming.
 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH-only
 *					frames. At least one pc tcp stack
 *					generates them.
 *		Alan Cox	:	Cache last socket.
 *		Alan Cox	:	Per route irtt.
 *		Matt Day	:	poll()->select() match BSD precisely on error
 *		Alan Cox	:	New buffers
 *		Marc Tamsky	:	Various sk->prot->retransmits and
 *					sk->retransmits misupdating fixed.
 *					Fixed tcp_write_timeout: stuck close,
 *					and TCP syn retries gets used now.
 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 *					ack if state is TCP_CLOSED.
 *		Alan Cox	:	Look up device on a retransmit - routes may
 *					change. Doesn't yet cope with MSS shrink right
 *					but it's a start!
 *		Marc Tamsky	:	Closing in closing fixes.
 *		Mike Shaver	:	RFC1122 verifications.
 *		Alan Cox	:	rcv_saddr errors.
 *		Alan Cox	:	Block double connect().
 *		Alan Cox	:	Small hooks for enSKIP.
 *		Alexey Kuznetsov:	Path MTU discovery.
 *		Alan Cox	:	Support soft errors.
 *		Alan Cox	:	Fix MTU discovery pathological case
 *					when the remote claims no mtu!
 *		Marc Tamsky	:	TCP_CLOSE fix.
 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
 *					window but wrong (fixes NT lpd problems)
 *		Pedro Roque	:	Better TCP window handling, delayed ack.
 *		Joerg Reuter	:	No modification of locked buffers in
 *					tcp_do_retransmit()
 *		Eric Schenk	:	Changed receiver side silly window
 *					avoidance algorithm to BSD style
 *					algorithm. This doubles throughput
 *					against machines running Solaris,
 *					and seems to result in general
 *					improvement.
 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
 *	Willy Konynenberg	:	Transparent proxying support.
 *	Mike McLagan		:	Routing by source
 *		Keith Owens	:	Do proper merging with partial SKBs in
 *					tcp_do_sendmsg to avoid burstiness.
 *		Eric Schenk	:	Fix fast close down bug with
 *					shutdown() followed by close().
 *		Andi Kleen 	:	Make poll agree with SIGIO
 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
 *					lingertime == 0 (RFC 793 ABORT Call)
 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
 *					csum_and_copy_from_user() if possible.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK, causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */
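
/*
 * A note on a recurring idiom below: membership of sk->sk_state in a
 * set of states is tested as "(1 << sk->sk_state) & MASK", relying on
 * each TCPF_FOO flag being (1 << TCP_FOO).  A minimal sketch of the
 * idiom (the TCPF_FOO == 1 << TCP_FOO mapping is the assumption here,
 * not quoted from a header):
 *
 *	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 *		;	still inside the three-way handshake
 *	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
 *		;	any other state, i.e. "connected" for our purposes
 */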

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/smp_lock.h>
#include <linux/fs.h>
#include <linux/random.h>
#include <linux/bootmem.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>


#include <asm/uaccess.h>
#include <asm/ioctls.h>

int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;

DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);

kmem_cache_t *tcp_openreq_cachep;
kmem_cache_t *tcp_bucket_cachep;
kmem_cache_t *tcp_timewait_cachep;

atomic_t tcp_orphan_count = ATOMIC_INIT(0);

int sysctl_tcp_mem[3];
int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };

EXPORT_SYMBOL(sysctl_tcp_mem);
EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

atomic_t tcp_memory_allocated;	/* Current allocated memory. */
atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */

EXPORT_SYMBOL(tcp_memory_allocated);
EXPORT_SYMBOL(tcp_sockets_allocated);

/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non-atomically.
 * All of the sk_stream_mem_schedule() logic is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
int tcp_memory_pressure;

EXPORT_SYMBOL(tcp_memory_pressure);

void tcp_enter_memory_pressure(void)
{
	if (!tcp_memory_pressure) {
		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
		tcp_memory_pressure = 1;
	}
}

EXPORT_SYMBOL(tcp_enter_memory_pressure);

/*
 * LISTEN is a special case for poll.
 */
static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
					       poll_table *wait)
{
	return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
}

/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	struct tcp_sock *tp = tcp_sk(sk);

	poll_wait(file, sk->sk_sleep, wait);
	if (sk->sk_state == TCP_LISTEN)
		return tcp_listen_poll(sk, wait);

	/* Socket is not locked. We are protected from async events
	 * by the poll logic, and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */

	mask = 0;
	if (sk->sk_err)
		mask = POLLERR;

	/*
	 * POLLHUP is certainly not done right. But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting.
	 *
	 * Some poll() documentation says that POLLHUP is incompatible
	 * with the POLLOUT/POLLWR flags, so somebody should check this
	 * all. But careful, it tends to be safer to return too many
	 * bits than too few, and you can easily break real applications
	 * if you don't tell them that something has hung up!
	 *
	 * Check-me.
	 *
	 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
	 * our fs/select.c). It means that after we received EOF,
	 * poll always returns immediately, making it impossible to poll()
	 * for write() in state CLOSE_WAIT. One solution is evident --- to
	 * set POLLHUP if and only if shutdown has been made in both
	 * directions. Actually, it is interesting to look at how Solaris
	 * and DUX solve this dilemma. I would prefer it if POLLHUP were
	 * maskable; then we could set it on SND_SHUTDOWN. BTW the examples
	 * given in Stevens' books assume exactly this behaviour; it explains
	 * why POLLHUP is incompatible with POLLOUT.	--ANK
	 *
	 * NOTE. The check for TCP_CLOSE is added. The goal is to prevent
	 * blocking on a fresh not-connected or disconnected socket. --ANK
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM;

	/* Connected? */
	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		/* Potential race condition. If the read of tp below
		 * escapes above the read of sk->sk_state, we can be
		 * illegally awakened in SYN_* states. */
		if ((tp->rcv_nxt != tp->copied_seq) &&
		    (tp->urg_seq != tp->copied_seq ||
		     tp->rcv_nxt != tp->copied_seq + 1 ||
		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {  /* send SIGIO later */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * the wspace test but before the flags are
				 * set, the IO signal will be lost.
				 */
				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		}

		if (tp->urg_data & TCP_URG_VALID)
			mask |= POLLPRI;
	}
	return mask;
}
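
/*
 * From user space, the rules implemented above come out as in this
 * sketch (plain POSIX poll(); "fd" is a hypothetical connected TCP
 * socket):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLIN)
 *			;	data queued, or EOF via RCV_SHUTDOWN
 *		if (pfd.revents & POLLOUT)
 *			;	at least sk_stream_min_wspace() writable
 *		if (pfd.revents & POLLHUP)
 *			;	shutdown in both directions, or TCP_CLOSE
 *	}
 *
 * POLLHUP shows up even though it was never requested: as the comment
 * in tcp_poll() says, it is an unmaskable event.
 */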

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		lock_sock(sk);
		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else if (sock_flag(sk, SOCK_URGINLINE) ||
			 !tp->urg_data ||
			 before(tp->urg_seq, tp->copied_seq) ||
			 !before(tp->urg_seq, tp->rcv_nxt)) {
			answ = tp->rcv_nxt - tp->copied_seq;

			/* Subtract 1 if FIN is in the queue. */
			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
				answ -=
		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
		} else
			answ = tp->urg_seq - tp->copied_seq;
		release_sock(sk);
		break;
	case SIOCATMARK:
		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_una;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}
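
/*
 * A minimal user-space sketch of the queue ioctls handled above ("fd"
 * is a hypothetical connected TCP socket; on Linux SIOCINQ is the same
 * value as FIONREAD):
 *
 *	int inq, outq;
 *	if (ioctl(fd, SIOCINQ, &inq) == 0)
 *		;	inq = bytes readable now (a queued FIN not counted)
 *	if (ioctl(fd, SIOCOUTQ, &outq) == 0)
 *		;	outq = bytes sent but unacked (write_seq - snd_una)
 */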


int tcp_listen_start(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_listen_opt *lopt;

	sk->sk_max_ack_backlog = 0;
	sk->sk_ack_backlog = 0;
	tp->accept_queue = tp->accept_queue_tail = NULL;
	rwlock_init(&tp->syn_wait_lock);
	tcp_delack_init(tp);

	lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
	if (!lopt)
		return -ENOMEM;

	memset(lopt, 0, sizeof(struct tcp_listen_opt));
	for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
		if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
			break;
	get_random_bytes(&lopt->hash_rnd, 4);

	write_lock_bh(&tp->syn_wait_lock);
	tp->listen_opt = lopt;
	write_unlock_bh(&tp->syn_wait_lock);

	/* There is a race window here: we announce ourselves listening,
	 * but this transition is still not validated by get_port().
	 * It is OK, because this socket enters the hash table only
	 * after validation is complete.
	 */
	sk->sk_state = TCP_LISTEN;
	if (!sk->sk_prot->get_port(sk, inet->num)) {
		inet->sport = htons(inet->num);

		sk_dst_reset(sk);
		sk->sk_prot->hash(sk);

		return 0;
	}

	sk->sk_state = TCP_CLOSE;
	write_lock_bh(&tp->syn_wait_lock);
	tp->listen_opt = NULL;
	write_unlock_bh(&tp->syn_wait_lock);
	kfree(lopt);
	return -EADDRINUSE;
}
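
/*
 * The max_qlen_log loop above rounds sysctl_max_syn_backlog up to a
 * power of two and stores its base-2 logarithm, with a floor of 6.
 * Worked example: a backlog of 256 gives max_qlen_log = 8, since
 * 1 << 8 == 256 >= 256; any backlog of 64 or less gives the minimum
 * of 6 (1 << 6 == 64).
 */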

/*
 *	This routine closes sockets which have been at least partially
 *	opened, but not yet accepted.
 */

static void tcp_listen_stop(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	struct open_request *acc_req = tp->accept_queue;
	struct open_request *req;
	int i;

	tcp_delete_keepalive_timer(sk);

	/* make all the listen_opt local to us */
	write_lock_bh(&tp->syn_wait_lock);
	tp->listen_opt = NULL;
	write_unlock_bh(&tp->syn_wait_lock);
	tp->accept_queue = tp->accept_queue_tail = NULL;

	if (lopt->qlen) {
		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
			while ((req = lopt->syn_table[i]) != NULL) {
				lopt->syn_table[i] = req->dl_next;
				lopt->qlen--;
				tcp_openreq_free(req);

		/* Following the specs, it would be better either to send
		 * a FIN (and enter FIN-WAIT-1; it is a normal close)
		 * or to send an active reset (abort).
		 * Certainly, it is pretty dangerous during a synflood, but
		 * that is a bad justification for our negligence 8)
		 * To be honest, we are not able to implement either
		 * of the variants now.			--ANK
		 */
			}
		}
	}
	BUG_TRAP(!lopt->qlen);

	kfree(lopt);

	while ((req = acc_req) != NULL) {
		struct sock *child = req->sk;

		acc_req = req->dl_next;

		local_bh_disable();
		bh_lock_sock(child);
		BUG_TRAP(!sock_owned_by_user(child));
		sock_hold(child);

		tcp_disconnect(child, O_NONBLOCK);

		sock_orphan(child);

		atomic_inc(&tcp_orphan_count);

		tcp_destroy_sock(child);

		bh_unlock_sock(child);
		local_bh_enable();
		sock_put(child);

		sk_acceptq_removed(sk);
		tcp_openreq_fastfree(req);
	}
	BUG_TRAP(!sk->sk_ack_backlog);
}

static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline int forced_push(struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}
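
/*
 * forced_push() asks for an immediate PSH once more than half of the
 * largest window the peer has ever advertised sits between pushed_seq
 * and write_seq.  Worked example: with max_window == 32768, data is
 * force-pushed whenever write_seq runs more than 16384 bytes past
 * pushed_seq, i.e. at most every 16KB of queued stream.
 */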

static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
			      struct sk_buff *skb)
{
	skb->csum = 0;
	TCP_SKB_CB(skb)->seq = tp->write_seq;
	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
	TCP_SKB_CB(skb)->sacked = 0;
	skb_header_release(skb);
	__skb_queue_tail(&sk->sk_write_queue, skb);
	sk_charge_skb(sk, skb);
	if (!sk->sk_send_head)
		sk->sk_send_head = skb;
	else if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
				struct sk_buff *skb)
{
	if (flags & MSG_OOB) {
		tp->urg_mode = 1;
		tp->snd_up = tp->write_seq;
		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
	}
}

static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
			    int mss_now, int nonagle)
{
	if (sk->sk_send_head) {
		struct sk_buff *skb = sk->sk_write_queue.prev;
		if (!(flags & MSG_MORE) || forced_push(tp))
			tcp_mark_push(tp, skb);
		tcp_mark_urg(tp, flags, skb);
		__tcp_push_pending_frames(sk, tp, mss_now,
					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
	}
}

static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
			 size_t psize, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now;
	int err;
	ssize_t copied;
	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. */
	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_err;

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_current_mss(sk, !(flags & MSG_OOB));
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;

	while (psize > 0) {
		struct sk_buff *skb = sk->sk_write_queue.prev;
		struct page *page = pages[poffset / PAGE_SIZE];
		int copy, i, can_coalesce;
		int offset = poffset % PAGE_SIZE;
		int size = min_t(size_t, psize, PAGE_SIZE - offset);

		if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			skb = sk_stream_alloc_pskb(sk, 0, 0,
						   sk->sk_allocation);
			if (!skb)
				goto wait_for_memory;

			skb_entail(sk, tp, skb);
			copy = mss_now;
		}

		if (copy > size)
			copy = size;

		i = skb_shinfo(skb)->nr_frags;
		can_coalesce = skb_can_coalesce(skb, i, page, offset);
		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
			tcp_mark_push(tp, skb);
			goto new_segment;
		}
		if (sk->sk_forward_alloc < copy &&
		    !sk_stream_mem_schedule(sk, copy, 0))
			goto wait_for_memory;

		if (can_coalesce) {
			skb_shinfo(skb)->frags[i - 1].size += copy;
		} else {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, copy);
		}

		skb->len += copy;
		skb->data_len += copy;
		skb->truesize += copy;
		sk->sk_wmem_queued += copy;
		sk->sk_forward_alloc -= copy;
		skb->ip_summed = CHECKSUM_HW;
		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		skb_shinfo(skb)->tso_segs = 0;

		if (!copied)
			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

		copied += copy;
		poffset += copy;
		if (!(psize -= copy))
			goto out;

		if (skb->len != mss_now || (flags & MSG_OOB))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == sk->sk_send_head)
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		if (copied)
			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
			goto do_error;

		mss_now = tcp_current_mss(sk, !(flags & MSG_OOB));
	}

out:
	if (copied)
		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
	return copied;

do_error:
	if (copied)
		goto out;
out_err:
	return sk_stream_error(sk, flags, err);
}

ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
		     size_t size, int flags)
{
	ssize_t res;
	struct sock *sk = sock->sk;

#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)

	if (!(sk->sk_route_caps & NETIF_F_SG) ||
	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
		return sock_no_sendpage(sock, page, offset, size, flags);

#undef TCP_ZC_CSUM_FLAGS

	lock_sock(sk);
	TCP_CHECK_TIMER(sk);
	res = do_tcp_sendpages(sk, &page, offset, size, flags);
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return res;
}
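
/*
 * tcp_sendpage() is reached from user space via sendfile(2) on a TCP
 * socket; when the route lacks scatter-gather or checksum offload it
 * falls back to sock_no_sendpage(), i.e. the ordinary copying path.
 * A user-space sketch (hypothetical descriptors, standard Linux
 * sendfile(2)):
 *
 *	off_t off = 0;
 *	ssize_t n = sendfile(tcp_fd, file_fd, &off, count);
 *	...	up to "count" bytes queued without a user-space copy
 */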

#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
#define TCP_OFF(sk)	(sk->sk_sndmsg_off)

static inline int select_size(struct sock *sk, struct tcp_sock *tp)
{
	int tmp = tp->mss_cache_std;

	if (sk->sk_route_caps & NETIF_F_SG) {
		int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);

		if (tmp >= pgbreak &&
		    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
			tmp = pgbreak;
	}
	return tmp;
}

int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t size)
{
	struct iovec *iov;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int iovlen, flags;
	int mss_now;
	int err, copied;
	long timeo;

	lock_sock(sk);
	TCP_CHECK_TIMER(sk);

	flags = msg->msg_flags;
	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. */
	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_err;

	/* This should be in poll */
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_current_mss(sk, !(flags & MSG_OOB));

	/* Ok, commence sending. */
	iovlen = msg->msg_iovlen;
	iov = msg->msg_iov;
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;

	while (--iovlen >= 0) {
		int seglen = iov->iov_len;
		unsigned char __user *from = iov->iov_base;

		iov++;

		while (seglen > 0) {
			int copy;

			skb = sk->sk_write_queue.prev;

			if (!sk->sk_send_head ||
			    (copy = mss_now - skb->len) <= 0) {

new_segment:
				/* Allocate a new segment. If the interface
				 * is SG, allocate an skb that fits in a
				 * single page.
				 */
				if (!sk_stream_memory_free(sk))
					goto wait_for_sndbuf;

				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
							   0, sk->sk_allocation);
				if (!skb)
					goto wait_for_memory;

				/*
				 * Check whether we can use HW checksum.
				 */
				if (sk->sk_route_caps &
				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
				     NETIF_F_HW_CSUM))
					skb->ip_summed = CHECKSUM_HW;

				skb_entail(sk, tp, skb);
				copy = mss_now;
			}

			/* Try to append data to the end of the skb. */
			if (copy > seglen)
				copy = seglen;

			/* Where to copy to? */
			if (skb_tailroom(skb) > 0) {
				/* We have some space in the skb head. Superb! */
				if (copy > skb_tailroom(skb))
					copy = skb_tailroom(skb);
				if ((err = skb_add_data(skb, from, copy)) != 0)
					goto do_fault;
			} else {
				int merge = 0;
				int i = skb_shinfo(skb)->nr_frags;
				struct page *page = TCP_PAGE(sk);
				int off = TCP_OFF(sk);

				if (skb_can_coalesce(skb, i, page, off) &&
				    off != PAGE_SIZE) {
					/* We can extend the last page
					 * fragment. */
					merge = 1;
				} else if (i == MAX_SKB_FRAGS ||
					   (!i &&
					   !(sk->sk_route_caps & NETIF_F_SG))) {
					/* Need to add a new fragment and cannot
					 * do this because the interface is
					 * non-SG, or because all the page
					 * slots are busy. */
					tcp_mark_push(tp, skb);
					goto new_segment;
				} else if (page) {
					/* If the page is cached, align the
					 * offset to the L1 cache boundary.
					 */
					off = (off + L1_CACHE_BYTES - 1) &
					      ~(L1_CACHE_BYTES - 1);
					if (off == PAGE_SIZE) {
						put_page(page);
						TCP_PAGE(sk) = page = NULL;
					}
				}

				if (!page) {
					/* Allocate a new cache page. */
					if (!(page = sk_stream_alloc_page(sk)))
						goto wait_for_memory;
					off = 0;
				}

				if (copy > PAGE_SIZE - off)
					copy = PAGE_SIZE - off;

				/* Time to copy data. We are close to
				 * the end! */
				err = skb_copy_to_page(sk, from, skb, page,
						       off, copy);
				if (err) {
					/* If this page was new, give it to the
					 * socket so it does not get leaked.
					 */
					if (!TCP_PAGE(sk)) {
						TCP_PAGE(sk) = page;
						TCP_OFF(sk) = 0;
					}
					goto do_error;
				}

				/* Update the skb. */
				if (merge) {
					skb_shinfo(skb)->frags[i - 1].size +=
									copy;
				} else {
					skb_fill_page_desc(skb, i, page, off, copy);
					if (TCP_PAGE(sk)) {
						get_page(page);
					} else if (off + copy < PAGE_SIZE) {
						get_page(page);
						TCP_PAGE(sk) = page;
					}
				}

				TCP_OFF(sk) = off + copy;
			}

			if (!copied)
				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

			tp->write_seq += copy;
			TCP_SKB_CB(skb)->end_seq += copy;
			skb_shinfo(skb)->tso_segs = 0;

			from += copy;
			copied += copy;
			if ((seglen -= copy) == 0 && iovlen == 0)
				goto out;

			if (skb->len != mss_now || (flags & MSG_OOB))
				continue;

			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == sk->sk_send_head)
				tcp_push_one(sk, mss_now);
			continue;

wait_for_sndbuf:
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
			if (copied)
				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				goto do_error;

			mss_now = tcp_current_mss(sk, !(flags & MSG_OOB));
		}
	}

out:
	if (copied)
		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return copied;

do_fault:
	if (!skb->len) {
		if (sk->sk_send_head == skb)
			sk->sk_send_head = NULL;
		__skb_unlink(skb, skb->list);
		sk_stream_free_skb(sk, skb);
	}

do_error:
	if (copied)
		goto out;
out_err:
	err = sk_stream_error(sk, flags, err);
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return err;
}
1006*1da177e4SLinus Torvalds 
1007*1da177e4SLinus Torvalds /*
1008*1da177e4SLinus Torvalds  *	Handle reading urgent data. BSD has very simple semantics for
1009*1da177e4SLinus Torvalds  *	this: no blocking and very strange errors 8)
1010*1da177e4SLinus Torvalds  */
1011*1da177e4SLinus Torvalds 
1012*1da177e4SLinus Torvalds static int tcp_recv_urg(struct sock *sk, long timeo,
1013*1da177e4SLinus Torvalds 			struct msghdr *msg, int len, int flags,
1014*1da177e4SLinus Torvalds 			int *addr_len)
1015*1da177e4SLinus Torvalds {
1016*1da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
1017*1da177e4SLinus Torvalds 
1018*1da177e4SLinus Torvalds 	/* No URG data to read. */
1019*1da177e4SLinus Torvalds 	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1020*1da177e4SLinus Torvalds 	    tp->urg_data == TCP_URG_READ)
1021*1da177e4SLinus Torvalds 		return -EINVAL;	/* Yes, this is right! */
1022*1da177e4SLinus Torvalds 
1023*1da177e4SLinus Torvalds 	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1024*1da177e4SLinus Torvalds 		return -ENOTCONN;
1025*1da177e4SLinus Torvalds 
1026*1da177e4SLinus Torvalds 	if (tp->urg_data & TCP_URG_VALID) {
1027*1da177e4SLinus Torvalds 		int err = 0;
1028*1da177e4SLinus Torvalds 		char c = tp->urg_data;
1029*1da177e4SLinus Torvalds 
1030*1da177e4SLinus Torvalds 		if (!(flags & MSG_PEEK))
1031*1da177e4SLinus Torvalds 			tp->urg_data = TCP_URG_READ;
1032*1da177e4SLinus Torvalds 
1033*1da177e4SLinus Torvalds 		/* Read urgent data. */
1034*1da177e4SLinus Torvalds 		msg->msg_flags |= MSG_OOB;
1035*1da177e4SLinus Torvalds 
1036*1da177e4SLinus Torvalds 		if (len > 0) {
1037*1da177e4SLinus Torvalds 			if (!(flags & MSG_TRUNC))
1038*1da177e4SLinus Torvalds 				err = memcpy_toiovec(msg->msg_iov, &c, 1);
1039*1da177e4SLinus Torvalds 			len = 1;
1040*1da177e4SLinus Torvalds 		} else
1041*1da177e4SLinus Torvalds 			msg->msg_flags |= MSG_TRUNC;
1042*1da177e4SLinus Torvalds 
1043*1da177e4SLinus Torvalds 		return err ? -EFAULT : len;
1044*1da177e4SLinus Torvalds 	}
1045*1da177e4SLinus Torvalds 
1046*1da177e4SLinus Torvalds 	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1047*1da177e4SLinus Torvalds 		return 0;
1048*1da177e4SLinus Torvalds 
1049*1da177e4SLinus Torvalds 	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1050*1da177e4SLinus Torvalds 	 * the available implementations agree in this case:
1051*1da177e4SLinus Torvalds 	 * this call should never block, independent of the
1052*1da177e4SLinus Torvalds 	 * blocking state of the socket.
1053*1da177e4SLinus Torvalds 	 * Mike <pall@rz.uni-karlsruhe.de>
1054*1da177e4SLinus Torvalds 	 */
1055*1da177e4SLinus Torvalds 	return -EAGAIN;
1056*1da177e4SLinus Torvalds }
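
/*
 * Editorial note: a minimal userspace sketch of the semantics above,
 * assuming a connected TCP socket fd; illustration only, not kernel
 * code. recv(..., MSG_OOB) never blocks here: it returns the single
 * urgent byte, or fails with EINVAL (no urgent data pending, urgent
 * data already read, or SO_OOBINLINE set) or EAGAIN (urgent data
 * announced but not yet arrived).
 */
#if 0	/* example only, never compiled */
#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>

static void read_oob_byte(int fd)
{
	char c;
	ssize_t n = recv(fd, &c, 1, MSG_OOB);	/* lands in tcp_recv_urg() */

	if (n == 1)
		printf("urgent byte: %c\n", c);
	else if (n < 0 && errno == EAGAIN)
		printf("urgent data announced, not readable yet\n");
	else if (n < 0 && errno == EINVAL)
		printf("no urgent data (or SO_OOBINLINE is set)\n");
}
#endif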
1057*1da177e4SLinus Torvalds 
1058*1da177e4SLinus Torvalds /* Clean up the receive buffer for full frames taken by the user,
1059*1da177e4SLinus Torvalds  * then send an ACK if necessary.  COPIED is the number of bytes
1060*1da177e4SLinus Torvalds  * tcp_recvmsg has given to the user so far, it speeds up the
1061*1da177e4SLinus Torvalds  * calculation of whether or not we must ACK for the sake of
1062*1da177e4SLinus Torvalds  * a window update.
1063*1da177e4SLinus Torvalds  */
1064*1da177e4SLinus Torvalds static void cleanup_rbuf(struct sock *sk, int copied)
1065*1da177e4SLinus Torvalds {
1066*1da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
1067*1da177e4SLinus Torvalds 	int time_to_ack = 0;
1068*1da177e4SLinus Torvalds 
1069*1da177e4SLinus Torvalds #if TCP_DEBUG
1070*1da177e4SLinus Torvalds 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1071*1da177e4SLinus Torvalds 
1072*1da177e4SLinus Torvalds 	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1073*1da177e4SLinus Torvalds #endif
1074*1da177e4SLinus Torvalds 
1075*1da177e4SLinus Torvalds 	if (tcp_ack_scheduled(tp)) {
1076*1da177e4SLinus Torvalds 		/* Delayed ACKs frequently hit locked sockets during bulk
1077*1da177e4SLinus Torvalds 		 * receive. */
1078*1da177e4SLinus Torvalds 		if (tp->ack.blocked ||
1079*1da177e4SLinus Torvalds 		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1080*1da177e4SLinus Torvalds 		    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1081*1da177e4SLinus Torvalds 		    /*
1082*1da177e4SLinus Torvalds 		     * If this read emptied the receive buffer, we send an
1083*1da177e4SLinus Torvalds 		     * ACK when the connection is not bidirectional: the
1084*1da177e4SLinus Torvalds 		     * user drained the receive buffer and there was a
1085*1da177e4SLinus Torvalds 		     * small pushed segment left in the queue.
1086*1da177e4SLinus Torvalds 		     */
1087*1da177e4SLinus Torvalds 		    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1088*1da177e4SLinus Torvalds 		     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1089*1da177e4SLinus Torvalds 			time_to_ack = 1;
1090*1da177e4SLinus Torvalds 	}
1091*1da177e4SLinus Torvalds 
1092*1da177e4SLinus Torvalds 	/* We send an ACK if we can now advertise a non-zero window
1093*1da177e4SLinus Torvalds 	 * which has been raised "significantly".
1094*1da177e4SLinus Torvalds 	 *
1095*1da177e4SLinus Torvalds 	 * Even if the window is raised up to infinity, do not send a window-
1096*1da177e4SLinus Torvalds 	 * opening ACK in states where we will not receive more. It is useless.
1097*1da177e4SLinus Torvalds 	 */
1098*1da177e4SLinus Torvalds 	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1099*1da177e4SLinus Torvalds 		__u32 rcv_window_now = tcp_receive_window(tp);
1100*1da177e4SLinus Torvalds 
1101*1da177e4SLinus Torvalds 		/* Optimize, __tcp_select_window() is not cheap. */
1102*1da177e4SLinus Torvalds 		if (2*rcv_window_now <= tp->window_clamp) {
1103*1da177e4SLinus Torvalds 			__u32 new_window = __tcp_select_window(sk);
1104*1da177e4SLinus Torvalds 
1105*1da177e4SLinus Torvalds 			/* Send an ACK now if this read freed lots of space
1106*1da177e4SLinus Torvalds 			 * in our buffer. We can advertise the new window now
1107*1da177e4SLinus Torvalds 			 * if it is not smaller than the current one.
1108*1da177e4SLinus Torvalds 			 * "Lots" means "at least twice" here.
1109*1da177e4SLinus Torvalds 			 */
1110*1da177e4SLinus Torvalds 			if (new_window && new_window >= 2 * rcv_window_now)
1111*1da177e4SLinus Torvalds 				time_to_ack = 1;
1112*1da177e4SLinus Torvalds 		}
1113*1da177e4SLinus Torvalds 	}
1114*1da177e4SLinus Torvalds 	if (time_to_ack)
1115*1da177e4SLinus Torvalds 		tcp_send_ack(sk);
1116*1da177e4SLinus Torvalds }
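
/*
 * Editorial note: the window-update heuristic above in isolation, as
 * a sketch with made-up numbers. With window_clamp = 64K and a
 * currently advertised window of 16K, 2 * 16K <= 64K, so
 * __tcp_select_window() is consulted; a new window of 40K satisfies
 * 40K >= 2 * 16K and triggers an immediate window-update ACK.
 */
#if 0	/* example only, never compiled */
static int window_update_ack_needed(__u32 rcv_window_now,
				    __u32 window_clamp, __u32 new_window)
{
	/* Cheap filter first: only bother below half the clamp. */
	if (2 * rcv_window_now > window_clamp)
		return 0;
	/* "Lots" of freed space means "at least twice" the old window. */
	return new_window && new_window >= 2 * rcv_window_now;
}
#endif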
1117*1da177e4SLinus Torvalds 
1118*1da177e4SLinus Torvalds static void tcp_prequeue_process(struct sock *sk)
1119*1da177e4SLinus Torvalds {
1120*1da177e4SLinus Torvalds 	struct sk_buff *skb;
1121*1da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
1122*1da177e4SLinus Torvalds 
1123*1da177e4SLinus Torvalds 	NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1124*1da177e4SLinus Torvalds 
1125*1da177e4SLinus Torvalds 	/* The RX path expects to run with BHs disabled, though it is
1126*1da177e4SLinus Torvalds 	 * not strictly necessary here. */
1127*1da177e4SLinus Torvalds 	local_bh_disable();
1128*1da177e4SLinus Torvalds 	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1129*1da177e4SLinus Torvalds 		sk->sk_backlog_rcv(sk, skb);
1130*1da177e4SLinus Torvalds 	local_bh_enable();
1131*1da177e4SLinus Torvalds 
1132*1da177e4SLinus Torvalds 	/* Clear memory counter. */
1133*1da177e4SLinus Torvalds 	tp->ucopy.memory = 0;
1134*1da177e4SLinus Torvalds }
1135*1da177e4SLinus Torvalds 
1136*1da177e4SLinus Torvalds static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1137*1da177e4SLinus Torvalds {
1138*1da177e4SLinus Torvalds 	struct sk_buff *skb;
1139*1da177e4SLinus Torvalds 	u32 offset;
1140*1da177e4SLinus Torvalds 
1141*1da177e4SLinus Torvalds 	skb_queue_walk(&sk->sk_receive_queue, skb) {
1142*1da177e4SLinus Torvalds 		offset = seq - TCP_SKB_CB(skb)->seq;
1143*1da177e4SLinus Torvalds 		if (skb->h.th->syn)
1144*1da177e4SLinus Torvalds 			offset--;
1145*1da177e4SLinus Torvalds 		if (offset < skb->len || skb->h.th->fin) {
1146*1da177e4SLinus Torvalds 			*off = offset;
1147*1da177e4SLinus Torvalds 			return skb;
1148*1da177e4SLinus Torvalds 		}
1149*1da177e4SLinus Torvalds 	}
1150*1da177e4SLinus Torvalds 	return NULL;
1151*1da177e4SLinus Torvalds }
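
/*
 * Editorial worked example for the offset computation above: if a
 * segment carries sequence numbers 1000..1999 and the reader has
 * consumed up to 1500, offset = 1500 - 1000 = 500, i.e. 500 bytes of
 * this skb remain. A SYN consumes one sequence number while carrying
 * no skb data, hence the offset-- correction; a FIN keeps the skb
 * eligible even when all of its payload has been read.
 */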
1152*1da177e4SLinus Torvalds 
1153*1da177e4SLinus Torvalds /*
1154*1da177e4SLinus Torvalds  * This routine provides an alternative to tcp_recvmsg() for routines
1155*1da177e4SLinus Torvalds  * that would like to handle copying from skbuffs directly in 'sendfile'
1156*1da177e4SLinus Torvalds  * fashion.
1157*1da177e4SLinus Torvalds  * Note:
1158*1da177e4SLinus Torvalds  *	- It is assumed that the socket was locked by the caller.
1159*1da177e4SLinus Torvalds  *	- The routine does not block.
1160*1da177e4SLinus Torvalds  *	- At present, there is no support for reading OOB data
1161*1da177e4SLinus Torvalds  *	  or for 'peeking' the socket using this routine
1162*1da177e4SLinus Torvalds  *	  (although both would be easy to implement).
1163*1da177e4SLinus Torvalds  */
1164*1da177e4SLinus Torvalds int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1165*1da177e4SLinus Torvalds 		  sk_read_actor_t recv_actor)
1166*1da177e4SLinus Torvalds {
1167*1da177e4SLinus Torvalds 	struct sk_buff *skb;
1168*1da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
1169*1da177e4SLinus Torvalds 	u32 seq = tp->copied_seq;
1170*1da177e4SLinus Torvalds 	u32 offset;
1171*1da177e4SLinus Torvalds 	int copied = 0;
1172*1da177e4SLinus Torvalds 
1173*1da177e4SLinus Torvalds 	if (sk->sk_state == TCP_LISTEN)
1174*1da177e4SLinus Torvalds 		return -ENOTCONN;
1175*1da177e4SLinus Torvalds 	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1176*1da177e4SLinus Torvalds 		if (offset < skb->len) {
1177*1da177e4SLinus Torvalds 			size_t used, len;
1178*1da177e4SLinus Torvalds 
1179*1da177e4SLinus Torvalds 			len = skb->len - offset;
1180*1da177e4SLinus Torvalds 			/* Stop reading if we hit a patch of urgent data */
1181*1da177e4SLinus Torvalds 			if (tp->urg_data) {
1182*1da177e4SLinus Torvalds 				u32 urg_offset = tp->urg_seq - seq;
1183*1da177e4SLinus Torvalds 				if (urg_offset < len)
1184*1da177e4SLinus Torvalds 					len = urg_offset;
1185*1da177e4SLinus Torvalds 				if (!len)
1186*1da177e4SLinus Torvalds 					break;
1187*1da177e4SLinus Torvalds 			}
1188*1da177e4SLinus Torvalds 			used = recv_actor(desc, skb, offset, len);
1189*1da177e4SLinus Torvalds 			if (used <= len) {
1190*1da177e4SLinus Torvalds 				seq += used;
1191*1da177e4SLinus Torvalds 				copied += used;
1192*1da177e4SLinus Torvalds 				offset += used;
1193*1da177e4SLinus Torvalds 			}
1194*1da177e4SLinus Torvalds 			if (offset != skb->len)
1195*1da177e4SLinus Torvalds 				break;
1196*1da177e4SLinus Torvalds 		}
1197*1da177e4SLinus Torvalds 		if (skb->h.th->fin) {
1198*1da177e4SLinus Torvalds 			sk_eat_skb(sk, skb);
1199*1da177e4SLinus Torvalds 			++seq;
1200*1da177e4SLinus Torvalds 			break;
1201*1da177e4SLinus Torvalds 		}
1202*1da177e4SLinus Torvalds 		sk_eat_skb(sk, skb);
1203*1da177e4SLinus Torvalds 		if (!desc->count)
1204*1da177e4SLinus Torvalds 			break;
1205*1da177e4SLinus Torvalds 	}
1206*1da177e4SLinus Torvalds 	tp->copied_seq = seq;
1207*1da177e4SLinus Torvalds 
1208*1da177e4SLinus Torvalds 	tcp_rcv_space_adjust(sk);
1209*1da177e4SLinus Torvalds 
1210*1da177e4SLinus Torvalds 	/* Clean up data we have read: this will send ACKs if necessary. */
1211*1da177e4SLinus Torvalds 	if (copied)
1212*1da177e4SLinus Torvalds 		cleanup_rbuf(sk, copied);
1213*1da177e4SLinus Torvalds 	return copied;
1214*1da177e4SLinus Torvalds }
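
/*
 * Editorial note: a sketch of what a recv_actor callback might look
 * like, for a hypothetical consumer that copies into a flat buffer
 * carried in desc->arg.data. The callback reports how many bytes it
 * consumed; tcp_read_sock() advances copied_seq by that amount and
 * stops once desc->count is exhausted. Names here are illustrative.
 */
#if 0	/* example only, never compiled */
static int example_recv_actor(read_descriptor_t *desc, struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	char *dst = desc->arg.data;
	size_t want = min_t(size_t, len, desc->count);

	if (skb_copy_bits(skb, offset, dst, want))
		return 0;			/* consumed nothing */
	desc->arg.data = dst + want;
	desc->count -= want;
	return want;
}
#endif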
1215*1da177e4SLinus Torvalds 
1216*1da177e4SLinus Torvalds /*
1217*1da177e4SLinus Torvalds  *	This routine copies from a sock struct into the user buffer.
1218*1da177e4SLinus Torvalds  *
1219*1da177e4SLinus Torvalds  *	Technical note: since 2.3 we work on a _locked_ socket, so
1220*1da177e4SLinus Torvalds  *	tricks with *seq access order and skb->users are not required.
1221*1da177e4SLinus Torvalds  *	The code could probably be improved even further.
1222*1da177e4SLinus Torvalds  */
1223*1da177e4SLinus Torvalds 
1224*1da177e4SLinus Torvalds int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1225*1da177e4SLinus Torvalds 		size_t len, int nonblock, int flags, int *addr_len)
1226*1da177e4SLinus Torvalds {
1227*1da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
1228*1da177e4SLinus Torvalds 	int copied = 0;
1229*1da177e4SLinus Torvalds 	u32 peek_seq;
1230*1da177e4SLinus Torvalds 	u32 *seq;
1231*1da177e4SLinus Torvalds 	unsigned long used;
1232*1da177e4SLinus Torvalds 	int err;
1233*1da177e4SLinus Torvalds 	int target;		/* Read at least this many bytes */
1234*1da177e4SLinus Torvalds 	long timeo;
1235*1da177e4SLinus Torvalds 	struct task_struct *user_recv = NULL;
1236*1da177e4SLinus Torvalds 
1237*1da177e4SLinus Torvalds 	lock_sock(sk);
1238*1da177e4SLinus Torvalds 
1239*1da177e4SLinus Torvalds 	TCP_CHECK_TIMER(sk);
1240*1da177e4SLinus Torvalds 
1241*1da177e4SLinus Torvalds 	err = -ENOTCONN;
1242*1da177e4SLinus Torvalds 	if (sk->sk_state == TCP_LISTEN)
1243*1da177e4SLinus Torvalds 		goto out;
1244*1da177e4SLinus Torvalds 
1245*1da177e4SLinus Torvalds 	timeo = sock_rcvtimeo(sk, nonblock);
1246*1da177e4SLinus Torvalds 
1247*1da177e4SLinus Torvalds 	/* Urgent data needs to be handled specially. */
1248*1da177e4SLinus Torvalds 	if (flags & MSG_OOB)
1249*1da177e4SLinus Torvalds 		goto recv_urg;
1250*1da177e4SLinus Torvalds 
1251*1da177e4SLinus Torvalds 	seq = &tp->copied_seq;
1252*1da177e4SLinus Torvalds 	if (flags & MSG_PEEK) {
1253*1da177e4SLinus Torvalds 		peek_seq = tp->copied_seq;
1254*1da177e4SLinus Torvalds 		seq = &peek_seq;
1255*1da177e4SLinus Torvalds 	}
1256*1da177e4SLinus Torvalds 
1257*1da177e4SLinus Torvalds 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1258*1da177e4SLinus Torvalds 
1259*1da177e4SLinus Torvalds 	do {
1260*1da177e4SLinus Torvalds 		struct sk_buff *skb;
1261*1da177e4SLinus Torvalds 		u32 offset;
1262*1da177e4SLinus Torvalds 
1263*1da177e4SLinus Torvalds 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1264*1da177e4SLinus Torvalds 		if (tp->urg_data && tp->urg_seq == *seq) {
1265*1da177e4SLinus Torvalds 			if (copied)
1266*1da177e4SLinus Torvalds 				break;
1267*1da177e4SLinus Torvalds 			if (signal_pending(current)) {
1268*1da177e4SLinus Torvalds 				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1269*1da177e4SLinus Torvalds 				break;
1270*1da177e4SLinus Torvalds 			}
1271*1da177e4SLinus Torvalds 		}
1272*1da177e4SLinus Torvalds 
1273*1da177e4SLinus Torvalds 		/* Next get a buffer. */
1274*1da177e4SLinus Torvalds 
1275*1da177e4SLinus Torvalds 		skb = skb_peek(&sk->sk_receive_queue);
1276*1da177e4SLinus Torvalds 		do {
1277*1da177e4SLinus Torvalds 			if (!skb)
1278*1da177e4SLinus Torvalds 				break;
1279*1da177e4SLinus Torvalds 
1280*1da177e4SLinus Torvalds 			/* Now that we have two receive queues this
1281*1da177e4SLinus Torvalds 			 * shouldn't happen.
1282*1da177e4SLinus Torvalds 			 */
1283*1da177e4SLinus Torvalds 			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1284*1da177e4SLinus Torvalds 				printk(KERN_INFO "recvmsg bug: copied %X "
1285*1da177e4SLinus Torvalds 				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1286*1da177e4SLinus Torvalds 				break;
1287*1da177e4SLinus Torvalds 			}
1288*1da177e4SLinus Torvalds 			offset = *seq - TCP_SKB_CB(skb)->seq;
1289*1da177e4SLinus Torvalds 			if (skb->h.th->syn)
1290*1da177e4SLinus Torvalds 				offset--;
1291*1da177e4SLinus Torvalds 			if (offset < skb->len)
1292*1da177e4SLinus Torvalds 				goto found_ok_skb;
1293*1da177e4SLinus Torvalds 			if (skb->h.th->fin)
1294*1da177e4SLinus Torvalds 				goto found_fin_ok;
1295*1da177e4SLinus Torvalds 			BUG_TRAP(flags & MSG_PEEK);
1296*1da177e4SLinus Torvalds 			skb = skb->next;
1297*1da177e4SLinus Torvalds 		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1298*1da177e4SLinus Torvalds 
1299*1da177e4SLinus Torvalds 		/* Well, if we have a backlog, try to process it now. */
1300*1da177e4SLinus Torvalds 
1301*1da177e4SLinus Torvalds 		if (copied >= target && !sk->sk_backlog.tail)
1302*1da177e4SLinus Torvalds 			break;
1303*1da177e4SLinus Torvalds 
1304*1da177e4SLinus Torvalds 		if (copied) {
1305*1da177e4SLinus Torvalds 			if (sk->sk_err ||
1306*1da177e4SLinus Torvalds 			    sk->sk_state == TCP_CLOSE ||
1307*1da177e4SLinus Torvalds 			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1308*1da177e4SLinus Torvalds 			    !timeo ||
1309*1da177e4SLinus Torvalds 			    signal_pending(current) ||
1310*1da177e4SLinus Torvalds 			    (flags & MSG_PEEK))
1311*1da177e4SLinus Torvalds 				break;
1312*1da177e4SLinus Torvalds 		} else {
1313*1da177e4SLinus Torvalds 			if (sock_flag(sk, SOCK_DONE))
1314*1da177e4SLinus Torvalds 				break;
1315*1da177e4SLinus Torvalds 
1316*1da177e4SLinus Torvalds 			if (sk->sk_err) {
1317*1da177e4SLinus Torvalds 				copied = sock_error(sk);
1318*1da177e4SLinus Torvalds 				break;
1319*1da177e4SLinus Torvalds 			}
1320*1da177e4SLinus Torvalds 
1321*1da177e4SLinus Torvalds 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1322*1da177e4SLinus Torvalds 				break;
1323*1da177e4SLinus Torvalds 
1324*1da177e4SLinus Torvalds 			if (sk->sk_state == TCP_CLOSE) {
1325*1da177e4SLinus Torvalds 				if (!sock_flag(sk, SOCK_DONE)) {
1326*1da177e4SLinus Torvalds 					/* This occurs when user tries to read
1327*1da177e4SLinus Torvalds 					 * from never connected socket.
1328*1da177e4SLinus Torvalds 					 */
1329*1da177e4SLinus Torvalds 					copied = -ENOTCONN;
1330*1da177e4SLinus Torvalds 					break;
1331*1da177e4SLinus Torvalds 				}
1332*1da177e4SLinus Torvalds 				break;
1333*1da177e4SLinus Torvalds 			}
1334*1da177e4SLinus Torvalds 
1335*1da177e4SLinus Torvalds 			if (!timeo) {
1336*1da177e4SLinus Torvalds 				copied = -EAGAIN;
1337*1da177e4SLinus Torvalds 				break;
1338*1da177e4SLinus Torvalds 			}
1339*1da177e4SLinus Torvalds 
1340*1da177e4SLinus Torvalds 			if (signal_pending(current)) {
1341*1da177e4SLinus Torvalds 				copied = sock_intr_errno(timeo);
1342*1da177e4SLinus Torvalds 				break;
1343*1da177e4SLinus Torvalds 			}
1344*1da177e4SLinus Torvalds 		}
1345*1da177e4SLinus Torvalds 
1346*1da177e4SLinus Torvalds 		cleanup_rbuf(sk, copied);
1347*1da177e4SLinus Torvalds 
1348*1da177e4SLinus Torvalds 		if (tp->ucopy.task == user_recv) {
1349*1da177e4SLinus Torvalds 			/* Install new reader */
1350*1da177e4SLinus Torvalds 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1351*1da177e4SLinus Torvalds 				user_recv = current;
1352*1da177e4SLinus Torvalds 				tp->ucopy.task = user_recv;
1353*1da177e4SLinus Torvalds 				tp->ucopy.iov = msg->msg_iov;
1354*1da177e4SLinus Torvalds 			}
1355*1da177e4SLinus Torvalds 
1356*1da177e4SLinus Torvalds 			tp->ucopy.len = len;
1357*1da177e4SLinus Torvalds 
1358*1da177e4SLinus Torvalds 			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1359*1da177e4SLinus Torvalds 				 (flags & (MSG_PEEK | MSG_TRUNC)));
1360*1da177e4SLinus Torvalds 
1361*1da177e4SLinus Torvalds 			/* Ugly... If the prequeue is not empty, we have to
1362*1da177e4SLinus Torvalds 			 * process it before releasing the socket; otherwise
1363*1da177e4SLinus Torvalds 			 * ordering will be broken on the second iteration.
1364*1da177e4SLinus Torvalds 			 * A more elegant solution is required!!!
1365*1da177e4SLinus Torvalds 			 *
1366*1da177e4SLinus Torvalds 			 * Look: we have the following (pseudo)queues:
1367*1da177e4SLinus Torvalds 			 *
1368*1da177e4SLinus Torvalds 			 * 1. packets in flight
1369*1da177e4SLinus Torvalds 			 * 2. backlog
1370*1da177e4SLinus Torvalds 			 * 3. prequeue
1371*1da177e4SLinus Torvalds 			 * 4. receive_queue
1372*1da177e4SLinus Torvalds 			 *
1373*1da177e4SLinus Torvalds 			 * Each queue can be processed only if the next ones
1374*1da177e4SLinus Torvalds 			 * are empty. At this point the receive_queue is empty,
1375*1da177e4SLinus Torvalds 			 * but the prequeue _can_ be non-empty after the 2nd
1376*1da177e4SLinus Torvalds 			 * iteration, when we jumped to the start of the loop
1377*1da177e4SLinus Torvalds 			 * because backlog processing added something to the
1378*1da177e4SLinus Torvalds 			 * receive_queue. We cannot release_sock(), because the
1379*1da177e4SLinus Torvalds 			 * backlog contains packets that arrived _after_ the prequeued ones.
1380*1da177e4SLinus Torvalds 			 *
1381*1da177e4SLinus Torvalds 			 * In short, the algorithm is clear: process all the
1382*1da177e4SLinus Torvalds 			 * queues in order. We could do this more directly,
1383*1da177e4SLinus Torvalds 			 * requeueing packets from the backlog to the prequeue
1384*1da177e4SLinus Torvalds 			 * when it is not empty; that would be more elegant
1385*1da177e4SLinus Torvalds 			 * but, unfortunately, eats cycles.
1386*1da177e4SLinus Torvalds 			 */
1387*1da177e4SLinus Torvalds 			if (skb_queue_len(&tp->ucopy.prequeue))
1388*1da177e4SLinus Torvalds 				goto do_prequeue;
1389*1da177e4SLinus Torvalds 
1390*1da177e4SLinus Torvalds 			/* __ Set realtime policy in scheduler __ */
1391*1da177e4SLinus Torvalds 		}
1392*1da177e4SLinus Torvalds 
1393*1da177e4SLinus Torvalds 		if (copied >= target) {
1394*1da177e4SLinus Torvalds 			/* Do not sleep, just process backlog. */
1395*1da177e4SLinus Torvalds 			release_sock(sk);
1396*1da177e4SLinus Torvalds 			lock_sock(sk);
1397*1da177e4SLinus Torvalds 		} else
1398*1da177e4SLinus Torvalds 			sk_wait_data(sk, &timeo);
1399*1da177e4SLinus Torvalds 
1400*1da177e4SLinus Torvalds 		if (user_recv) {
1401*1da177e4SLinus Torvalds 			int chunk;
1402*1da177e4SLinus Torvalds 
1403*1da177e4SLinus Torvalds 			/* __ Restore normal policy in scheduler __ */
1404*1da177e4SLinus Torvalds 
1405*1da177e4SLinus Torvalds 			if ((chunk = len - tp->ucopy.len) != 0) {
1406*1da177e4SLinus Torvalds 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1407*1da177e4SLinus Torvalds 				len -= chunk;
1408*1da177e4SLinus Torvalds 				copied += chunk;
1409*1da177e4SLinus Torvalds 			}
1410*1da177e4SLinus Torvalds 
1411*1da177e4SLinus Torvalds 			if (tp->rcv_nxt == tp->copied_seq &&
1412*1da177e4SLinus Torvalds 			    skb_queue_len(&tp->ucopy.prequeue)) {
1413*1da177e4SLinus Torvalds do_prequeue:
1414*1da177e4SLinus Torvalds 				tcp_prequeue_process(sk);
1415*1da177e4SLinus Torvalds 
1416*1da177e4SLinus Torvalds 				if ((chunk = len - tp->ucopy.len) != 0) {
1417*1da177e4SLinus Torvalds 					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1418*1da177e4SLinus Torvalds 					len -= chunk;
1419*1da177e4SLinus Torvalds 					copied += chunk;
1420*1da177e4SLinus Torvalds 				}
1421*1da177e4SLinus Torvalds 			}
1422*1da177e4SLinus Torvalds 		}
1423*1da177e4SLinus Torvalds 		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1424*1da177e4SLinus Torvalds 			if (net_ratelimit())
1425*1da177e4SLinus Torvalds 				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1426*1da177e4SLinus Torvalds 				       current->comm, current->pid);
1427*1da177e4SLinus Torvalds 			peek_seq = tp->copied_seq;
1428*1da177e4SLinus Torvalds 		}
1429*1da177e4SLinus Torvalds 		continue;
1430*1da177e4SLinus Torvalds 
1431*1da177e4SLinus Torvalds 	found_ok_skb:
1432*1da177e4SLinus Torvalds 		/* Ok so how much can we use? */
1433*1da177e4SLinus Torvalds 		used = skb->len - offset;
1434*1da177e4SLinus Torvalds 		if (len < used)
1435*1da177e4SLinus Torvalds 			used = len;
1436*1da177e4SLinus Torvalds 
1437*1da177e4SLinus Torvalds 		/* Do we have urgent data here? */
1438*1da177e4SLinus Torvalds 		if (tp->urg_data) {
1439*1da177e4SLinus Torvalds 			u32 urg_offset = tp->urg_seq - *seq;
1440*1da177e4SLinus Torvalds 			if (urg_offset < used) {
1441*1da177e4SLinus Torvalds 				if (!urg_offset) {
1442*1da177e4SLinus Torvalds 					if (!sock_flag(sk, SOCK_URGINLINE)) {
1443*1da177e4SLinus Torvalds 						++*seq;
1444*1da177e4SLinus Torvalds 						offset++;
1445*1da177e4SLinus Torvalds 						used--;
1446*1da177e4SLinus Torvalds 						if (!used)
1447*1da177e4SLinus Torvalds 							goto skip_copy;
1448*1da177e4SLinus Torvalds 					}
1449*1da177e4SLinus Torvalds 				} else
1450*1da177e4SLinus Torvalds 					used = urg_offset;
1451*1da177e4SLinus Torvalds 			}
1452*1da177e4SLinus Torvalds 		}
1453*1da177e4SLinus Torvalds 
1454*1da177e4SLinus Torvalds 		if (!(flags & MSG_TRUNC)) {
1455*1da177e4SLinus Torvalds 			err = skb_copy_datagram_iovec(skb, offset,
1456*1da177e4SLinus Torvalds 						      msg->msg_iov, used);
1457*1da177e4SLinus Torvalds 			if (err) {
1458*1da177e4SLinus Torvalds 				/* Exception. Bailout! */
1459*1da177e4SLinus Torvalds 				if (!copied)
1460*1da177e4SLinus Torvalds 					copied = -EFAULT;
1461*1da177e4SLinus Torvalds 				break;
1462*1da177e4SLinus Torvalds 			}
1463*1da177e4SLinus Torvalds 		}
1464*1da177e4SLinus Torvalds 
1465*1da177e4SLinus Torvalds 		*seq += used;
1466*1da177e4SLinus Torvalds 		copied += used;
1467*1da177e4SLinus Torvalds 		len -= used;
1468*1da177e4SLinus Torvalds 
1469*1da177e4SLinus Torvalds 		tcp_rcv_space_adjust(sk);
1470*1da177e4SLinus Torvalds 
1471*1da177e4SLinus Torvalds skip_copy:
1472*1da177e4SLinus Torvalds 		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1473*1da177e4SLinus Torvalds 			tp->urg_data = 0;
1474*1da177e4SLinus Torvalds 			tcp_fast_path_check(sk, tp);
1475*1da177e4SLinus Torvalds 		}
1476*1da177e4SLinus Torvalds 		if (used + offset < skb->len)
1477*1da177e4SLinus Torvalds 			continue;
1478*1da177e4SLinus Torvalds 
1479*1da177e4SLinus Torvalds 		if (skb->h.th->fin)
1480*1da177e4SLinus Torvalds 			goto found_fin_ok;
1481*1da177e4SLinus Torvalds 		if (!(flags & MSG_PEEK))
1482*1da177e4SLinus Torvalds 			sk_eat_skb(sk, skb);
1483*1da177e4SLinus Torvalds 		continue;
1484*1da177e4SLinus Torvalds 
1485*1da177e4SLinus Torvalds 	found_fin_ok:
1486*1da177e4SLinus Torvalds 		/* Process the FIN. */
1487*1da177e4SLinus Torvalds 		++*seq;
1488*1da177e4SLinus Torvalds 		if (!(flags & MSG_PEEK))
1489*1da177e4SLinus Torvalds 			sk_eat_skb(sk, skb);
1490*1da177e4SLinus Torvalds 		break;
1491*1da177e4SLinus Torvalds 	} while (len > 0);
1492*1da177e4SLinus Torvalds 
1493*1da177e4SLinus Torvalds 	if (user_recv) {
1494*1da177e4SLinus Torvalds 		if (skb_queue_len(&tp->ucopy.prequeue)) {
1495*1da177e4SLinus Torvalds 			int chunk;
1496*1da177e4SLinus Torvalds 
1497*1da177e4SLinus Torvalds 			tp->ucopy.len = copied > 0 ? len : 0;
1498*1da177e4SLinus Torvalds 
1499*1da177e4SLinus Torvalds 			tcp_prequeue_process(sk);
1500*1da177e4SLinus Torvalds 
1501*1da177e4SLinus Torvalds 			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1502*1da177e4SLinus Torvalds 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1503*1da177e4SLinus Torvalds 				len -= chunk;
1504*1da177e4SLinus Torvalds 				copied += chunk;
1505*1da177e4SLinus Torvalds 			}
1506*1da177e4SLinus Torvalds 		}
1507*1da177e4SLinus Torvalds 
1508*1da177e4SLinus Torvalds 		tp->ucopy.task = NULL;
1509*1da177e4SLinus Torvalds 		tp->ucopy.len = 0;
1510*1da177e4SLinus Torvalds 	}
1511*1da177e4SLinus Torvalds 
1512*1da177e4SLinus Torvalds 	/* According to UNIX98, msg_name/msg_namelen are ignored
1513*1da177e4SLinus Torvalds 	 * on a connected socket. I was just happy when I found this 8) --ANK
1514*1da177e4SLinus Torvalds 	 */
1515*1da177e4SLinus Torvalds 
1516*1da177e4SLinus Torvalds 	/* Clean up data we have read: this will send ACKs if necessary. */
1517*1da177e4SLinus Torvalds 	cleanup_rbuf(sk, copied);
1518*1da177e4SLinus Torvalds 
1519*1da177e4SLinus Torvalds 	TCP_CHECK_TIMER(sk);
1520*1da177e4SLinus Torvalds 	release_sock(sk);
1521*1da177e4SLinus Torvalds 	return copied;
1522*1da177e4SLinus Torvalds 
1523*1da177e4SLinus Torvalds out:
1524*1da177e4SLinus Torvalds 	TCP_CHECK_TIMER(sk);
1525*1da177e4SLinus Torvalds 	release_sock(sk);
1526*1da177e4SLinus Torvalds 	return err;
1527*1da177e4SLinus Torvalds 
1528*1da177e4SLinus Torvalds recv_urg:
1529*1da177e4SLinus Torvalds 	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1530*1da177e4SLinus Torvalds 	goto out;
1531*1da177e4SLinus Torvalds }
1532*1da177e4SLinus Torvalds 
1533*1da177e4SLinus Torvalds /*
1534*1da177e4SLinus Torvalds  *	State processing on a close. This implements the state shift for
1535*1da177e4SLinus Torvalds  *	sending our FIN frame. Note that we only send a FIN for some
1536*1da177e4SLinus Torvalds  *	states. A shutdown() may have already sent the FIN, or we may be
1537*1da177e4SLinus Torvalds  *	closed.
1538*1da177e4SLinus Torvalds  */
1539*1da177e4SLinus Torvalds 
1540*1da177e4SLinus Torvalds static unsigned char new_state[16] = {
1541*1da177e4SLinus Torvalds   /* current state:        new state:      action:	*/
1542*1da177e4SLinus Torvalds   /* (Invalid)		*/ TCP_CLOSE,
1543*1da177e4SLinus Torvalds   /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1544*1da177e4SLinus Torvalds   /* TCP_SYN_SENT	*/ TCP_CLOSE,
1545*1da177e4SLinus Torvalds   /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1546*1da177e4SLinus Torvalds   /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1547*1da177e4SLinus Torvalds   /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1548*1da177e4SLinus Torvalds   /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1549*1da177e4SLinus Torvalds   /* TCP_CLOSE		*/ TCP_CLOSE,
1550*1da177e4SLinus Torvalds   /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1551*1da177e4SLinus Torvalds   /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1552*1da177e4SLinus Torvalds   /* TCP_LISTEN		*/ TCP_CLOSE,
1553*1da177e4SLinus Torvalds   /* TCP_CLOSING	*/ TCP_CLOSING,
1554*1da177e4SLinus Torvalds };
1555*1da177e4SLinus Torvalds 
1556*1da177e4SLinus Torvalds static int tcp_close_state(struct sock *sk)
1557*1da177e4SLinus Torvalds {
1558*1da177e4SLinus Torvalds 	int next = (int)new_state[sk->sk_state];
1559*1da177e4SLinus Torvalds 	int ns = next & TCP_STATE_MASK;
1560*1da177e4SLinus Torvalds 
1561*1da177e4SLinus Torvalds 	tcp_set_state(sk, ns);
1562*1da177e4SLinus Torvalds 
1563*1da177e4SLinus Torvalds 	return next & TCP_ACTION_FIN;
1564*1da177e4SLinus Torvalds }
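
/*
 * Editorial worked example of the encoding above: each new_state[]
 * entry packs the successor state in TCP_STATE_MASK and an optional
 * TCP_ACTION_FIN flag in a high bit, so one table lookup yields both
 * the transition and whether a FIN is owed. From TCP_CLOSE_WAIT:
 *
 *	next = new_state[TCP_CLOSE_WAIT]; // TCP_LAST_ACK | TCP_ACTION_FIN
 *	ns   = next & TCP_STATE_MASK;	  // TCP_LAST_ACK
 *	fin  = next & TCP_ACTION_FIN;	  // non-zero: send a FIN
 */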
1565*1da177e4SLinus Torvalds 
1566*1da177e4SLinus Torvalds /*
1567*1da177e4SLinus Torvalds  *	Shutdown the sending side of a connection. Much like close except
1568*1da177e4SLinus Torvalds  *	that we don't shut down receiving or set sock_flag(sk, SOCK_DEAD).
1569*1da177e4SLinus Torvalds  */
1570*1da177e4SLinus Torvalds 
1571*1da177e4SLinus Torvalds void tcp_shutdown(struct sock *sk, int how)
1572*1da177e4SLinus Torvalds {
1573*1da177e4SLinus Torvalds 	/*	We need to grab some memory, and put together a FIN,
1574*1da177e4SLinus Torvalds 	 *	and then put it into the queue to be sent.
1575*1da177e4SLinus Torvalds 	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1576*1da177e4SLinus Torvalds 	 */
1577*1da177e4SLinus Torvalds 	if (!(how & SEND_SHUTDOWN))
1578*1da177e4SLinus Torvalds 		return;
1579*1da177e4SLinus Torvalds 
1580*1da177e4SLinus Torvalds 	/* If we've already sent a FIN, or it's a closed state, skip this. */
1581*1da177e4SLinus Torvalds 	if ((1 << sk->sk_state) &
1582*1da177e4SLinus Torvalds 	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1583*1da177e4SLinus Torvalds 	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1584*1da177e4SLinus Torvalds 		/* Clear out any half completed packets.  FIN if needed. */
1585*1da177e4SLinus Torvalds 		if (tcp_close_state(sk))
1586*1da177e4SLinus Torvalds 			tcp_send_fin(sk);
1587*1da177e4SLinus Torvalds 	}
1588*1da177e4SLinus Torvalds }
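
/*
 * Editorial note: the userspace view of the above, as a sketch. A
 * half-close with shutdown(fd, SHUT_WR) sends our FIN (through
 * tcp_shutdown() -> tcp_send_fin()) while leaving the receive side
 * open, so the peer's remaining data can still be drained until
 * read() returns 0. Illustration only.
 */
#if 0	/* example only, never compiled */
#include <sys/socket.h>
#include <unistd.h>

static void half_close_and_drain(int fd)
{
	char buf[4096];

	shutdown(fd, SHUT_WR);			/* our FIN goes out */
	while (read(fd, buf, sizeof(buf)) > 0)
		;				/* read until peer's FIN */
	close(fd);
}
#endif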
1589*1da177e4SLinus Torvalds 
1590*1da177e4SLinus Torvalds /*
1591*1da177e4SLinus Torvalds  * At this point, there should be no process reference to this
1592*1da177e4SLinus Torvalds  * socket, and thus no user references at all.  Therefore we
1593*1da177e4SLinus Torvalds  * can assume the socket waitqueue is inactive and nobody will
1594*1da177e4SLinus Torvalds  * try to jump onto it.
1595*1da177e4SLinus Torvalds  */
1596*1da177e4SLinus Torvalds void tcp_destroy_sock(struct sock *sk)
1597*1da177e4SLinus Torvalds {
1598*1da177e4SLinus Torvalds 	BUG_TRAP(sk->sk_state == TCP_CLOSE);
1599*1da177e4SLinus Torvalds 	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1600*1da177e4SLinus Torvalds 
1601*1da177e4SLinus Torvalds 	/* It cannot be in hash table! */
1602*1da177e4SLinus Torvalds 	BUG_TRAP(sk_unhashed(sk));
1603*1da177e4SLinus Torvalds 
1604*1da177e4SLinus Torvalds 	/* If inet_sk(sk)->num is non-zero, it must be bound */
1605*1da177e4SLinus Torvalds 	BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1606*1da177e4SLinus Torvalds 
1607*1da177e4SLinus Torvalds 	sk->sk_prot->destroy(sk);
1608*1da177e4SLinus Torvalds 
1609*1da177e4SLinus Torvalds 	sk_stream_kill_queues(sk);
1610*1da177e4SLinus Torvalds 
1611*1da177e4SLinus Torvalds 	xfrm_sk_free_policy(sk);
1612*1da177e4SLinus Torvalds 
1613*1da177e4SLinus Torvalds #ifdef INET_REFCNT_DEBUG
1614*1da177e4SLinus Torvalds 	if (atomic_read(&sk->sk_refcnt) != 1) {
1615*1da177e4SLinus Torvalds 		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1616*1da177e4SLinus Torvalds 		       sk, atomic_read(&sk->sk_refcnt));
1617*1da177e4SLinus Torvalds 	}
1618*1da177e4SLinus Torvalds #endif
1619*1da177e4SLinus Torvalds 
1620*1da177e4SLinus Torvalds 	atomic_dec(&tcp_orphan_count);
1621*1da177e4SLinus Torvalds 	sock_put(sk);
1622*1da177e4SLinus Torvalds }
1623*1da177e4SLinus Torvalds 
1624*1da177e4SLinus Torvalds void tcp_close(struct sock *sk, long timeout)
1625*1da177e4SLinus Torvalds {
1626*1da177e4SLinus Torvalds 	struct sk_buff *skb;
1627*1da177e4SLinus Torvalds 	int data_was_unread = 0;
1628*1da177e4SLinus Torvalds 
1629*1da177e4SLinus Torvalds 	lock_sock(sk);
1630*1da177e4SLinus Torvalds 	sk->sk_shutdown = SHUTDOWN_MASK;
1631*1da177e4SLinus Torvalds 
1632*1da177e4SLinus Torvalds 	if (sk->sk_state == TCP_LISTEN) {
1633*1da177e4SLinus Torvalds 		tcp_set_state(sk, TCP_CLOSE);
1634*1da177e4SLinus Torvalds 
1635*1da177e4SLinus Torvalds 		/* Special case. */
1636*1da177e4SLinus Torvalds 		tcp_listen_stop(sk);
1637*1da177e4SLinus Torvalds 
1638*1da177e4SLinus Torvalds 		goto adjudge_to_death;
1639*1da177e4SLinus Torvalds 	}
1640*1da177e4SLinus Torvalds 
1641*1da177e4SLinus Torvalds 	/*  We need to flush the recv. buffs.  We do this only on the
1642*1da177e4SLinus Torvalds 	 *  descriptor close, not protocol-sourced closes, because the
1643*1da177e4SLinus Torvalds 	 *  reader process may not have drained the data yet!
1644*1da177e4SLinus Torvalds 	 */
1645*1da177e4SLinus Torvalds 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1646*1da177e4SLinus Torvalds 		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1647*1da177e4SLinus Torvalds 			  skb->h.th->fin;
1648*1da177e4SLinus Torvalds 		data_was_unread += len;
1649*1da177e4SLinus Torvalds 		__kfree_skb(skb);
1650*1da177e4SLinus Torvalds 	}
1651*1da177e4SLinus Torvalds 
1652*1da177e4SLinus Torvalds 	sk_stream_mem_reclaim(sk);
1653*1da177e4SLinus Torvalds 
1654*1da177e4SLinus Torvalds 	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1655*1da177e4SLinus Torvalds 	 * 3.10, we send a RST here because data was lost.  To
1656*1da177e4SLinus Torvalds 	 * witness the awful effects of the old behavior of always
1657*1da177e4SLinus Torvalds 	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1658*1da177e4SLinus Torvalds 	 * a bulk GET in an FTP client, suspend the process, wait
1659*1da177e4SLinus Torvalds 	 * for the client to advertise a zero window, then kill -9
1660*1da177e4SLinus Torvalds 	 * the FTP client, wheee...  Note: timeout is always zero
1661*1da177e4SLinus Torvalds 	 * in such a case.
1662*1da177e4SLinus Torvalds 	 */
1663*1da177e4SLinus Torvalds 	if (data_was_unread) {
1664*1da177e4SLinus Torvalds 		/* Unread data was tossed, zap the connection. */
1665*1da177e4SLinus Torvalds 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1666*1da177e4SLinus Torvalds 		tcp_set_state(sk, TCP_CLOSE);
1667*1da177e4SLinus Torvalds 		tcp_send_active_reset(sk, GFP_KERNEL);
1668*1da177e4SLinus Torvalds 	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1669*1da177e4SLinus Torvalds 		/* Check zero linger _after_ checking for unread data. */
1670*1da177e4SLinus Torvalds 		sk->sk_prot->disconnect(sk, 0);
1671*1da177e4SLinus Torvalds 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1672*1da177e4SLinus Torvalds 	} else if (tcp_close_state(sk)) {
1673*1da177e4SLinus Torvalds 		/* We FIN if the application ate all the data before
1674*1da177e4SLinus Torvalds 		 * zapping the connection.
1675*1da177e4SLinus Torvalds 		 */
1676*1da177e4SLinus Torvalds 
1677*1da177e4SLinus Torvalds 		/* RED-PEN. Formally speaking, we have broken TCP state
1678*1da177e4SLinus Torvalds 		 * machine. State transitions:
1679*1da177e4SLinus Torvalds 		 *
1680*1da177e4SLinus Torvalds 		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1681*1da177e4SLinus Torvalds 		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1682*1da177e4SLinus Torvalds 		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1683*1da177e4SLinus Torvalds 		 *
1684*1da177e4SLinus Torvalds 		 * are legal only when FIN has been sent (i.e. in window),
1685*1da177e4SLinus Torvalds 		 * rather than queued out of window. Purists blame.
1686*1da177e4SLinus Torvalds 		 *
1687*1da177e4SLinus Torvalds 		 * F.e. "RFC state" is ESTABLISHED,
1688*1da177e4SLinus Torvalds 		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1689*1da177e4SLinus Torvalds 		 *
1690*1da177e4SLinus Torvalds 		 * The visible declinations are that sometimes
1691*1da177e4SLinus Torvalds 		 * we enter time-wait state, when it is not required really
1692*1da177e4SLinus Torvalds 		 * (harmless), do not send active resets, when they are
1693*1da177e4SLinus Torvalds 		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1694*1da177e4SLinus Torvalds 		 * they look as CLOSING or LAST_ACK for Linux)
1695*1da177e4SLinus Torvalds 		 * Probably, I missed some more holelets.
1696*1da177e4SLinus Torvalds 		 * 						--ANK
1697*1da177e4SLinus Torvalds 		 */
1698*1da177e4SLinus Torvalds 		tcp_send_fin(sk);
1699*1da177e4SLinus Torvalds 	}
1700*1da177e4SLinus Torvalds 
1701*1da177e4SLinus Torvalds 	sk_stream_wait_close(sk, timeout);
1702*1da177e4SLinus Torvalds 
1703*1da177e4SLinus Torvalds adjudge_to_death:
1704*1da177e4SLinus Torvalds 	/* It is the last release_sock in its life. It will remove backlog. */
1705*1da177e4SLinus Torvalds 	release_sock(sk);
1706*1da177e4SLinus Torvalds 
1707*1da177e4SLinus Torvalds 
1708*1da177e4SLinus Torvalds 	/* Now socket is owned by kernel and we acquire BH lock
1709*1da177e4SLinus Torvalds 	   to finish close. No need to check for user refs.
1710*1da177e4SLinus Torvalds 	 */
1711*1da177e4SLinus Torvalds 	local_bh_disable();
1712*1da177e4SLinus Torvalds 	bh_lock_sock(sk);
1713*1da177e4SLinus Torvalds 	BUG_TRAP(!sock_owned_by_user(sk));
1714*1da177e4SLinus Torvalds 
1715*1da177e4SLinus Torvalds 	sock_hold(sk);
1716*1da177e4SLinus Torvalds 	sock_orphan(sk);
1717*1da177e4SLinus Torvalds 
1718*1da177e4SLinus Torvalds 	/*	This is a (useful) BSD-violating departure from the RFC. There
1719*1da177e4SLinus Torvalds 	 *	is a problem with TCP as specified, in that the other end could
1720*1da177e4SLinus Torvalds 	 *	keep a socket open forever with no application left at this end.
1721*1da177e4SLinus Torvalds 	 *	We use a 3 minute timeout (about the same as BSD) and then kill
1722*1da177e4SLinus Torvalds 	 *	our end. If they send after that then tough - BUT: long enough
1723*1da177e4SLinus Torvalds 	 *	that we won't repeat the old "4*rto = almost no time - whoops,
1724*1da177e4SLinus Torvalds 	 *	reset" mistake.
1725*1da177e4SLinus Torvalds 	 *
1726*1da177e4SLinus Torvalds 	 *	Nope, it was not a mistake. It is really the desired behaviour,
1727*1da177e4SLinus Torvalds 	 *	e.g. on HTTP servers, where such sockets are useless but
1728*1da177e4SLinus Torvalds 	 *	consume significant resources. Let's handle it with the special
1729*1da177e4SLinus Torvalds 	 *	linger2 option.					--ANK
1730*1da177e4SLinus Torvalds 	 */
1731*1da177e4SLinus Torvalds 
1732*1da177e4SLinus Torvalds 	if (sk->sk_state == TCP_FIN_WAIT2) {
1733*1da177e4SLinus Torvalds 		struct tcp_sock *tp = tcp_sk(sk);
1734*1da177e4SLinus Torvalds 		if (tp->linger2 < 0) {
1735*1da177e4SLinus Torvalds 			tcp_set_state(sk, TCP_CLOSE);
1736*1da177e4SLinus Torvalds 			tcp_send_active_reset(sk, GFP_ATOMIC);
1737*1da177e4SLinus Torvalds 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1738*1da177e4SLinus Torvalds 		} else {
1739*1da177e4SLinus Torvalds 			int tmo = tcp_fin_time(tp);
1740*1da177e4SLinus Torvalds 
1741*1da177e4SLinus Torvalds 			if (tmo > TCP_TIMEWAIT_LEN) {
1742*1da177e4SLinus Torvalds 				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1743*1da177e4SLinus Torvalds 			} else {
1744*1da177e4SLinus Torvalds 				atomic_inc(&tcp_orphan_count);
1745*1da177e4SLinus Torvalds 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1746*1da177e4SLinus Torvalds 				goto out;
1747*1da177e4SLinus Torvalds 			}
1748*1da177e4SLinus Torvalds 		}
1749*1da177e4SLinus Torvalds 	}
1750*1da177e4SLinus Torvalds 	if (sk->sk_state != TCP_CLOSE) {
1751*1da177e4SLinus Torvalds 		sk_stream_mem_reclaim(sk);
1752*1da177e4SLinus Torvalds 		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1753*1da177e4SLinus Torvalds 		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1754*1da177e4SLinus Torvalds 		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1755*1da177e4SLinus Torvalds 			if (net_ratelimit())
1756*1da177e4SLinus Torvalds 				printk(KERN_INFO "TCP: too many orphaned "
1757*1da177e4SLinus Torvalds 				       "sockets\n");
1758*1da177e4SLinus Torvalds 			tcp_set_state(sk, TCP_CLOSE);
1759*1da177e4SLinus Torvalds 			tcp_send_active_reset(sk, GFP_ATOMIC);
1760*1da177e4SLinus Torvalds 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1761*1da177e4SLinus Torvalds 		}
1762*1da177e4SLinus Torvalds 	}
1763*1da177e4SLinus Torvalds 	atomic_inc(&tcp_orphan_count);
1764*1da177e4SLinus Torvalds 
1765*1da177e4SLinus Torvalds 	if (sk->sk_state == TCP_CLOSE)
1766*1da177e4SLinus Torvalds 		tcp_destroy_sock(sk);
1767*1da177e4SLinus Torvalds 	/* Otherwise, socket is reprieved until protocol close. */
1768*1da177e4SLinus Torvalds 
1769*1da177e4SLinus Torvalds out:
1770*1da177e4SLinus Torvalds 	bh_unlock_sock(sk);
1771*1da177e4SLinus Torvalds 	local_bh_enable();
1772*1da177e4SLinus Torvalds 	sock_put(sk);
1773*1da177e4SLinus Torvalds }
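
/*
 * Editorial note: the linger2 knob discussed above, from userspace,
 * as a sketch. TCP_LINGER2 bounds how long an orphaned socket may
 * stay in FIN_WAIT2; a negative value makes tcp_close() reset the
 * connection instead of waiting. See the TCP_LINGER2 case in
 * tcp_setsockopt() further down for the clamping rules.
 */
#if 0	/* example only, never compiled */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void bound_fin_wait2(int fd, int seconds)
{
	setsockopt(fd, IPPROTO_TCP, TCP_LINGER2,
		   &seconds, sizeof(seconds));
}
#endif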
1774*1da177e4SLinus Torvalds 
1775*1da177e4SLinus Torvalds /* These states need RST on ABORT according to RFC793 */
1776*1da177e4SLinus Torvalds 
1777*1da177e4SLinus Torvalds static inline int tcp_need_reset(int state)
1778*1da177e4SLinus Torvalds {
1779*1da177e4SLinus Torvalds 	return (1 << state) &
1780*1da177e4SLinus Torvalds 	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1781*1da177e4SLinus Torvalds 		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1782*1da177e4SLinus Torvalds }
1783*1da177e4SLinus Torvalds 
1784*1da177e4SLinus Torvalds int tcp_disconnect(struct sock *sk, int flags)
1785*1da177e4SLinus Torvalds {
1786*1da177e4SLinus Torvalds 	struct inet_sock *inet = inet_sk(sk);
1787*1da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
1788*1da177e4SLinus Torvalds 	int err = 0;
1789*1da177e4SLinus Torvalds 	int old_state = sk->sk_state;
1790*1da177e4SLinus Torvalds 
1791*1da177e4SLinus Torvalds 	if (old_state != TCP_CLOSE)
1792*1da177e4SLinus Torvalds 		tcp_set_state(sk, TCP_CLOSE);
1793*1da177e4SLinus Torvalds 
1794*1da177e4SLinus Torvalds 	/* ABORT function of RFC793 */
1795*1da177e4SLinus Torvalds 	if (old_state == TCP_LISTEN) {
1796*1da177e4SLinus Torvalds 		tcp_listen_stop(sk);
1797*1da177e4SLinus Torvalds 	} else if (tcp_need_reset(old_state) ||
1798*1da177e4SLinus Torvalds 		   (tp->snd_nxt != tp->write_seq &&
1799*1da177e4SLinus Torvalds 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1800*1da177e4SLinus Torvalds 		/* The last check adjusts for the discrepancy of Linux
1801*1da177e4SLinus Torvalds 		 * wrt. the RFC states
1802*1da177e4SLinus Torvalds 		 */
1803*1da177e4SLinus Torvalds 		tcp_send_active_reset(sk, gfp_any());
1804*1da177e4SLinus Torvalds 		sk->sk_err = ECONNRESET;
1805*1da177e4SLinus Torvalds 	} else if (old_state == TCP_SYN_SENT)
1806*1da177e4SLinus Torvalds 		sk->sk_err = ECONNRESET;
1807*1da177e4SLinus Torvalds 
1808*1da177e4SLinus Torvalds 	tcp_clear_xmit_timers(sk);
1809*1da177e4SLinus Torvalds 	__skb_queue_purge(&sk->sk_receive_queue);
1810*1da177e4SLinus Torvalds 	sk_stream_writequeue_purge(sk);
1811*1da177e4SLinus Torvalds 	__skb_queue_purge(&tp->out_of_order_queue);
1812*1da177e4SLinus Torvalds 
1813*1da177e4SLinus Torvalds 	inet->dport = 0;
1814*1da177e4SLinus Torvalds 
1815*1da177e4SLinus Torvalds 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1816*1da177e4SLinus Torvalds 		inet_reset_saddr(sk);
1817*1da177e4SLinus Torvalds 
1818*1da177e4SLinus Torvalds 	sk->sk_shutdown = 0;
1819*1da177e4SLinus Torvalds 	sock_reset_flag(sk, SOCK_DONE);
1820*1da177e4SLinus Torvalds 	tp->srtt = 0;
1821*1da177e4SLinus Torvalds 	if ((tp->write_seq += tp->max_window + 2) == 0)
1822*1da177e4SLinus Torvalds 		tp->write_seq = 1;
1823*1da177e4SLinus Torvalds 	tp->backoff = 0;
1824*1da177e4SLinus Torvalds 	tp->snd_cwnd = 2;
1825*1da177e4SLinus Torvalds 	tp->probes_out = 0;
1826*1da177e4SLinus Torvalds 	tp->packets_out = 0;
1827*1da177e4SLinus Torvalds 	tp->snd_ssthresh = 0x7fffffff;
1828*1da177e4SLinus Torvalds 	tp->snd_cwnd_cnt = 0;
1829*1da177e4SLinus Torvalds 	tcp_set_ca_state(tp, TCP_CA_Open);
1830*1da177e4SLinus Torvalds 	tcp_clear_retrans(tp);
1831*1da177e4SLinus Torvalds 	tcp_delack_init(tp);
1832*1da177e4SLinus Torvalds 	sk->sk_send_head = NULL;
1833*1da177e4SLinus Torvalds 	tp->rx_opt.saw_tstamp = 0;
1834*1da177e4SLinus Torvalds 	tcp_sack_reset(&tp->rx_opt);
1835*1da177e4SLinus Torvalds 	__sk_dst_reset(sk);
1836*1da177e4SLinus Torvalds 
1837*1da177e4SLinus Torvalds 	BUG_TRAP(!inet->num || tp->bind_hash);
1838*1da177e4SLinus Torvalds 
1839*1da177e4SLinus Torvalds 	sk->sk_error_report(sk);
1840*1da177e4SLinus Torvalds 	return err;
1841*1da177e4SLinus Torvalds }
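
/*
 * Editorial note: a userspace sketch of how this path is commonly
 * reached. Connecting a TCP socket to an AF_UNSPEC address is the
 * documented way to disassociate it; inet_stream_connect() turns
 * that into a call to tcp_disconnect(). Illustration only.
 */
#if 0	/* example only, never compiled */
#include <string.h>
#include <sys/socket.h>

static int tcp_abort_connection(int fd)
{
	struct sockaddr sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_UNSPEC;
	return connect(fd, &sa, sizeof(sa));	/* -> tcp_disconnect() */
}
#endif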
1842*1da177e4SLinus Torvalds 
1843*1da177e4SLinus Torvalds /*
1844*1da177e4SLinus Torvalds  *	Wait for an incoming connection, avoid race
1845*1da177e4SLinus Torvalds  *	conditions. This must be called with the socket locked.
1846*1da177e4SLinus Torvalds  */
1847*1da177e4SLinus Torvalds static int wait_for_connect(struct sock *sk, long timeo)
1848*1da177e4SLinus Torvalds {
1849*1da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
1850*1da177e4SLinus Torvalds 	DEFINE_WAIT(wait);
1851*1da177e4SLinus Torvalds 	int err;
1852*1da177e4SLinus Torvalds 
1853*1da177e4SLinus Torvalds 	/*
1854*1da177e4SLinus Torvalds 	 * True wake-one mechanism for incoming connections: only
1855*1da177e4SLinus Torvalds 	 * one process gets woken up, not the 'whole herd'.
1856*1da177e4SLinus Torvalds 	 * Since we do not 'race & poll' for established sockets
1857*1da177e4SLinus Torvalds 	 * anymore, the common case will execute the loop only once.
1858*1da177e4SLinus Torvalds 	 *
1859*1da177e4SLinus Torvalds 	 * Subtle issue: "add_wait_queue_exclusive()" will be added
1860*1da177e4SLinus Torvalds 	 * after any current non-exclusive waiters, and we know that
1861*1da177e4SLinus Torvalds 	 * it will always _stay_ after any new non-exclusive waiters
1862*1da177e4SLinus Torvalds 	 * because all non-exclusive waiters are added at the
1863*1da177e4SLinus Torvalds 	 * beginning of the wait-queue. As such, it's ok to "drop"
1864*1da177e4SLinus Torvalds 	 * our exclusiveness temporarily when we get woken up without
1865*1da177e4SLinus Torvalds 	 * having to remove and re-insert us on the wait queue.
1866*1da177e4SLinus Torvalds 	 */
1867*1da177e4SLinus Torvalds 	for (;;) {
1868*1da177e4SLinus Torvalds 		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1869*1da177e4SLinus Torvalds 					  TASK_INTERRUPTIBLE);
1870*1da177e4SLinus Torvalds 		release_sock(sk);
1871*1da177e4SLinus Torvalds 		if (!tp->accept_queue)
1872*1da177e4SLinus Torvalds 			timeo = schedule_timeout(timeo);
1873*1da177e4SLinus Torvalds 		lock_sock(sk);
1874*1da177e4SLinus Torvalds 		err = 0;
1875*1da177e4SLinus Torvalds 		if (tp->accept_queue)
1876*1da177e4SLinus Torvalds 			break;
1877*1da177e4SLinus Torvalds 		err = -EINVAL;
1878*1da177e4SLinus Torvalds 		if (sk->sk_state != TCP_LISTEN)
1879*1da177e4SLinus Torvalds 			break;
1880*1da177e4SLinus Torvalds 		err = sock_intr_errno(timeo);
1881*1da177e4SLinus Torvalds 		if (signal_pending(current))
1882*1da177e4SLinus Torvalds 			break;
1883*1da177e4SLinus Torvalds 		err = -EAGAIN;
1884*1da177e4SLinus Torvalds 		if (!timeo)
1885*1da177e4SLinus Torvalds 			break;
1886*1da177e4SLinus Torvalds 	}
1887*1da177e4SLinus Torvalds 	finish_wait(sk->sk_sleep, &wait);
1888*1da177e4SLinus Torvalds 	return err;
1889*1da177e4SLinus Torvalds }
1890*1da177e4SLinus Torvalds 
1891*1da177e4SLinus Torvalds /*
1892*1da177e4SLinus Torvalds  *	This will accept the next outstanding connection.
1893*1da177e4SLinus Torvalds  */
1894*1da177e4SLinus Torvalds 
1895*1da177e4SLinus Torvalds struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1896*1da177e4SLinus Torvalds {
1897*1da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
1898*1da177e4SLinus Torvalds 	struct open_request *req;
1899*1da177e4SLinus Torvalds 	struct sock *newsk;
1900*1da177e4SLinus Torvalds 	int error;
1901*1da177e4SLinus Torvalds 
1902*1da177e4SLinus Torvalds 	lock_sock(sk);
1903*1da177e4SLinus Torvalds 
1904*1da177e4SLinus Torvalds 	/* We need to make sure that this socket is listening,
1905*1da177e4SLinus Torvalds 	 * and that it has something pending.
1906*1da177e4SLinus Torvalds 	 */
1907*1da177e4SLinus Torvalds 	error = -EINVAL;
1908*1da177e4SLinus Torvalds 	if (sk->sk_state != TCP_LISTEN)
1909*1da177e4SLinus Torvalds 		goto out;
1910*1da177e4SLinus Torvalds 
1911*1da177e4SLinus Torvalds 	/* Find already established connection */
1912*1da177e4SLinus Torvalds 	if (!tp->accept_queue) {
1913*1da177e4SLinus Torvalds 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1914*1da177e4SLinus Torvalds 
1915*1da177e4SLinus Torvalds 		/* If this is a non-blocking socket, don't sleep */
1916*1da177e4SLinus Torvalds 		error = -EAGAIN;
1917*1da177e4SLinus Torvalds 		if (!timeo)
1918*1da177e4SLinus Torvalds 			goto out;
1919*1da177e4SLinus Torvalds 
1920*1da177e4SLinus Torvalds 		error = wait_for_connect(sk, timeo);
1921*1da177e4SLinus Torvalds 		if (error)
1922*1da177e4SLinus Torvalds 			goto out;
1923*1da177e4SLinus Torvalds 	}
1924*1da177e4SLinus Torvalds 
1925*1da177e4SLinus Torvalds 	req = tp->accept_queue;
1926*1da177e4SLinus Torvalds 	if ((tp->accept_queue = req->dl_next) == NULL)
1927*1da177e4SLinus Torvalds 		tp->accept_queue_tail = NULL;
1928*1da177e4SLinus Torvalds 
1929*1da177e4SLinus Torvalds 	newsk = req->sk;
1930*1da177e4SLinus Torvalds 	sk_acceptq_removed(sk);
1931*1da177e4SLinus Torvalds 	tcp_openreq_fastfree(req);
1932*1da177e4SLinus Torvalds 	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1933*1da177e4SLinus Torvalds 	release_sock(sk);
1934*1da177e4SLinus Torvalds 	return newsk;
1935*1da177e4SLinus Torvalds 
1936*1da177e4SLinus Torvalds out:
1937*1da177e4SLinus Torvalds 	release_sock(sk);
1938*1da177e4SLinus Torvalds 	*err = error;
1939*1da177e4SLinus Torvalds 	return NULL;
1940*1da177e4SLinus Torvalds }
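
/*
 * Editorial note: the blocking behaviour above as seen from
 * userspace, sketched. On a non-blocking listener the timeout is
 * zero, so an empty accept_queue surfaces as EAGAIN; otherwise the
 * caller sleeps in wait_for_connect() until a connection is queued
 * or a signal arrives. Illustration only.
 */
#if 0	/* example only, never compiled */
#include <errno.h>
#include <sys/socket.h>

static int accept_or_retry(int listener)
{
	int fd = accept(listener, NULL, NULL);

	if (fd < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
		return -1;	/* nothing queued yet; poll and retry */
	return fd;		/* new connection, or a fatal error */
}
#endif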
1941*1da177e4SLinus Torvalds 
1942*1da177e4SLinus Torvalds /*
1943*1da177e4SLinus Torvalds  *	Socket option code for TCP.
1944*1da177e4SLinus Torvalds  */
1945*1da177e4SLinus Torvalds int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1946*1da177e4SLinus Torvalds 		   int optlen)
1947*1da177e4SLinus Torvalds {
1948*1da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
1949*1da177e4SLinus Torvalds 	int val;
1950*1da177e4SLinus Torvalds 	int err = 0;
1951*1da177e4SLinus Torvalds 
1952*1da177e4SLinus Torvalds 	if (level != SOL_TCP)
1953*1da177e4SLinus Torvalds 		return tp->af_specific->setsockopt(sk, level, optname,
1954*1da177e4SLinus Torvalds 						   optval, optlen);
1955*1da177e4SLinus Torvalds 
1956*1da177e4SLinus Torvalds 	if (optlen < sizeof(int))
1957*1da177e4SLinus Torvalds 		return -EINVAL;
1958*1da177e4SLinus Torvalds 
1959*1da177e4SLinus Torvalds 	if (get_user(val, (int __user *)optval))
1960*1da177e4SLinus Torvalds 		return -EFAULT;
1961*1da177e4SLinus Torvalds 
1962*1da177e4SLinus Torvalds 	lock_sock(sk);
1963*1da177e4SLinus Torvalds 
1964*1da177e4SLinus Torvalds 	switch (optname) {
1965*1da177e4SLinus Torvalds 	case TCP_MAXSEG:
1966*1da177e4SLinus Torvalds 		/* Values greater than the interface MTU won't take effect.
1967*1da177e4SLinus Torvalds 		 * However, at the point when this call is made we typically
1968*1da177e4SLinus Torvalds 		 * don't yet know which interface is going to be used. */
1969*1da177e4SLinus Torvalds 		if (val < 8 || val > MAX_TCP_WINDOW) {
1970*1da177e4SLinus Torvalds 			err = -EINVAL;
1971*1da177e4SLinus Torvalds 			break;
1972*1da177e4SLinus Torvalds 		}
1973*1da177e4SLinus Torvalds 		tp->rx_opt.user_mss = val;
1974*1da177e4SLinus Torvalds 		break;
1975*1da177e4SLinus Torvalds 
1976*1da177e4SLinus Torvalds 	case TCP_NODELAY:
1977*1da177e4SLinus Torvalds 		if (val) {
1978*1da177e4SLinus Torvalds 			/* TCP_NODELAY is weaker than TCP_CORK, so that
1979*1da177e4SLinus Torvalds 			 * this option on corked socket is remembered, but
1980*1da177e4SLinus Torvalds 			 * it is not activated until cork is cleared.
1981*1da177e4SLinus Torvalds 			 *
1982*1da177e4SLinus Torvalds 			 * However, when TCP_NODELAY is set we make
1983*1da177e4SLinus Torvalds 			 * an explicit push, which overrides even TCP_CORK
1984*1da177e4SLinus Torvalds 			 * for currently queued segments.
1985*1da177e4SLinus Torvalds 			 */
1986*1da177e4SLinus Torvalds 			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1987*1da177e4SLinus Torvalds 			tcp_push_pending_frames(sk, tp);
1988*1da177e4SLinus Torvalds 		} else {
1989*1da177e4SLinus Torvalds 			tp->nonagle &= ~TCP_NAGLE_OFF;
1990*1da177e4SLinus Torvalds 		}
1991*1da177e4SLinus Torvalds 		break;
1992*1da177e4SLinus Torvalds 
1993*1da177e4SLinus Torvalds 	case TCP_CORK:
1994*1da177e4SLinus Torvalds 		/* When set, this indicates that non-full frames should always
1995*1da177e4SLinus Torvalds 		 * be queued. Later the user clears this option and we transmit
1996*1da177e4SLinus Torvalds 		 * any pending partial frames in the queue.  This is meant to
1997*1da177e4SLinus Torvalds 		 * be used alongside sendfile() to get properly filled frames
1998*1da177e4SLinus Torvalds 		 * when the user (for example) must write out headers with a
1999*1da177e4SLinus Torvalds 		 * write() call first and then use sendfile() to send out the
2000*1da177e4SLinus Torvalds 		 * data parts (see the usage sketch after this function).
2001*1da177e4SLinus Torvalds 		 *
2002*1da177e4SLinus Torvalds 		 * TCP_CORK can be set together with TCP_NODELAY and it is
2003*1da177e4SLinus Torvalds 		 * stronger than TCP_NODELAY.
2004*1da177e4SLinus Torvalds 		 */
2005*1da177e4SLinus Torvalds 		if (val) {
2006*1da177e4SLinus Torvalds 			tp->nonagle |= TCP_NAGLE_CORK;
2007*1da177e4SLinus Torvalds 		} else {
2008*1da177e4SLinus Torvalds 			tp->nonagle &= ~TCP_NAGLE_CORK;
2009*1da177e4SLinus Torvalds 			if (tp->nonagle&TCP_NAGLE_OFF)
2010*1da177e4SLinus Torvalds 				tp->nonagle |= TCP_NAGLE_PUSH;
2011*1da177e4SLinus Torvalds 			tcp_push_pending_frames(sk, tp);
2012*1da177e4SLinus Torvalds 		}
2013*1da177e4SLinus Torvalds 		break;
2014*1da177e4SLinus Torvalds 
2015*1da177e4SLinus Torvalds 	case TCP_KEEPIDLE:
2016*1da177e4SLinus Torvalds 		if (val < 1 || val > MAX_TCP_KEEPIDLE)
2017*1da177e4SLinus Torvalds 			err = -EINVAL;
2018*1da177e4SLinus Torvalds 		else {
2019*1da177e4SLinus Torvalds 			tp->keepalive_time = val * HZ;
2020*1da177e4SLinus Torvalds 			if (sock_flag(sk, SOCK_KEEPOPEN) &&
2021*1da177e4SLinus Torvalds 			    !((1 << sk->sk_state) &
2022*1da177e4SLinus Torvalds 			      (TCPF_CLOSE | TCPF_LISTEN))) {
2023*1da177e4SLinus Torvalds 				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2024*1da177e4SLinus Torvalds 				if (tp->keepalive_time > elapsed)
2025*1da177e4SLinus Torvalds 					elapsed = tp->keepalive_time - elapsed;
2026*1da177e4SLinus Torvalds 				else
2027*1da177e4SLinus Torvalds 					elapsed = 0;
2028*1da177e4SLinus Torvalds 				tcp_reset_keepalive_timer(sk, elapsed);
2029*1da177e4SLinus Torvalds 			}
2030*1da177e4SLinus Torvalds 		}
2031*1da177e4SLinus Torvalds 		break;
2032*1da177e4SLinus Torvalds 	case TCP_KEEPINTVL:
2033*1da177e4SLinus Torvalds 		if (val < 1 || val > MAX_TCP_KEEPINTVL)
2034*1da177e4SLinus Torvalds 			err = -EINVAL;
2035*1da177e4SLinus Torvalds 		else
2036*1da177e4SLinus Torvalds 			tp->keepalive_intvl = val * HZ;
2037*1da177e4SLinus Torvalds 		break;
2038*1da177e4SLinus Torvalds 	case TCP_KEEPCNT:
2039*1da177e4SLinus Torvalds 		if (val < 1 || val > MAX_TCP_KEEPCNT)
2040*1da177e4SLinus Torvalds 			err = -EINVAL;
2041*1da177e4SLinus Torvalds 		else
2042*1da177e4SLinus Torvalds 			tp->keepalive_probes = val;
2043*1da177e4SLinus Torvalds 		break;
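		/* Illustrative userspace sketch (not part of this file):
		 * the three knobs above take effect once SO_KEEPALIVE is
		 * enabled.  With idle = 60, intvl = 10, cnt = 5, a dead peer
		 * is declared after roughly 60 + 5 * 10 = 110 seconds of
		 * silence.
		 *
		 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
		 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
		 *	setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
		 *	setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
		 *	setsockopt(fd, SOL_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
		 */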
2044*1da177e4SLinus Torvalds 	case TCP_SYNCNT:
2045*1da177e4SLinus Torvalds 		if (val < 1 || val > MAX_TCP_SYNCNT)
2046*1da177e4SLinus Torvalds 			err = -EINVAL;
2047*1da177e4SLinus Torvalds 		else
2048*1da177e4SLinus Torvalds 			tp->syn_retries = val;
2049*1da177e4SLinus Torvalds 		break;
2050*1da177e4SLinus Torvalds 
2051*1da177e4SLinus Torvalds 	case TCP_LINGER2:
2052*1da177e4SLinus Torvalds 		if (val < 0)
2053*1da177e4SLinus Torvalds 			tp->linger2 = -1;
2054*1da177e4SLinus Torvalds 		else if (val > sysctl_tcp_fin_timeout / HZ)
2055*1da177e4SLinus Torvalds 			tp->linger2 = 0;
2056*1da177e4SLinus Torvalds 		else
2057*1da177e4SLinus Torvalds 			tp->linger2 = val * HZ;
2058*1da177e4SLinus Torvalds 		break;
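		/* Worked mapping for the branches above, assuming a default
		 * sysctl_tcp_fin_timeout of 60 * HZ: a negative val stores -1
		 * (skip the FIN_WAIT2 wait entirely), val = 120 exceeds the
		 * sysctl and stores 0 (fall back to the sysctl default), and
		 * val = 30 stores 30 * HZ (wait 30 seconds in FIN_WAIT2).
		 */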
2059*1da177e4SLinus Torvalds 
2060*1da177e4SLinus Torvalds 	case TCP_DEFER_ACCEPT:
2061*1da177e4SLinus Torvalds 		tp->defer_accept = 0;
2062*1da177e4SLinus Torvalds 		if (val > 0) {
2063*1da177e4SLinus Torvalds 			/* Translate the value in seconds to a number of
2064*1da177e4SLinus Torvalds 			 * retransmits */
2065*1da177e4SLinus Torvalds 			while (tp->defer_accept < 32 &&
2066*1da177e4SLinus Torvalds 			       val > ((TCP_TIMEOUT_INIT / HZ) <<
2067*1da177e4SLinus Torvalds 				       tp->defer_accept))
2068*1da177e4SLinus Torvalds 				tp->defer_accept++;
2069*1da177e4SLinus Torvalds 			tp->defer_accept++;
2070*1da177e4SLinus Torvalds 		}
2071*1da177e4SLinus Torvalds 		break;
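		/* Worked example of the translation above, assuming
		 * TCP_TIMEOUT_INIT of 3 * HZ: val = 10 seconds is compared
		 * against 3, 6 and 12; the loop stops at defer_accept = 2
		 * (since 10 <= 12) and the final increment stores 3.
		 * getsockopt() maps this back as 3 << (3 - 1) = 12 seconds,
		 * the next retransmit boundary at or above the request.
		 */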
2072*1da177e4SLinus Torvalds 
2073*1da177e4SLinus Torvalds 	case TCP_WINDOW_CLAMP:
2074*1da177e4SLinus Torvalds 		if (!val) {
2075*1da177e4SLinus Torvalds 			if (sk->sk_state != TCP_CLOSE) {
2076*1da177e4SLinus Torvalds 				err = -EINVAL;
2077*1da177e4SLinus Torvalds 				break;
2078*1da177e4SLinus Torvalds 			}
2079*1da177e4SLinus Torvalds 			tp->window_clamp = 0;
2080*1da177e4SLinus Torvalds 		} else
2081*1da177e4SLinus Torvalds 			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2082*1da177e4SLinus Torvalds 						SOCK_MIN_RCVBUF / 2 : val;
2083*1da177e4SLinus Torvalds 		break;
2084*1da177e4SLinus Torvalds 
2085*1da177e4SLinus Torvalds 	case TCP_QUICKACK:
2086*1da177e4SLinus Torvalds 		if (!val) {
2087*1da177e4SLinus Torvalds 			tp->ack.pingpong = 1;
2088*1da177e4SLinus Torvalds 		} else {
2089*1da177e4SLinus Torvalds 			tp->ack.pingpong = 0;
2090*1da177e4SLinus Torvalds 			if ((1 << sk->sk_state) &
2091*1da177e4SLinus Torvalds 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2092*1da177e4SLinus Torvalds 			    tcp_ack_scheduled(tp)) {
2093*1da177e4SLinus Torvalds 				tp->ack.pending |= TCP_ACK_PUSHED;
2094*1da177e4SLinus Torvalds 				cleanup_rbuf(sk, 1);
2095*1da177e4SLinus Torvalds 				if (!(val & 1))
2096*1da177e4SLinus Torvalds 					tp->ack.pingpong = 1;
2097*1da177e4SLinus Torvalds 			}
2098*1da177e4SLinus Torvalds 		}
2099*1da177e4SLinus Torvalds 		break;
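		/* Note on the val & 1 test above: an odd val (e.g. 1) leaves
		 * quickack mode enabled, while an even nonzero val (e.g. 2)
		 * forces out any already-scheduled ACK and then re-enters
		 * pingpong (delayed-ACK) mode, making the option one-shot.
		 *
		 * Illustrative userspace sketch (not part of this file):
		 *
		 *	int one = 1;
		 *	setsockopt(fd, SOL_TCP, TCP_QUICKACK, &one, sizeof(one));
		 */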
2100*1da177e4SLinus Torvalds 
2101*1da177e4SLinus Torvalds 	default:
2102*1da177e4SLinus Torvalds 		err = -ENOPROTOOPT;
2103*1da177e4SLinus Torvalds 		break;
2104*1da177e4SLinus Torvalds 	}
2105*1da177e4SLinus Torvalds 	release_sock(sk);
2106*1da177e4SLinus Torvalds 	return err;
2107*1da177e4SLinus Torvalds }
2108*1da177e4SLinus Torvalds 
2109*1da177e4SLinus Torvalds /* Return information about the state of the TCP endpoint in API format. */
2110*1da177e4SLinus Torvalds void tcp_get_info(struct sock *sk, struct tcp_info *info)
2111*1da177e4SLinus Torvalds {
2112*1da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
2113*1da177e4SLinus Torvalds 	u32 now = tcp_time_stamp;
2114*1da177e4SLinus Torvalds 
2115*1da177e4SLinus Torvalds 	memset(info, 0, sizeof(*info));
2116*1da177e4SLinus Torvalds 
2117*1da177e4SLinus Torvalds 	info->tcpi_state = sk->sk_state;
2118*1da177e4SLinus Torvalds 	info->tcpi_ca_state = tp->ca_state;
2119*1da177e4SLinus Torvalds 	info->tcpi_retransmits = tp->retransmits;
2120*1da177e4SLinus Torvalds 	info->tcpi_probes = tp->probes_out;
2121*1da177e4SLinus Torvalds 	info->tcpi_backoff = tp->backoff;
2122*1da177e4SLinus Torvalds 
2123*1da177e4SLinus Torvalds 	if (tp->rx_opt.tstamp_ok)
2124*1da177e4SLinus Torvalds 		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2125*1da177e4SLinus Torvalds 	if (tp->rx_opt.sack_ok)
2126*1da177e4SLinus Torvalds 		info->tcpi_options |= TCPI_OPT_SACK;
2127*1da177e4SLinus Torvalds 	if (tp->rx_opt.wscale_ok) {
2128*1da177e4SLinus Torvalds 		info->tcpi_options |= TCPI_OPT_WSCALE;
2129*1da177e4SLinus Torvalds 		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2130*1da177e4SLinus Torvalds 		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2131*1da177e4SLinus Torvalds 	}
2132*1da177e4SLinus Torvalds 
2133*1da177e4SLinus Torvalds 	if (tp->ecn_flags & TCP_ECN_OK)
2134*1da177e4SLinus Torvalds 		info->tcpi_options |= TCPI_OPT_ECN;
2135*1da177e4SLinus Torvalds 
2136*1da177e4SLinus Torvalds 	info->tcpi_rto = jiffies_to_usecs(tp->rto);
2137*1da177e4SLinus Torvalds 	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2138*1da177e4SLinus Torvalds 	info->tcpi_snd_mss = tp->mss_cache_std;
2139*1da177e4SLinus Torvalds 	info->tcpi_rcv_mss = tp->ack.rcv_mss;
2140*1da177e4SLinus Torvalds 
2141*1da177e4SLinus Torvalds 	info->tcpi_unacked = tp->packets_out;
2142*1da177e4SLinus Torvalds 	info->tcpi_sacked = tp->sacked_out;
2143*1da177e4SLinus Torvalds 	info->tcpi_lost = tp->lost_out;
2144*1da177e4SLinus Torvalds 	info->tcpi_retrans = tp->retrans_out;
2145*1da177e4SLinus Torvalds 	info->tcpi_fackets = tp->fackets_out;
2146*1da177e4SLinus Torvalds 
2147*1da177e4SLinus Torvalds 	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2148*1da177e4SLinus Torvalds 	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2149*1da177e4SLinus Torvalds 	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2150*1da177e4SLinus Torvalds 
2151*1da177e4SLinus Torvalds 	info->tcpi_pmtu = tp->pmtu_cookie;
2152*1da177e4SLinus Torvalds 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2153*1da177e4SLinus Torvalds 	info->tcpi_rtt = jiffies_to_usecs(tp->srtt) >> 3;
2154*1da177e4SLinus Torvalds 	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev) >> 2;
2155*1da177e4SLinus Torvalds 	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2156*1da177e4SLinus Torvalds 	info->tcpi_snd_cwnd = tp->snd_cwnd;
2157*1da177e4SLinus Torvalds 	info->tcpi_advmss = tp->advmss;
2158*1da177e4SLinus Torvalds 	info->tcpi_reordering = tp->reordering;
2159*1da177e4SLinus Torvalds 
2160*1da177e4SLinus Torvalds 	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt) >> 3;
2161*1da177e4SLinus Torvalds 	info->tcpi_rcv_space = tp->rcvq_space.space;
2162*1da177e4SLinus Torvalds 
2163*1da177e4SLinus Torvalds 	info->tcpi_total_retrans = tp->total_retrans;
2164*1da177e4SLinus Torvalds }
2165*1da177e4SLinus Torvalds 
2166*1da177e4SLinus Torvalds EXPORT_SYMBOL_GPL(tcp_get_info);
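/* Illustrative userspace sketch (not part of this file): reading the
 * structure filled in above via getsockopt(TCP_INFO), using the tcp_info
 * layout exported to userspace.  Note that tcpi_rtt and tcpi_rttvar are
 * already descaled here (srtt is kept << 3 and mdev << 2 internally) and
 * reported in microseconds.
 *
 *	struct tcp_info info;
 *	socklen_t len = sizeof(info);
 *	if (getsockopt(fd, SOL_TCP, TCP_INFO, &info, &len) == 0)
 *		printf("rtt %u us, cwnd %u\n",
 *		       info.tcpi_rtt, info.tcpi_snd_cwnd);
 */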
2167*1da177e4SLinus Torvalds 
2168*1da177e4SLinus Torvalds int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2169*1da177e4SLinus Torvalds 		   int __user *optlen)
2170*1da177e4SLinus Torvalds {
2171*1da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
2172*1da177e4SLinus Torvalds 	int val, len;
2173*1da177e4SLinus Torvalds 
2174*1da177e4SLinus Torvalds 	if (level != SOL_TCP)
2175*1da177e4SLinus Torvalds 		return tp->af_specific->getsockopt(sk, level, optname,
2176*1da177e4SLinus Torvalds 						   optval, optlen);
2177*1da177e4SLinus Torvalds 
2178*1da177e4SLinus Torvalds 	if (get_user(len, optlen))
2179*1da177e4SLinus Torvalds 		return -EFAULT;
2180*1da177e4SLinus Torvalds 
2181*1da177e4SLinus Torvalds 	len = min_t(unsigned int, len, sizeof(int));
2182*1da177e4SLinus Torvalds 
2183*1da177e4SLinus Torvalds 	if (len < 0)
2184*1da177e4SLinus Torvalds 		return -EINVAL;
2185*1da177e4SLinus Torvalds 
2186*1da177e4SLinus Torvalds 	switch (optname) {
2187*1da177e4SLinus Torvalds 	case TCP_MAXSEG:
2188*1da177e4SLinus Torvalds 		val = tp->mss_cache_std;
2189*1da177e4SLinus Torvalds 		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2190*1da177e4SLinus Torvalds 			val = tp->rx_opt.user_mss;
2191*1da177e4SLinus Torvalds 		break;
2192*1da177e4SLinus Torvalds 	case TCP_NODELAY:
2193*1da177e4SLinus Torvalds 		val = !!(tp->nonagle & TCP_NAGLE_OFF);
2194*1da177e4SLinus Torvalds 		break;
2195*1da177e4SLinus Torvalds 	case TCP_CORK:
2196*1da177e4SLinus Torvalds 		val = !!(tp->nonagle & TCP_NAGLE_CORK);
2197*1da177e4SLinus Torvalds 		break;
2198*1da177e4SLinus Torvalds 	case TCP_KEEPIDLE:
2199*1da177e4SLinus Torvalds 		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2200*1da177e4SLinus Torvalds 		break;
2201*1da177e4SLinus Torvalds 	case TCP_KEEPINTVL:
2202*1da177e4SLinus Torvalds 		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2203*1da177e4SLinus Torvalds 		break;
2204*1da177e4SLinus Torvalds 	case TCP_KEEPCNT:
2205*1da177e4SLinus Torvalds 		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2206*1da177e4SLinus Torvalds 		break;
2207*1da177e4SLinus Torvalds 	case TCP_SYNCNT:
2208*1da177e4SLinus Torvalds 		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2209*1da177e4SLinus Torvalds 		break;
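	/* The "x ? : y" form in the cases above is the GCC
	 * omitted-middle-operand extension: it evaluates to x when x is
	 * nonzero and to y otherwise, so a zero (unset) per-socket value
	 * falls back to the system-wide sysctl default.
	 */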
2210*1da177e4SLinus Torvalds 	case TCP_LINGER2:
2211*1da177e4SLinus Torvalds 		val = tp->linger2;
2212*1da177e4SLinus Torvalds 		if (val >= 0)
2213*1da177e4SLinus Torvalds 			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2214*1da177e4SLinus Torvalds 		break;
2215*1da177e4SLinus Torvalds 	case TCP_DEFER_ACCEPT:
2216*1da177e4SLinus Torvalds 		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2217*1da177e4SLinus Torvalds 					       (tp->defer_accept - 1));
2218*1da177e4SLinus Torvalds 		break;
2219*1da177e4SLinus Torvalds 	case TCP_WINDOW_CLAMP:
2220*1da177e4SLinus Torvalds 		val = tp->window_clamp;
2221*1da177e4SLinus Torvalds 		break;
2222*1da177e4SLinus Torvalds 	case TCP_INFO: {
2223*1da177e4SLinus Torvalds 		struct tcp_info info;
2224*1da177e4SLinus Torvalds 
2225*1da177e4SLinus Torvalds 		if (get_user(len, optlen))
2226*1da177e4SLinus Torvalds 			return -EFAULT;
2227*1da177e4SLinus Torvalds 
2228*1da177e4SLinus Torvalds 		tcp_get_info(sk, &info);
2229*1da177e4SLinus Torvalds 
2230*1da177e4SLinus Torvalds 		len = min_t(unsigned int, len, sizeof(info));
2231*1da177e4SLinus Torvalds 		if (put_user(len, optlen))
2232*1da177e4SLinus Torvalds 			return -EFAULT;
2233*1da177e4SLinus Torvalds 		if (copy_to_user(optval, &info, len))
2234*1da177e4SLinus Torvalds 			return -EFAULT;
2235*1da177e4SLinus Torvalds 		return 0;
2236*1da177e4SLinus Torvalds 	}
2237*1da177e4SLinus Torvalds 	case TCP_QUICKACK:
2238*1da177e4SLinus Torvalds 		val = !tp->ack.pingpong;
2239*1da177e4SLinus Torvalds 		break;
2240*1da177e4SLinus Torvalds 	default:
2241*1da177e4SLinus Torvalds 		return -ENOPROTOOPT;
2242*1da177e4SLinus Torvalds 	}
2243*1da177e4SLinus Torvalds 
2244*1da177e4SLinus Torvalds 	if (put_user(len, optlen))
2245*1da177e4SLinus Torvalds 		return -EFAULT;
2246*1da177e4SLinus Torvalds 	if (copy_to_user(optval, &val, len))
2247*1da177e4SLinus Torvalds 		return -EFAULT;
2248*1da177e4SLinus Torvalds 	return 0;
2249*1da177e4SLinus Torvalds }
2250*1da177e4SLinus Torvalds 
2251*1da177e4SLinus Torvalds 
2252*1da177e4SLinus Torvalds extern void __skb_cb_too_small_for_tcp(int, int);
2253*1da177e4SLinus Torvalds extern void tcpdiag_init(void);
2254*1da177e4SLinus Torvalds 
2255*1da177e4SLinus Torvalds static __initdata unsigned long thash_entries;
2256*1da177e4SLinus Torvalds static int __init set_thash_entries(char *str)
2257*1da177e4SLinus Torvalds {
2258*1da177e4SLinus Torvalds 	if (!str)
2259*1da177e4SLinus Torvalds 		return 0;
2260*1da177e4SLinus Torvalds 	thash_entries = simple_strtoul(str, &str, 0);
2261*1da177e4SLinus Torvalds 	return 1;
2262*1da177e4SLinus Torvalds }
2263*1da177e4SLinus Torvalds __setup("thash_entries=", set_thash_entries);
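/* Illustrative usage (not part of this file): the established-hash size can
 * be pinned from the kernel command line, e.g.
 *
 *	thash_entries=131072
 *
 * Leaving it at 0 (the default) lets alloc_large_system_hash() size the
 * table from available memory instead.
 */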
2264*1da177e4SLinus Torvalds 
2265*1da177e4SLinus Torvalds void __init tcp_init(void)
2266*1da177e4SLinus Torvalds {
2267*1da177e4SLinus Torvalds 	struct sk_buff *skb = NULL;
2268*1da177e4SLinus Torvalds 	int order, i;
2269*1da177e4SLinus Torvalds 
2270*1da177e4SLinus Torvalds 	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2271*1da177e4SLinus Torvalds 		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2272*1da177e4SLinus Torvalds 					   sizeof(skb->cb));
2273*1da177e4SLinus Torvalds 
2274*1da177e4SLinus Torvalds 	tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2275*1da177e4SLinus Torvalds 						   sizeof(struct open_request),
2276*1da177e4SLinus Torvalds 					       0, SLAB_HWCACHE_ALIGN,
2277*1da177e4SLinus Torvalds 					       NULL, NULL);
2278*1da177e4SLinus Torvalds 	if (!tcp_openreq_cachep)
2279*1da177e4SLinus Torvalds 		panic("tcp_init: Cannot alloc open_request cache.");
2280*1da177e4SLinus Torvalds 
2281*1da177e4SLinus Torvalds 	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2282*1da177e4SLinus Torvalds 					      sizeof(struct tcp_bind_bucket),
2283*1da177e4SLinus Torvalds 					      0, SLAB_HWCACHE_ALIGN,
2284*1da177e4SLinus Torvalds 					      NULL, NULL);
2285*1da177e4SLinus Torvalds 	if (!tcp_bucket_cachep)
2286*1da177e4SLinus Torvalds 		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2287*1da177e4SLinus Torvalds 
2288*1da177e4SLinus Torvalds 	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2289*1da177e4SLinus Torvalds 						sizeof(struct tcp_tw_bucket),
2290*1da177e4SLinus Torvalds 						0, SLAB_HWCACHE_ALIGN,
2291*1da177e4SLinus Torvalds 						NULL, NULL);
2292*1da177e4SLinus Torvalds 	if (!tcp_timewait_cachep)
2293*1da177e4SLinus Torvalds 		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2294*1da177e4SLinus Torvalds 
2295*1da177e4SLinus Torvalds 	/* Size and allocate the main established and bind bucket
2296*1da177e4SLinus Torvalds 	 * hash tables.
2297*1da177e4SLinus Torvalds 	 *
2298*1da177e4SLinus Torvalds 	 * The methodology is similar to that of the buffer cache.
2299*1da177e4SLinus Torvalds 	 */
2300*1da177e4SLinus Torvalds 	tcp_ehash = (struct tcp_ehash_bucket *)
2301*1da177e4SLinus Torvalds 		alloc_large_system_hash("TCP established",
2302*1da177e4SLinus Torvalds 					sizeof(struct tcp_ehash_bucket),
2303*1da177e4SLinus Torvalds 					thash_entries,
2304*1da177e4SLinus Torvalds 					(num_physpages >= 128 * 1024) ?
2305*1da177e4SLinus Torvalds 						(25 - PAGE_SHIFT) :
2306*1da177e4SLinus Torvalds 						(27 - PAGE_SHIFT),
2307*1da177e4SLinus Torvalds 					HASH_HIGHMEM,
2308*1da177e4SLinus Torvalds 					&tcp_ehash_size,
2309*1da177e4SLinus Torvalds 					NULL,
2310*1da177e4SLinus Torvalds 					0);
2311*1da177e4SLinus Torvalds 	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2312*1da177e4SLinus Torvalds 	for (i = 0; i < (tcp_ehash_size << 1); i++) {
2313*1da177e4SLinus Torvalds 		rwlock_init(&tcp_ehash[i].lock);
2314*1da177e4SLinus Torvalds 		INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2315*1da177e4SLinus Torvalds 	}
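	/* Worked sizing example for the table above, assuming 4 KiB pages
	 * (PAGE_SHIFT = 12) and at least 512 MiB of memory
	 * (num_physpages >= 128 * 1024): the scale argument is 25 - 12 = 13,
	 * i.e. roughly one entry per 2^13 = 8 KiB of kernel memory.
	 * alloc_large_system_hash() stores log2 of the entry count in
	 * tcp_ehash_size; the shift-and-halve above converts that back to
	 * an entry count and splits the table in two, the second half being
	 * used for TIME_WAIT sockets.
	 */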
2316*1da177e4SLinus Torvalds 
2317*1da177e4SLinus Torvalds 	tcp_bhash = (struct tcp_bind_hashbucket *)
2318*1da177e4SLinus Torvalds 		alloc_large_system_hash("TCP bind",
2319*1da177e4SLinus Torvalds 					sizeof(struct tcp_bind_hashbucket),
2320*1da177e4SLinus Torvalds 					tcp_ehash_size,
2321*1da177e4SLinus Torvalds 					(num_physpages >= 128 * 1024) ?
2322*1da177e4SLinus Torvalds 						(25 - PAGE_SHIFT) :
2323*1da177e4SLinus Torvalds 						(27 - PAGE_SHIFT),
2324*1da177e4SLinus Torvalds 					HASH_HIGHMEM,
2325*1da177e4SLinus Torvalds 					&tcp_bhash_size,
2326*1da177e4SLinus Torvalds 					NULL,
2327*1da177e4SLinus Torvalds 					64 * 1024);
2328*1da177e4SLinus Torvalds 	tcp_bhash_size = 1 << tcp_bhash_size;
2329*1da177e4SLinus Torvalds 	for (i = 0; i < tcp_bhash_size; i++) {
2330*1da177e4SLinus Torvalds 		spin_lock_init(&tcp_bhash[i].lock);
2331*1da177e4SLinus Torvalds 		INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2332*1da177e4SLinus Torvalds 	}
2333*1da177e4SLinus Torvalds 
2334*1da177e4SLinus Torvalds 	/* Try to be a bit smarter and adjust defaults depending
2335*1da177e4SLinus Torvalds 	 * on available memory.
2336*1da177e4SLinus Torvalds 	 */
2337*1da177e4SLinus Torvalds 	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2338*1da177e4SLinus Torvalds 			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2339*1da177e4SLinus Torvalds 			order++)
2340*1da177e4SLinus Torvalds 		;
2341*1da177e4SLinus Torvalds 	if (order > 4) {
2342*1da177e4SLinus Torvalds 		sysctl_local_port_range[0] = 32768;
2343*1da177e4SLinus Torvalds 		sysctl_local_port_range[1] = 61000;
2344*1da177e4SLinus Torvalds 		sysctl_tcp_max_tw_buckets = 180000;
2345*1da177e4SLinus Torvalds 		sysctl_tcp_max_orphans = 4096 << (order - 4);
2346*1da177e4SLinus Torvalds 		sysctl_max_syn_backlog = 1024;
2347*1da177e4SLinus Torvalds 	} else if (order < 3) {
2348*1da177e4SLinus Torvalds 		sysctl_local_port_range[0] = 1024 * (3 - order);
2349*1da177e4SLinus Torvalds 		sysctl_tcp_max_tw_buckets >>= (3 - order);
2350*1da177e4SLinus Torvalds 		sysctl_tcp_max_orphans >>= (3 - order);
2351*1da177e4SLinus Torvalds 		sysctl_max_syn_backlog = 128;
2352*1da177e4SLinus Torvalds 	}
2353*1da177e4SLinus Torvalds 	tcp_port_rover = sysctl_local_port_range[0] - 1;
2354*1da177e4SLinus Torvalds 
2355*1da177e4SLinus Torvalds 	sysctl_tcp_mem[0] =  768 << order;
2356*1da177e4SLinus Torvalds 	sysctl_tcp_mem[1] = 1024 << order;
2357*1da177e4SLinus Torvalds 	sysctl_tcp_mem[2] = 1536 << order;
2358*1da177e4SLinus Torvalds 
2359*1da177e4SLinus Torvalds 	if (order < 3) {
2360*1da177e4SLinus Torvalds 		sysctl_tcp_wmem[2] = 64 * 1024;
2361*1da177e4SLinus Torvalds 		sysctl_tcp_rmem[0] = PAGE_SIZE;
2362*1da177e4SLinus Torvalds 		sysctl_tcp_rmem[1] = 43689;
2363*1da177e4SLinus Torvalds 		sysctl_tcp_rmem[2] = 2 * 43689;
2364*1da177e4SLinus Torvalds 	}
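	/* Worked example of the order-based tuning above: with 4 KiB pages
	 * and, say, 65536 bind buckets of 8 bytes each (512 KiB), the loop
	 * settles on order = 7.  Since order > 4, the ephemeral port range
	 * widens to 32768-61000 and the TIME_WAIT and orphan limits scale
	 * up; on small machines (order < 3) the same knobs shrink instead,
	 * and tcp_rmem[1] = 43689 bytes keeps three default receive buffers
	 * just under 128 KiB (3 * 43689 = 131067).
	 */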
2365*1da177e4SLinus Torvalds 
2366*1da177e4SLinus Torvalds 	printk(KERN_INFO "TCP: Hash tables configured "
2367*1da177e4SLinus Torvalds 	       "(established %d bind %d)\n",
2368*1da177e4SLinus Torvalds 	       tcp_ehash_size << 1, tcp_bhash_size);
2369*1da177e4SLinus Torvalds }
2370*1da177e4SLinus Torvalds 
2371*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_accept);
2372*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_close);
2373*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_destroy_sock);
2374*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_disconnect);
2375*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_getsockopt);
2376*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_ioctl);
2377*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_openreq_cachep);
2378*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_poll);
2379*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_read_sock);
2380*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_recvmsg);
2381*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_sendmsg);
2382*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_sendpage);
2383*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_setsockopt);
2384*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_shutdown);
2385*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_statistics);
2386*1da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_timewait_cachep);
2387