xref: /linux/net/ipv4/tcp.c (revision 1fc31357ad194fb98691f3d122bcd47e59239e83)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
11  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
14  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
15  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
16  *		Matthew Dillon, <dillon@apollo.west.oic.com>
17  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18  *		Jorge Cwik, <jorge@laser.satlink.net>
19  *
20  * Fixes:
21  *		Alan Cox	:	Numerous verify_area() calls
22  *		Alan Cox	:	Set the ACK bit on a reset
23  *		Alan Cox	:	Stopped it crashing if it closed while
24  *					sk->inuse=1 and was trying to connect
25  *					(tcp_err()).
26  *		Alan Cox	:	All icmp error handling was broken
27  *					pointers passed were wrong and the
28  *					socket was looked up backwards. Nobody
29  *					tested any icmp error code obviously.
30  *		Alan Cox	:	tcp_err() now handled properly. It
31  *					wakes people on errors. poll
32  *					behaves and the icmp error race
33  *					has gone by moving it into sock.c
34  *		Alan Cox	:	tcp_send_reset() fixed to work for
35  *					everything not just packets for
36  *					unknown sockets.
37  *		Alan Cox	:	tcp option processing.
38  *		Alan Cox	:	Reset tweaked (still not 100%) [Had
39  *					syn rule wrong]
40  *		Herp Rosmanith  :	More reset fixes
41  *		Alan Cox	:	No longer acks invalid rst frames.
42  *					Acking any kind of RST is right out.
43  *		Alan Cox	:	Sets an ignore me flag on an rst
44  *					receive otherwise odd bits of prattle
45  *					escape still
46  *		Alan Cox	:	Fixed another acking RST frame bug.
47  *					Should stop LAN workplace lockups.
48  *		Alan Cox	: 	Some tidyups using the new skb list
49  *					facilities
50  *		Alan Cox	:	sk->keepopen now seems to work
51  *		Alan Cox	:	Pulls options out correctly on accepts
52  *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
53  *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
54  *					bit to skb ops.
55  *		Alan Cox	:	Tidied tcp_data to avoid a potential
56  *					nasty.
57  *		Alan Cox	:	Added some better commenting, as the
58  *					tcp is hard to follow
59  *		Alan Cox	:	Removed incorrect check for 20 * psh
60  *	Michael O'Reilly	:	ack < copied bug fix.
61  *	Johannes Stille		:	Misc tcp fixes (not all in yet).
62  *		Alan Cox	:	FIN with no memory -> CRASH
63  *		Alan Cox	:	Added socket option proto entries.
64  *					Also added awareness of them to accept.
65  *		Alan Cox	:	Added TCP options (SOL_TCP)
66  *		Alan Cox	:	Switched wakeup calls to callbacks,
67  *					so the kernel can layer network
68  *					sockets.
69  *		Alan Cox	:	Use ip_tos/ip_ttl settings.
70  *		Alan Cox	:	Handle FIN (more) properly (we hope).
71  *		Alan Cox	:	RST frames sent on unsynchronised
72  *					state ack error.
73  *		Alan Cox	:	Put in missing check for SYN bit.
74  *		Alan Cox	:	Added tcp_select_window() aka NET2E
75  *					window non shrink trick.
76  *		Alan Cox	:	Added a couple of small NET2E timer
77  *					fixes
78  *		Charles Hedrick :	TCP fixes
79  *		Toomas Tamm	:	TCP window fixes
80  *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
81  *		Charles Hedrick	:	Rewrote most of it to actually work
82  *		Linus		:	Rewrote tcp_read() and URG handling
83  *					completely
84  *		Gerhard Koerting:	Fixed some missing timer handling
85  *		Matthew Dillon  :	Reworked TCP machine states as per RFC
86  *		Gerhard Koerting:	PC/TCP workarounds
87  *		Adam Caldwell	:	Assorted timer/timing errors
88  *		Matthew Dillon	:	Fixed another RST bug
89  *		Alan Cox	:	Move to kernel side addressing changes.
90  *		Alan Cox	:	Beginning work on TCP fastpathing
91  *					(not yet usable)
92  *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
93  *		Alan Cox	:	TCP fast path debugging
94  *		Alan Cox	:	Window clamping
95  *		Michael Riepe	:	Bug in tcp_check()
96  *		Matt Dillon	:	More TCP improvements and RST bug fixes
97  *		Matt Dillon	:	Yet more small nasties removed from the
98  *					TCP code (Be very nice to this man if
99  *					tcp finally works 100%) 8)
100  *		Alan Cox	:	BSD accept semantics.
101  *		Alan Cox	:	Reset on closedown bug.
102  *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
103  *		Michael Pall	:	Handle poll() after URG properly in
104  *					all cases.
105  *		Michael Pall	:	Undo the last fix in tcp_read_urg()
106  *					(multi URG PUSH broke rlogin).
107  *		Michael Pall	:	Fix the multi URG PUSH problem in
108  *					tcp_readable(), poll() after URG
109  *					works now.
110  *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
111  *					BSD api.
112  *		Alan Cox	:	Changed the semantics of sk->socket to
113  *					fix a race and a signal problem with
114  *					accept() and async I/O.
115  *		Alan Cox	:	Relaxed the rules on tcp_sendto().
116  *		Yury Shevchuk	:	Really fixed accept() blocking problem.
117  *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
118  *					clients/servers which listen in on
119  *					fixed ports.
120  *		Alan Cox	:	Cleaned the above up and shrank it to
121  *					a sensible code size.
122  *		Alan Cox	:	Self connect lockup fix.
123  *		Alan Cox	:	No connect to multicast.
124  *		Ross Biro	:	Close unaccepted children on master
125  *					socket close.
126  *		Alan Cox	:	Reset tracing code.
127  *		Alan Cox	:	Spurious resets on shutdown.
128  *		Alan Cox	:	Giant 15 minute/60 second timer error
129  *		Alan Cox	:	Small whoops in polling before an
130  *					accept.
131  *		Alan Cox	:	Kept the state trace facility since
132  *					it's handy for debugging.
133  *		Alan Cox	:	More reset handler fixes.
134  *		Alan Cox	:	Started rewriting the code based on
135  *					the RFC's for other useful protocol
136  *					references see: Comer, KA9Q NOS, and
137  *					for a reference on the difference
138  *					between specifications and how BSD
139  *					works see the 4.4lite source.
140  *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
141  *					close.
142  *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
143  *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
144  *		Alan Cox	:	Reimplemented timers as per the RFC
145  *					and using multiple timers for sanity.
146  *		Alan Cox	:	Small bug fixes, and a lot of new
147  *					comments.
148  *		Alan Cox	:	Fixed dual reader crash by locking
149  *					the buffers (much like datagram.c)
150  *		Alan Cox	:	Fixed stuck sockets in probe. A probe
151  *					now gets fed up of retrying without
152  *					(even a no space) answer.
153  *		Alan Cox	:	Extracted closing code better
154  *		Alan Cox	:	Fixed the closing state machine to
155  *					resemble the RFC.
156  *		Alan Cox	:	More 'per spec' fixes.
157  *		Jorge Cwik	:	Even faster checksumming.
158  *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
159  *					only frames. At least one pc tcp stack
160  *					generates them.
161  *		Alan Cox	:	Cache last socket.
162  *		Alan Cox	:	Per route irtt.
163  *		Matt Day	:	poll()->select() match BSD precisely on error
164  *		Alan Cox	:	New buffers
165  *		Marc Tamsky	:	Various sk->prot->retransmits and
166  *					sk->retransmits misupdating fixed.
167  *					Fixed tcp_write_timeout: stuck close,
168  *					and TCP syn retries gets used now.
169  *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
170  *					ack if state is TCP_CLOSED.
171  *		Alan Cox	:	Look up device on a retransmit - routes may
172  *					change. Doesn't yet cope with MSS shrink right
173  *					but it's a start!
174  *		Marc Tamsky	:	Closing in closing fixes.
175  *		Mike Shaver	:	RFC1122 verifications.
176  *		Alan Cox	:	rcv_saddr errors.
177  *		Alan Cox	:	Block double connect().
178  *		Alan Cox	:	Small hooks for enSKIP.
179  *		Alexey Kuznetsov:	Path MTU discovery.
180  *		Alan Cox	:	Support soft errors.
181  *		Alan Cox	:	Fix MTU discovery pathological case
182  *					when the remote claims no mtu!
183  *		Marc Tamsky	:	TCP_CLOSE fix.
184  *		Colin (G3TNE)	:	Send a reset on syn ack replies in
185  *					window but wrong (fixes NT lpd problems)
186  *		Pedro Roque	:	Better TCP window handling, delayed ack.
187  *		Joerg Reuter	:	No modification of locked buffers in
188  *					tcp_do_retransmit()
189  *		Eric Schenk	:	Changed receiver side silly window
190  *					avoidance algorithm to BSD style
191  *					algorithm. This doubles throughput
192  *					against machines running Solaris,
193  *					and seems to result in general
194  *					improvement.
195  *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
196  *	Willy Konynenberg	:	Transparent proxying support.
197  *	Mike McLagan		:	Routing by source
198  *		Keith Owens	:	Do proper merging with partial SKB's in
199  *					tcp_do_sendmsg to avoid burstiness.
200  *		Eric Schenk	:	Fix fast close down bug with
201  *					shutdown() followed by close().
202  *		Andi Kleen 	:	Make poll agree with SIGIO
203  *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
204  *					lingertime == 0 (RFC 793 ABORT Call)
205  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
206  *					csum_and_copy_from_user() if possible.
207  *
208  *		This program is free software; you can redistribute it and/or
209  *		modify it under the terms of the GNU General Public License
210  *		as published by the Free Software Foundation; either version
211  *		2 of the License, or (at your option) any later version.
212  *
213  * Description of States:
214  *
215  *	TCP_SYN_SENT		sent a connection request, waiting for ack
216  *
217  *	TCP_SYN_RECV		received a connection request, sent ack,
218  *				waiting for final ack in three-way handshake.
219  *
220  *	TCP_ESTABLISHED		connection established
221  *
222  *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
223  *				transmission of remaining buffered data
224  *
225  *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
226  *				to shutdown
227  *
228  *	TCP_CLOSING		both sides have shutdown but we still have
229  *				data we have to finish sending
230  *
231  *	TCP_TIME_WAIT		timeout to catch resent junk before entering
232  *				closed, can only be entered from FIN_WAIT2
233  *				or CLOSING.  Required because the other end
234  *				may not have gotten our last ACK causing it
235  *				to retransmit the data packet (which we ignore)
236  *
237  *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
238  *				us to finish writing our data and to shutdown
239  *				(we have to close() to move on to LAST_ACK)
240  *
241  *	TCP_LAST_ACK		our side has shutdown after remote has
242  *				shutdown.  There may still be data in our
243  *				buffer that we have to finish sending
244  *
245  *	TCP_CLOSE		socket is finished
246  */
247 
248 #define pr_fmt(fmt) "TCP: " fmt
249 
250 #include <crypto/hash.h>
251 #include <linux/kernel.h>
252 #include <linux/module.h>
253 #include <linux/types.h>
254 #include <linux/fcntl.h>
255 #include <linux/poll.h>
256 #include <linux/inet_diag.h>
257 #include <linux/init.h>
258 #include <linux/fs.h>
259 #include <linux/skbuff.h>
260 #include <linux/scatterlist.h>
261 #include <linux/splice.h>
262 #include <linux/net.h>
263 #include <linux/socket.h>
264 #include <linux/random.h>
265 #include <linux/bootmem.h>
266 #include <linux/highmem.h>
267 #include <linux/swap.h>
268 #include <linux/cache.h>
269 #include <linux/err.h>
270 #include <linux/time.h>
271 #include <linux/slab.h>
272 
273 #include <net/icmp.h>
274 #include <net/inet_common.h>
275 #include <net/tcp.h>
276 #include <net/xfrm.h>
277 #include <net/ip.h>
278 #include <net/sock.h>
279 
280 #include <asm/uaccess.h>
281 #include <asm/ioctls.h>
282 #include <net/busy_poll.h>
283 
284 int sysctl_tcp_min_tso_segs __read_mostly = 2;
285 
286 int sysctl_tcp_autocorking __read_mostly = 1;
287 
288 struct percpu_counter tcp_orphan_count;
289 EXPORT_SYMBOL_GPL(tcp_orphan_count);
290 
291 long sysctl_tcp_mem[3] __read_mostly;
292 int sysctl_tcp_wmem[3] __read_mostly;
293 int sysctl_tcp_rmem[3] __read_mostly;
294 
295 EXPORT_SYMBOL(sysctl_tcp_mem);
296 EXPORT_SYMBOL(sysctl_tcp_rmem);
297 EXPORT_SYMBOL(sysctl_tcp_wmem);
298 
299 atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
300 EXPORT_SYMBOL(tcp_memory_allocated);
301 
302 /*
303  * Current number of TCP sockets.
304  */
305 struct percpu_counter tcp_sockets_allocated;
306 EXPORT_SYMBOL(tcp_sockets_allocated);
307 
308 /*
309  * TCP splice context
310  */
311 struct tcp_splice_state {
312 	struct pipe_inode_info *pipe;
313 	size_t len;
314 	unsigned int flags;
315 };
316 
317 /*
318  * Pressure flag: try to collapse.
319  * Technical note: it is used by multiple contexts non-atomically.
320  * All of __sk_mem_schedule() is of this nature: accounting
321  * is strict, actions are advisory and have some latency.
322  */
323 int tcp_memory_pressure __read_mostly;
324 EXPORT_SYMBOL(tcp_memory_pressure);
325 
326 void tcp_enter_memory_pressure(struct sock *sk)
327 {
328 	if (!tcp_memory_pressure) {
329 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
330 		tcp_memory_pressure = 1;
331 	}
332 }
333 EXPORT_SYMBOL(tcp_enter_memory_pressure);
334 
335 /* Convert seconds to retransmits based on initial and max timeout */
336 static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
337 {
338 	u8 res = 0;
339 
340 	if (seconds > 0) {
341 		int period = timeout;
342 
343 		res = 1;
344 		while (seconds > period && res < 255) {
345 			res++;
346 			timeout <<= 1;
347 			if (timeout > rto_max)
348 				timeout = rto_max;
349 			period += timeout;
350 		}
351 	}
352 	return res;
353 }
354 
355 /* Convert retransmits to seconds based on initial and max timeout */
356 static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
357 {
358 	int period = 0;
359 
360 	if (retrans > 0) {
361 		period = timeout;
362 		while (--retrans) {
363 			timeout <<= 1;
364 			if (timeout > rto_max)
365 				timeout = rto_max;
366 			period += timeout;
367 		}
368 	}
369 	return period;
370 }
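
/* A worked illustration of the backoff arithmetic above, assuming an
 * initial timeout of 1 and a cap of 120 (in whatever unit the caller
 * uses for the timeouts):
 *
 *	secs_to_retrans(10, 1, 120):
 *		res = 1, period = 1
 *		res = 2, timeout = 2, period = 3
 *		res = 3, timeout = 4, period = 7
 *		res = 4, timeout = 8, period = 15	(15 >= 10, stop)
 *	returns 4.
 *
 *	retrans_to_secs(4, 1, 120) walks the same doubling series and
 *	returns 15, so the two conversions are inverses only up to the
 *	rounding implied by the doubling schedule.
 */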
371 
372 /* Address-family independent initialization for a tcp_sock.
373  *
374  * NOTE: A lot of things are set to zero explicitly by the call to
375  *       sk_alloc(), so they need not be done here.
376  */
377 void tcp_init_sock(struct sock *sk)
378 {
379 	struct inet_connection_sock *icsk = inet_csk(sk);
380 	struct tcp_sock *tp = tcp_sk(sk);
381 
382 	tp->out_of_order_queue = RB_ROOT;
383 	tcp_init_xmit_timers(sk);
384 	tcp_prequeue_init(tp);
385 	INIT_LIST_HEAD(&tp->tsq_node);
386 
387 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
388 	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
389 	minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U);
390 
391 	/* So many TCP implementations out there (incorrectly) count the
392 	 * initial SYN frame in their delayed-ACK and congestion control
393 	 * algorithms that we must have the following bandaid to talk
394 	 * efficiently to them.  -DaveM
395 	 */
396 	tp->snd_cwnd = TCP_INIT_CWND;
397 
398 	/* There's a bubble in the pipe until at least the first ACK. */
399 	tp->app_limited = ~0U;
400 
401 	/* See draft-stevens-tcpca-spec-01 for discussion of the
402 	 * initialization of these values.
403 	 */
404 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
405 	tp->snd_cwnd_clamp = ~0;
406 	tp->mss_cache = TCP_MSS_DEFAULT;
407 
408 	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
409 	tcp_enable_early_retrans(tp);
410 	tcp_assign_congestion_control(sk);
411 
412 	tp->tsoffset = 0;
413 
414 	sk->sk_state = TCP_CLOSE;
415 
416 	sk->sk_write_space = sk_stream_write_space;
417 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
418 
419 	icsk->icsk_sync_mss = tcp_sync_mss;
420 
421 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
422 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
423 
424 	local_bh_disable();
425 	sk_sockets_allocated_inc(sk);
426 	local_bh_enable();
427 }
428 EXPORT_SYMBOL(tcp_init_sock);
429 
430 static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
431 {
432 	if (tsflags) {
433 		struct skb_shared_info *shinfo = skb_shinfo(skb);
434 		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
435 
436 		sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
437 		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
438 			tcb->txstamp_ack = 1;
439 		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
440 			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
441 	}
442 }
443 
444 /*
445  *	Wait for a TCP event.
446  *
447  *	Note that we don't need to lock the socket, as the upper poll layers
448  *	take care of normal races (between the test and the event) and we don't
449  *	go look at any of the socket buffers directly.
450  */
451 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
452 {
453 	unsigned int mask;
454 	struct sock *sk = sock->sk;
455 	const struct tcp_sock *tp = tcp_sk(sk);
456 	int state;
457 
458 	sock_rps_record_flow(sk);
459 
460 	sock_poll_wait(file, sk_sleep(sk), wait);
461 
462 	state = sk_state_load(sk);
463 	if (state == TCP_LISTEN)
464 		return inet_csk_listen_poll(sk);
465 
466 	/* Socket is not locked. We are protected from async events
467 	 * by poll logic and correct handling of state changes
468 	 * made by other threads is impossible in any case.
469 	 */
470 
471 	mask = 0;
472 
473 	/*
474 	 * POLLHUP is certainly not done right. But poll() doesn't
475 	 * have a notion of HUP in just one direction, and for a
476 	 * socket the read side is more interesting.
477 	 *
478 	 * Some poll() documentation says that POLLHUP is incompatible
479 	 * with the POLLOUT/POLLWR flags, so somebody should check this
480 	 * all. But careful, it tends to be safer to return too many
481 	 * bits than too few, and you can easily break real applications
482 	 * if you don't tell them that something has hung up!
483 	 *
484 	 * Check-me.
485 	 *
486 	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
487 	 * our fs/select.c). It means that after we received EOF,
488  *	 poll always returns immediately, making it impossible to poll() for write()
489 	 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
490 	 * if and only if shutdown has been made in both directions.
491 	 * Actually, it is interesting to look how Solaris and DUX
492 	 * solve this dilemma. I would prefer, if POLLHUP were maskable,
493 	 * then we could set it on SND_SHUTDOWN. BTW examples given
494 	 * in Stevens' books assume exactly this behaviour, it explains
495 	 * why POLLHUP is incompatible with POLLOUT.	--ANK
496 	 *
497 	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
498 	 * blocking on fresh not-connected or disconnected socket. --ANK
499 	 */
500 	if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
501 		mask |= POLLHUP;
502 	if (sk->sk_shutdown & RCV_SHUTDOWN)
503 		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
504 
505 	/* Connected or passive Fast Open socket? */
506 	if (state != TCP_SYN_SENT &&
507 	    (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
508 		int target = sock_rcvlowat(sk, 0, INT_MAX);
509 
510 		if (tp->urg_seq == tp->copied_seq &&
511 		    !sock_flag(sk, SOCK_URGINLINE) &&
512 		    tp->urg_data)
513 			target++;
514 
515 		if (tp->rcv_nxt - tp->copied_seq >= target)
516 			mask |= POLLIN | POLLRDNORM;
517 
518 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
519 			if (sk_stream_is_writeable(sk)) {
520 				mask |= POLLOUT | POLLWRNORM;
521 			} else {  /* send SIGIO later */
522 				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
523 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
524 
525 				/* Race breaker. If space is freed after
526 				 * wspace test but before the flags are set,
527 				 * IO signal will be lost. Memory barrier
528 				 * pairs with the input side.
529 				 */
530 				smp_mb__after_atomic();
531 				if (sk_stream_is_writeable(sk))
532 					mask |= POLLOUT | POLLWRNORM;
533 			}
534 		} else
535 			mask |= POLLOUT | POLLWRNORM;
536 
537 		if (tp->urg_data & TCP_URG_VALID)
538 			mask |= POLLPRI;
539 	}
540 	/* This barrier is coupled with smp_wmb() in tcp_reset() */
541 	smp_rmb();
542 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
543 		mask |= POLLERR;
544 
545 	return mask;
546 }
547 EXPORT_SYMBOL(tcp_poll);
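
/* A minimal user-space sketch of consuming the mask computed above
 * (illustrative only; assumes <poll.h> and a connected TCP socket fd):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };
 *	int n = poll(&pfd, 1, -1);
 *
 * When n > 0, POLLIN means at least SO_RCVLOWAT bytes (or a FIN) are
 * queued, POLLOUT means sk_stream_is_writeable() held, POLLPRI signals
 * pending urgent data, and POLLHUP is reported once both directions are
 * shut down or the socket has reached TCP_CLOSE, as computed above.
 */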
548 
549 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
550 {
551 	struct tcp_sock *tp = tcp_sk(sk);
552 	int answ;
553 	bool slow;
554 
555 	switch (cmd) {
556 	case SIOCINQ:
557 		if (sk->sk_state == TCP_LISTEN)
558 			return -EINVAL;
559 
560 		slow = lock_sock_fast(sk);
561 		answ = tcp_inq(sk);
562 		unlock_sock_fast(sk, slow);
563 		break;
564 	case SIOCATMARK:
565 		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
566 		break;
567 	case SIOCOUTQ:
568 		if (sk->sk_state == TCP_LISTEN)
569 			return -EINVAL;
570 
571 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
572 			answ = 0;
573 		else
574 			answ = tp->write_seq - tp->snd_una;
575 		break;
576 	case SIOCOUTQNSD:
577 		if (sk->sk_state == TCP_LISTEN)
578 			return -EINVAL;
579 
580 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
581 			answ = 0;
582 		else
583 			answ = tp->write_seq - tp->snd_nxt;
584 		break;
585 	default:
586 		return -ENOIOCTLCMD;
587 	}
588 
589 	return put_user(answ, (int __user *)arg);
590 }
591 EXPORT_SYMBOL(tcp_ioctl);
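
/* A minimal user-space sketch of the queue-size ioctls handled above
 * (illustrative only; assumes <sys/ioctl.h>, <linux/sockios.h> and a
 * connected TCP socket fd):
 *
 *	int unread, unacked, unsent;
 *
 *	ioctl(fd, SIOCINQ, &unread);
 *	ioctl(fd, SIOCOUTQ, &unacked);
 *	ioctl(fd, SIOCOUTQNSD, &unsent);
 *
 * SIOCINQ reports bytes readable (tcp_inq()), SIOCOUTQ reports
 * write_seq - snd_una (everything not yet acknowledged) and SIOCOUTQNSD
 * reports write_seq - snd_nxt (data not yet handed to the network).
 * Each call returns 0 on success and fills in the int it was passed.
 */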
592 
593 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
594 {
595 	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
596 	tp->pushed_seq = tp->write_seq;
597 }
598 
599 static inline bool forced_push(const struct tcp_sock *tp)
600 {
601 	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
602 }
603 
604 static void skb_entail(struct sock *sk, struct sk_buff *skb)
605 {
606 	struct tcp_sock *tp = tcp_sk(sk);
607 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
608 
609 	skb->csum    = 0;
610 	tcb->seq     = tcb->end_seq = tp->write_seq;
611 	tcb->tcp_flags = TCPHDR_ACK;
612 	tcb->sacked  = 0;
613 	__skb_header_release(skb);
614 	tcp_add_write_queue_tail(sk, skb);
615 	sk->sk_wmem_queued += skb->truesize;
616 	sk_mem_charge(sk, skb->truesize);
617 	if (tp->nonagle & TCP_NAGLE_PUSH)
618 		tp->nonagle &= ~TCP_NAGLE_PUSH;
619 
620 	tcp_slow_start_after_idle_check(sk);
621 }
622 
623 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
624 {
625 	if (flags & MSG_OOB)
626 		tp->snd_up = tp->write_seq;
627 }
628 
629 /* If a not-yet-filled skb is pushed, do not send it if
630  * we have data packets in Qdisc or NIC queues:
631  * because TX completion will happen shortly, this gives a chance
632  * to coalesce future sendmsg() payload into this skb, without
633  * need for a timer, and with no latency trade-off.
634  * As packets containing data payload have a bigger truesize
635  * than pure ACK (dataless) packets, the last checks prevent
636  * autocorking if we only have an ACK in the Qdisc/NIC queues,
637  * or if TX completion was delayed after we processed the ACK packet.
638  */
639 static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
640 				int size_goal)
641 {
642 	return skb->len < size_goal &&
643 	       sysctl_tcp_autocorking &&
644 	       skb != tcp_write_queue_head(sk) &&
645 	       atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
646 }
647 
648 static void tcp_push(struct sock *sk, int flags, int mss_now,
649 		     int nonagle, int size_goal)
650 {
651 	struct tcp_sock *tp = tcp_sk(sk);
652 	struct sk_buff *skb;
653 
654 	if (!tcp_send_head(sk))
655 		return;
656 
657 	skb = tcp_write_queue_tail(sk);
658 	if (!(flags & MSG_MORE) || forced_push(tp))
659 		tcp_mark_push(tp, skb);
660 
661 	tcp_mark_urg(tp, flags);
662 
663 	if (tcp_should_autocork(sk, skb, size_goal)) {
664 
665 		/* avoid atomic op if TSQ_THROTTLED bit is already set */
666 		if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
667 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
668 			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
669 		}
670 		/* It is possible TX completion already happened
671 		 * before we set TSQ_THROTTLED.
672 		 */
673 		if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize)
674 			return;
675 	}
676 
677 	if (flags & MSG_MORE)
678 		nonagle = TCP_NAGLE_CORK;
679 
680 	__tcp_push_pending_frames(sk, mss_now, nonagle);
681 }
682 
683 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
684 				unsigned int offset, size_t len)
685 {
686 	struct tcp_splice_state *tss = rd_desc->arg.data;
687 	int ret;
688 
689 	ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
690 			      min(rd_desc->count, len), tss->flags);
691 	if (ret > 0)
692 		rd_desc->count -= ret;
693 	return ret;
694 }
695 
696 static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
697 {
698 	/* Store TCP splice context information in read_descriptor_t. */
699 	read_descriptor_t rd_desc = {
700 		.arg.data = tss,
701 		.count	  = tss->len,
702 	};
703 
704 	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
705 }
706 
707 /**
708  *  tcp_splice_read - splice data from TCP socket to a pipe
709  * @sock:	socket to splice from
710  * @ppos:	position (not valid)
711  * @pipe:	pipe to splice to
712  * @len:	number of bytes to splice
713  * @flags:	splice modifier flags
714  *
715  * Description:
716  *    Will read pages from given socket and fill them into a pipe.
717  *
718  **/
719 ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
720 			struct pipe_inode_info *pipe, size_t len,
721 			unsigned int flags)
722 {
723 	struct sock *sk = sock->sk;
724 	struct tcp_splice_state tss = {
725 		.pipe = pipe,
726 		.len = len,
727 		.flags = flags,
728 	};
729 	long timeo;
730 	ssize_t spliced;
731 	int ret;
732 
733 	sock_rps_record_flow(sk);
734 	/*
735 	 * We can't seek on a socket input
736 	 */
737 	if (unlikely(*ppos))
738 		return -ESPIPE;
739 
740 	ret = spliced = 0;
741 
742 	lock_sock(sk);
743 
744 	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
745 	while (tss.len) {
746 		ret = __tcp_splice_read(sk, &tss);
747 		if (ret < 0)
748 			break;
749 		else if (!ret) {
750 			if (spliced)
751 				break;
752 			if (sock_flag(sk, SOCK_DONE))
753 				break;
754 			if (sk->sk_err) {
755 				ret = sock_error(sk);
756 				break;
757 			}
758 			if (sk->sk_shutdown & RCV_SHUTDOWN)
759 				break;
760 			if (sk->sk_state == TCP_CLOSE) {
761 				/*
762 				 * This occurs when the user tries to read
763 				 * from a never-connected socket.
764 				 */
765 				if (!sock_flag(sk, SOCK_DONE))
766 					ret = -ENOTCONN;
767 				break;
768 			}
769 			if (!timeo) {
770 				ret = -EAGAIN;
771 				break;
772 			}
773 			sk_wait_data(sk, &timeo, NULL);
774 			if (signal_pending(current)) {
775 				ret = sock_intr_errno(timeo);
776 				break;
777 			}
778 			continue;
779 		}
780 		tss.len -= ret;
781 		spliced += ret;
782 
783 		if (!timeo)
784 			break;
785 		release_sock(sk);
786 		lock_sock(sk);
787 
788 		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
789 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
790 		    signal_pending(current))
791 			break;
792 	}
793 
794 	release_sock(sk);
795 
796 	if (spliced)
797 		return spliced;
798 
799 	return ret;
800 }
801 EXPORT_SYMBOL(tcp_splice_read);
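
/* A minimal user-space sketch of the splice() path served above
 * (illustrative only; assumes _GNU_SOURCE, <fcntl.h>, <unistd.h> and a
 * connected TCP socket fd; outfd is any writable destination fd):
 *
 *	int p[2];
 *
 *	pipe(p);
 *	splice(fd, NULL, p[1], NULL, 65536, SPLICE_F_MOVE);
 *	splice(p[0], NULL, outfd, NULL, 65536, SPLICE_F_MOVE);
 *
 * The socket-side offset must be NULL, matching the -ESPIPE check on
 * *ppos above; the second splice() drains the pipe into the destination.
 */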
802 
803 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
804 				    bool force_schedule)
805 {
806 	struct sk_buff *skb;
807 
808 	/* The TCP header must be at least 32-bit aligned.  */
809 	size = ALIGN(size, 4);
810 
811 	if (unlikely(tcp_under_memory_pressure(sk)))
812 		sk_mem_reclaim_partial(sk);
813 
814 	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
815 	if (likely(skb)) {
816 		bool mem_scheduled;
817 
818 		if (force_schedule) {
819 			mem_scheduled = true;
820 			sk_forced_mem_schedule(sk, skb->truesize);
821 		} else {
822 			mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
823 		}
824 		if (likely(mem_scheduled)) {
825 			skb_reserve(skb, sk->sk_prot->max_header);
826 			/*
827 			 * Make sure that we have exactly size bytes
828 			 * available to the caller, no more, no less.
829 			 */
830 			skb->reserved_tailroom = skb->end - skb->tail - size;
831 			return skb;
832 		}
833 		__kfree_skb(skb);
834 	} else {
835 		sk->sk_prot->enter_memory_pressure(sk);
836 		sk_stream_moderate_sndbuf(sk);
837 	}
838 	return NULL;
839 }
840 
841 static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
842 				       int large_allowed)
843 {
844 	struct tcp_sock *tp = tcp_sk(sk);
845 	u32 new_size_goal, size_goal;
846 
847 	if (!large_allowed || !sk_can_gso(sk))
848 		return mss_now;
849 
850 	/* Note : tcp_tso_autosize() will eventually split this later */
851 	new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
852 	new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
853 
854 	/* We try hard to avoid divides here */
855 	size_goal = tp->gso_segs * mss_now;
856 	if (unlikely(new_size_goal < size_goal ||
857 		     new_size_goal >= size_goal + mss_now)) {
858 		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
859 				     sk->sk_gso_max_segs);
860 		size_goal = tp->gso_segs * mss_now;
861 	}
862 
863 	return max(size_goal, mss_now);
864 }
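
/* For a sense of scale (numbers are illustrative, the exact header
 * reservation depends on MAX_TCP_HEADER): with mss_now = 1448 and
 * sk_gso_max_size = 65536, new_size_goal is roughly 65.2 KB, so
 * tp->gso_segs becomes min(~65200 / 1448, gso_max_segs) = 45 for a large
 * enough gso_max_segs, and size_goal = 45 * 1448 = 65160. The write path
 * therefore tries to build ~64 KB skbs that TSO/GSO resegments later,
 * and the cached gso_segs is refreshed only when the goal drifts by at
 * least one MSS, which is what avoids a divide on every send.
 */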
865 
866 static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
867 {
868 	int mss_now;
869 
870 	mss_now = tcp_current_mss(sk);
871 	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
872 
873 	return mss_now;
874 }
875 
876 static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
877 				size_t size, int flags)
878 {
879 	struct tcp_sock *tp = tcp_sk(sk);
880 	int mss_now, size_goal;
881 	int err;
882 	ssize_t copied;
883 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
884 
885 	/* Wait for a connection to finish. One exception is TCP Fast Open
886 	 * (passive side) where data is allowed to be sent before a connection
887 	 * is fully established.
888 	 */
889 	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
890 	    !tcp_passive_fastopen(sk)) {
891 		err = sk_stream_wait_connect(sk, &timeo);
892 		if (err != 0)
893 			goto out_err;
894 	}
895 
896 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
897 
898 	mss_now = tcp_send_mss(sk, &size_goal, flags);
899 	copied = 0;
900 
901 	err = -EPIPE;
902 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
903 		goto out_err;
904 
905 	while (size > 0) {
906 		struct sk_buff *skb = tcp_write_queue_tail(sk);
907 		int copy, i;
908 		bool can_coalesce;
909 
910 		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
911 		    !tcp_skb_can_collapse_to(skb)) {
912 new_segment:
913 			if (!sk_stream_memory_free(sk))
914 				goto wait_for_sndbuf;
915 
916 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
917 						  skb_queue_empty(&sk->sk_write_queue));
918 			if (!skb)
919 				goto wait_for_memory;
920 
921 			skb_entail(sk, skb);
922 			copy = size_goal;
923 		}
924 
925 		if (copy > size)
926 			copy = size;
927 
928 		i = skb_shinfo(skb)->nr_frags;
929 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
930 		if (!can_coalesce && i >= sysctl_max_skb_frags) {
931 			tcp_mark_push(tp, skb);
932 			goto new_segment;
933 		}
934 		if (!sk_wmem_schedule(sk, copy))
935 			goto wait_for_memory;
936 
937 		if (can_coalesce) {
938 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
939 		} else {
940 			get_page(page);
941 			skb_fill_page_desc(skb, i, page, offset, copy);
942 		}
943 		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
944 
945 		skb->len += copy;
946 		skb->data_len += copy;
947 		skb->truesize += copy;
948 		sk->sk_wmem_queued += copy;
949 		sk_mem_charge(sk, copy);
950 		skb->ip_summed = CHECKSUM_PARTIAL;
951 		tp->write_seq += copy;
952 		TCP_SKB_CB(skb)->end_seq += copy;
953 		tcp_skb_pcount_set(skb, 0);
954 
955 		if (!copied)
956 			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
957 
958 		copied += copy;
959 		offset += copy;
960 		size -= copy;
961 		if (!size) {
962 			tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
963 			goto out;
964 		}
965 
966 		if (skb->len < size_goal || (flags & MSG_OOB))
967 			continue;
968 
969 		if (forced_push(tp)) {
970 			tcp_mark_push(tp, skb);
971 			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
972 		} else if (skb == tcp_send_head(sk))
973 			tcp_push_one(sk, mss_now);
974 		continue;
975 
976 wait_for_sndbuf:
977 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
978 wait_for_memory:
979 		tcp_push(sk, flags & ~MSG_MORE, mss_now,
980 			 TCP_NAGLE_PUSH, size_goal);
981 
982 		err = sk_stream_wait_memory(sk, &timeo);
983 		if (err != 0)
984 			goto do_error;
985 
986 		mss_now = tcp_send_mss(sk, &size_goal, flags);
987 	}
988 
989 out:
990 	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
991 		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
992 	return copied;
993 
994 do_error:
995 	if (copied)
996 		goto out;
997 out_err:
998 	/* make sure we wake any epoll edge trigger waiter */
999 	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
1000 		sk->sk_write_space(sk);
1001 	return sk_stream_error(sk, flags, err);
1002 }
1003 
1004 int tcp_sendpage(struct sock *sk, struct page *page, int offset,
1005 		 size_t size, int flags)
1006 {
1007 	ssize_t res;
1008 
1009 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
1010 	    !sk_check_csum_caps(sk))
1011 		return sock_no_sendpage(sk->sk_socket, page, offset, size,
1012 					flags);
1013 
1014 	lock_sock(sk);
1015 
1016 	tcp_rate_check_app_limited(sk);  /* is sending application-limited? */
1017 
1018 	res = do_tcp_sendpages(sk, page, offset, size, flags);
1019 	release_sock(sk);
1020 	return res;
1021 }
1022 EXPORT_SYMBOL(tcp_sendpage);
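
/* A minimal user-space sketch of the path above (illustrative only):
 * sendfile(2) to a TCP socket ends up in tcp_sendpage() when the route
 * supports SG and checksum offload, and falls back to sock_no_sendpage()
 * otherwise, exactly as checked above. Here nbytes is however much of
 * filefd should be sent:
 *
 *	off_t off = 0;
 *
 *	sendfile(sockfd, filefd, &off, nbytes);
 */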
1023 
1024 /* Do not bother using a page frag for very small frames.
1025  * But use this heuristic only for the first skb in write queue.
1026  *
1027  * Having no payload in skb->head allows better SACK shifting
1028  * in tcp_shift_skb_data(), reducing sack/rack overhead, because
1029  * the write queue has fewer skbs.
1030  * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
1031  * This also speeds up tso_fragment(), since it won't fall back
1032  * to tcp_fragment().
1033  */
1034 static int linear_payload_sz(bool first_skb)
1035 {
1036 	if (first_skb)
1037 		return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
1038 	return 0;
1039 }
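
/* For reference on the "~0.5 MB" figure in the comment above: with 4 KiB
 * pages MAX_SKB_FRAGS is 17 and the page-frag allocator hands out
 * fragments of up to 32 KiB, so 17 * 32 KiB = 544 KiB per skb.
 */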
1040 
1041 static int select_size(const struct sock *sk, bool sg, bool first_skb)
1042 {
1043 	const struct tcp_sock *tp = tcp_sk(sk);
1044 	int tmp = tp->mss_cache;
1045 
1046 	if (sg) {
1047 		if (sk_can_gso(sk)) {
1048 			tmp = linear_payload_sz(first_skb);
1049 		} else {
1050 			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1051 
1052 			if (tmp >= pgbreak &&
1053 			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1054 				tmp = pgbreak;
1055 		}
1056 	}
1057 
1058 	return tmp;
1059 }
1060 
1061 void tcp_free_fastopen_req(struct tcp_sock *tp)
1062 {
1063 	if (tp->fastopen_req) {
1064 		kfree(tp->fastopen_req);
1065 		tp->fastopen_req = NULL;
1066 	}
1067 }
1068 
1069 static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1070 				int *copied, size_t size)
1071 {
1072 	struct tcp_sock *tp = tcp_sk(sk);
1073 	int err, flags;
1074 
1075 	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
1076 		return -EOPNOTSUPP;
1077 	if (tp->fastopen_req)
1078 		return -EALREADY; /* Another Fast Open is in progress */
1079 
1080 	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1081 				   sk->sk_allocation);
1082 	if (unlikely(!tp->fastopen_req))
1083 		return -ENOBUFS;
1084 	tp->fastopen_req->data = msg;
1085 	tp->fastopen_req->size = size;
1086 
1087 	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1088 	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1089 				    msg->msg_namelen, flags);
1090 	*copied = tp->fastopen_req->copied;
1091 	tcp_free_fastopen_req(tp);
1092 	return err;
1093 }
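
/* A minimal user-space sketch of the client-side Fast Open path handled
 * above (illustrative only; fd is a fresh, unconnected TCP socket and
 * the client bit of the net.ipv4.tcp_fastopen sysctl must be set, per
 * the TFO_CLIENT_ENABLE check):
 *
 *	sendto(fd, buf, len, MSG_FASTOPEN,
 *	       (struct sockaddr *)&daddr, sizeof(daddr));
 *
 * The call doubles as connect(): the data rides on the SYN when a Fast
 * Open cookie is cached, and is sent right after the handshake otherwise.
 */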
1094 
1095 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1096 {
1097 	struct tcp_sock *tp = tcp_sk(sk);
1098 	struct sk_buff *skb;
1099 	struct sockcm_cookie sockc;
1100 	int flags, err, copied = 0;
1101 	int mss_now = 0, size_goal, copied_syn = 0;
1102 	bool process_backlog = false;
1103 	bool sg;
1104 	long timeo;
1105 
1106 	lock_sock(sk);
1107 
1108 	flags = msg->msg_flags;
1109 	if (flags & MSG_FASTOPEN) {
1110 		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
1111 		if (err == -EINPROGRESS && copied_syn > 0)
1112 			goto out;
1113 		else if (err)
1114 			goto out_err;
1115 	}
1116 
1117 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1118 
1119 	tcp_rate_check_app_limited(sk);  /* is sending application-limited? */
1120 
1121 	/* Wait for a connection to finish. One exception is TCP Fast Open
1122 	 * (passive side) where data is allowed to be sent before a connection
1123 	 * is fully established.
1124 	 */
1125 	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1126 	    !tcp_passive_fastopen(sk)) {
1127 		err = sk_stream_wait_connect(sk, &timeo);
1128 		if (err != 0)
1129 			goto do_error;
1130 	}
1131 
1132 	if (unlikely(tp->repair)) {
1133 		if (tp->repair_queue == TCP_RECV_QUEUE) {
1134 			copied = tcp_send_rcvq(sk, msg, size);
1135 			goto out_nopush;
1136 		}
1137 
1138 		err = -EINVAL;
1139 		if (tp->repair_queue == TCP_NO_QUEUE)
1140 			goto out_err;
1141 
1142 		/* 'common' sending to sendq */
1143 	}
1144 
1145 	sockc.tsflags = sk->sk_tsflags;
1146 	if (msg->msg_controllen) {
1147 		err = sock_cmsg_send(sk, msg, &sockc);
1148 		if (unlikely(err)) {
1149 			err = -EINVAL;
1150 			goto out_err;
1151 		}
1152 	}
1153 
1154 	/* This should be in poll */
1155 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1156 
1157 	/* Ok commence sending. */
1158 	copied = 0;
1159 
1160 restart:
1161 	mss_now = tcp_send_mss(sk, &size_goal, flags);
1162 
1163 	err = -EPIPE;
1164 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1165 		goto do_error;
1166 
1167 	sg = !!(sk->sk_route_caps & NETIF_F_SG);
1168 
1169 	while (msg_data_left(msg)) {
1170 		int copy = 0;
1171 		int max = size_goal;
1172 
1173 		skb = tcp_write_queue_tail(sk);
1174 		if (tcp_send_head(sk)) {
1175 			if (skb->ip_summed == CHECKSUM_NONE)
1176 				max = mss_now;
1177 			copy = max - skb->len;
1178 		}
1179 
1180 		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
1181 			bool first_skb;
1182 
1183 new_segment:
1184 			/* Allocate new segment. If the interface is SG,
1185 			 * allocate an skb fitting into a single page.
1186 			 */
1187 			if (!sk_stream_memory_free(sk))
1188 				goto wait_for_sndbuf;
1189 
1190 			if (process_backlog && sk_flush_backlog(sk)) {
1191 				process_backlog = false;
1192 				goto restart;
1193 			}
1194 			first_skb = skb_queue_empty(&sk->sk_write_queue);
1195 			skb = sk_stream_alloc_skb(sk,
1196 						  select_size(sk, sg, first_skb),
1197 						  sk->sk_allocation,
1198 						  first_skb);
1199 			if (!skb)
1200 				goto wait_for_memory;
1201 
1202 			process_backlog = true;
1203 			/*
1204 			 * Check whether we can use HW checksum.
1205 			 */
1206 			if (sk_check_csum_caps(sk))
1207 				skb->ip_summed = CHECKSUM_PARTIAL;
1208 
1209 			skb_entail(sk, skb);
1210 			copy = size_goal;
1211 			max = size_goal;
1212 
1213 			/* All packets are restored as if they have
1214 			 * already been sent. skb_mstamp isn't set to
1215 			 * avoid wrong rtt estimation.
1216 			 */
1217 			if (tp->repair)
1218 				TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1219 		}
1220 
1221 		/* Try to append data to the end of skb. */
1222 		if (copy > msg_data_left(msg))
1223 			copy = msg_data_left(msg);
1224 
1225 		/* Where to copy to? */
1226 		if (skb_availroom(skb) > 0) {
1227 			/* We have some space in skb head. Superb! */
1228 			copy = min_t(int, copy, skb_availroom(skb));
1229 			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
1230 			if (err)
1231 				goto do_fault;
1232 		} else {
1233 			bool merge = true;
1234 			int i = skb_shinfo(skb)->nr_frags;
1235 			struct page_frag *pfrag = sk_page_frag(sk);
1236 
1237 			if (!sk_page_frag_refill(sk, pfrag))
1238 				goto wait_for_memory;
1239 
1240 			if (!skb_can_coalesce(skb, i, pfrag->page,
1241 					      pfrag->offset)) {
1242 				if (i >= sysctl_max_skb_frags || !sg) {
1243 					tcp_mark_push(tp, skb);
1244 					goto new_segment;
1245 				}
1246 				merge = false;
1247 			}
1248 
1249 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1250 
1251 			if (!sk_wmem_schedule(sk, copy))
1252 				goto wait_for_memory;
1253 
1254 			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1255 						       pfrag->page,
1256 						       pfrag->offset,
1257 						       copy);
1258 			if (err)
1259 				goto do_error;
1260 
1261 			/* Update the skb. */
1262 			if (merge) {
1263 				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1264 			} else {
1265 				skb_fill_page_desc(skb, i, pfrag->page,
1266 						   pfrag->offset, copy);
1267 				get_page(pfrag->page);
1268 			}
1269 			pfrag->offset += copy;
1270 		}
1271 
1272 		if (!copied)
1273 			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1274 
1275 		tp->write_seq += copy;
1276 		TCP_SKB_CB(skb)->end_seq += copy;
1277 		tcp_skb_pcount_set(skb, 0);
1278 
1279 		copied += copy;
1280 		if (!msg_data_left(msg)) {
1281 			tcp_tx_timestamp(sk, sockc.tsflags, skb);
1282 			if (unlikely(flags & MSG_EOR))
1283 				TCP_SKB_CB(skb)->eor = 1;
1284 			goto out;
1285 		}
1286 
1287 		if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
1288 			continue;
1289 
1290 		if (forced_push(tp)) {
1291 			tcp_mark_push(tp, skb);
1292 			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1293 		} else if (skb == tcp_send_head(sk))
1294 			tcp_push_one(sk, mss_now);
1295 		continue;
1296 
1297 wait_for_sndbuf:
1298 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1299 wait_for_memory:
1300 		if (copied)
1301 			tcp_push(sk, flags & ~MSG_MORE, mss_now,
1302 				 TCP_NAGLE_PUSH, size_goal);
1303 
1304 		err = sk_stream_wait_memory(sk, &timeo);
1305 		if (err != 0)
1306 			goto do_error;
1307 
1308 		mss_now = tcp_send_mss(sk, &size_goal, flags);
1309 	}
1310 
1311 out:
1312 	if (copied)
1313 		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1314 out_nopush:
1315 	release_sock(sk);
1316 	return copied + copied_syn;
1317 
1318 do_fault:
1319 	if (!skb->len) {
1320 		tcp_unlink_write_queue(skb, sk);
1321 		/* It is the one place in all of TCP, except connection
1322 		 * reset, where we can be unlinking the send_head.
1323 		 */
1324 		tcp_check_send_head(sk, skb);
1325 		sk_wmem_free_skb(sk, skb);
1326 	}
1327 
1328 do_error:
1329 	if (copied + copied_syn)
1330 		goto out;
1331 out_err:
1332 	err = sk_stream_error(sk, flags, err);
1333 	/* make sure we wake any epoll edge trigger waiter */
1334 	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
1335 		sk->sk_write_space(sk);
1336 	release_sock(sk);
1337 	return err;
1338 }
1339 EXPORT_SYMBOL(tcp_sendmsg);
1340 
1341 /*
1342  *	Handle reading urgent data. BSD has very simple semantics for
1343  *	this, no blocking and very strange errors 8)
1344  */
1345 
1346 static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1347 {
1348 	struct tcp_sock *tp = tcp_sk(sk);
1349 
1350 	/* No URG data to read. */
1351 	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1352 	    tp->urg_data == TCP_URG_READ)
1353 		return -EINVAL;	/* Yes this is right ! */
1354 
1355 	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1356 		return -ENOTCONN;
1357 
1358 	if (tp->urg_data & TCP_URG_VALID) {
1359 		int err = 0;
1360 		char c = tp->urg_data;
1361 
1362 		if (!(flags & MSG_PEEK))
1363 			tp->urg_data = TCP_URG_READ;
1364 
1365 		/* Read urgent data. */
1366 		msg->msg_flags |= MSG_OOB;
1367 
1368 		if (len > 0) {
1369 			if (!(flags & MSG_TRUNC))
1370 				err = memcpy_to_msg(msg, &c, 1);
1371 			len = 1;
1372 		} else
1373 			msg->msg_flags |= MSG_TRUNC;
1374 
1375 		return err ? -EFAULT : len;
1376 	}
1377 
1378 	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1379 		return 0;
1380 
1381 	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1382 	 * the available implementations agree in this case:
1383 	 * this call should never block, independent of the
1384 	 * blocking state of the socket.
1385 	 * Mike <pall@rz.uni-karlsruhe.de>
1386 	 */
1387 	return -EAGAIN;
1388 }
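
/* A minimal user-space sketch of reading the urgent byte handled above
 * (illustrative only; assumes SO_OOBINLINE is not set):
 *
 *	char c;
 *	ssize_t n = recv(fd, &c, 1, MSG_OOB);
 *
 * n == 1 delivers the urgent byte; EINVAL means there is no unread
 * urgent data (or SO_OOBINLINE is set); EAGAIN means the urgent pointer
 * has been seen but the byte has not arrived yet. As the comment above
 * notes, this call never blocks.
 */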
1389 
1390 static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1391 {
1392 	struct sk_buff *skb;
1393 	int copied = 0, err = 0;
1394 
1395 	/* XXX -- need to support SO_PEEK_OFF */
1396 
1397 	skb_queue_walk(&sk->sk_write_queue, skb) {
1398 		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1399 		if (err)
1400 			break;
1401 
1402 		copied += skb->len;
1403 	}
1404 
1405 	return err ?: copied;
1406 }
1407 
1408 /* Clean up the receive buffer for full frames taken by the user,
1409  * then send an ACK if necessary.  COPIED is the number of bytes
1410  * tcp_recvmsg has given to the user so far; it speeds up the
1411  * calculation of whether or not we must ACK for the sake of
1412  * a window update.
1413  */
1414 static void tcp_cleanup_rbuf(struct sock *sk, int copied)
1415 {
1416 	struct tcp_sock *tp = tcp_sk(sk);
1417 	bool time_to_ack = false;
1418 
1419 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1420 
1421 	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1422 	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1423 	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1424 
1425 	if (inet_csk_ack_scheduled(sk)) {
1426 		const struct inet_connection_sock *icsk = inet_csk(sk);
1427 		   /* Delayed ACKs frequently hit locked sockets during bulk
1428 		    * receive. */
1429 		if (icsk->icsk_ack.blocked ||
1430 		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1431 		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1432 		    /*
1433 		     * If this read emptied the read buffer, we send an ACK when
1434 		     * the connection is not bidirectional, the user drained the
1435 		     * receive buffer, and there was a small segment
1436 		     * in the queue.
1437 		     */
1438 		    (copied > 0 &&
1439 		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1440 		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1441 		       !icsk->icsk_ack.pingpong)) &&
1442 		      !atomic_read(&sk->sk_rmem_alloc)))
1443 			time_to_ack = true;
1444 	}
1445 
1446 	/* We send an ACK if we can now advertise a non-zero window
1447 	 * which has been raised "significantly".
1448 	 *
1449 	 * Even if the window is raised up to infinity, do not send a window-open
1450 	 * ACK in states where we will not receive more. It is useless.
1451 	 */
1452 	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1453 		__u32 rcv_window_now = tcp_receive_window(tp);
1454 
1455 		/* Optimize, __tcp_select_window() is not cheap. */
1456 		if (2*rcv_window_now <= tp->window_clamp) {
1457 			__u32 new_window = __tcp_select_window(sk);
1458 
1459 			/* Send an ACK now if this read freed lots of space
1460 			 * in our buffer. We can advertise the new window now
1461 			 * if it is not less than the current one.
1462 			 * "Lots" means "at least twice" here.
1463 			 */
1464 			if (new_window && new_window >= 2 * rcv_window_now)
1465 				time_to_ack = true;
1466 		}
1467 	}
1468 	if (time_to_ack)
1469 		tcp_send_ack(sk);
1470 }
1471 
1472 static void tcp_prequeue_process(struct sock *sk)
1473 {
1474 	struct sk_buff *skb;
1475 	struct tcp_sock *tp = tcp_sk(sk);
1476 
1477 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1478 
1479 	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1480 		sk_backlog_rcv(sk, skb);
1481 
1482 	/* Clear memory counter. */
1483 	tp->ucopy.memory = 0;
1484 }
1485 
1486 static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1487 {
1488 	struct sk_buff *skb;
1489 	u32 offset;
1490 
1491 	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1492 		offset = seq - TCP_SKB_CB(skb)->seq;
1493 		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1494 			pr_err_once("%s: found a SYN, please report !\n", __func__);
1495 			offset--;
1496 		}
1497 		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1498 			*off = offset;
1499 			return skb;
1500 		}
1501 		/* This looks weird, but this can happen if TCP collapsing
1502 		 * split a fat GRO packet while we released the socket lock
1503 		 * in skb_splice_bits()
1504 		 */
1505 		sk_eat_skb(sk, skb);
1506 	}
1507 	return NULL;
1508 }
1509 
1510 /*
1511  * This routine provides an alternative to tcp_recvmsg() for routines
1512  * that would like to handle copying from skbuffs directly in 'sendfile'
1513  * fashion.
1514  * Note:
1515  *	- It is assumed that the socket was locked by the caller.
1516  *	- The routine does not block.
1517  *	- At present, there is no support for reading OOB data
1518  *	  or for 'peeking' the socket using this routine
1519  *	  (although both would be easy to implement).
1520  */
1521 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1522 		  sk_read_actor_t recv_actor)
1523 {
1524 	struct sk_buff *skb;
1525 	struct tcp_sock *tp = tcp_sk(sk);
1526 	u32 seq = tp->copied_seq;
1527 	u32 offset;
1528 	int copied = 0;
1529 
1530 	if (sk->sk_state == TCP_LISTEN)
1531 		return -ENOTCONN;
1532 	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1533 		if (offset < skb->len) {
1534 			int used;
1535 			size_t len;
1536 
1537 			len = skb->len - offset;
1538 			/* Stop reading if we hit a patch of urgent data */
1539 			if (tp->urg_data) {
1540 				u32 urg_offset = tp->urg_seq - seq;
1541 				if (urg_offset < len)
1542 					len = urg_offset;
1543 				if (!len)
1544 					break;
1545 			}
1546 			used = recv_actor(desc, skb, offset, len);
1547 			if (used <= 0) {
1548 				if (!copied)
1549 					copied = used;
1550 				break;
1551 			} else if (used <= len) {
1552 				seq += used;
1553 				copied += used;
1554 				offset += used;
1555 			}
1556 			/* If recv_actor drops the lock (e.g. TCP splice
1557 			 * receive) the skb pointer might be invalid when
1558 			 * getting here: tcp_collapse might have deleted it
1559 			 * while aggregating skbs from the socket queue.
1560 			 */
1561 			skb = tcp_recv_skb(sk, seq - 1, &offset);
1562 			if (!skb)
1563 				break;
1564 			/* TCP coalescing might have appended data to the skb.
1565 			 * Try to splice more frags
1566 			 */
1567 			if (offset + 1 != skb->len)
1568 				continue;
1569 		}
1570 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1571 			sk_eat_skb(sk, skb);
1572 			++seq;
1573 			break;
1574 		}
1575 		sk_eat_skb(sk, skb);
1576 		if (!desc->count)
1577 			break;
1578 		tp->copied_seq = seq;
1579 	}
1580 	tp->copied_seq = seq;
1581 
1582 	tcp_rcv_space_adjust(sk);
1583 
1584 	/* Clean up data we have read: This will do ACK frames. */
1585 	if (copied > 0) {
1586 		tcp_recv_skb(sk, seq, &offset);
1587 		tcp_cleanup_rbuf(sk, copied);
1588 	}
1589 	return copied;
1590 }
1591 EXPORT_SYMBOL(tcp_read_sock);
1592 
1593 int tcp_peek_len(struct socket *sock)
1594 {
1595 	return tcp_inq(sock->sk);
1596 }
1597 EXPORT_SYMBOL(tcp_peek_len);
1598 
1599 /*
1600  *	This routine copies from a sock struct into the user buffer.
1601  *
1602  *	Technical note: in 2.3 we work on _locked_ socket, so that
1603  *	tricks with *seq access order and skb->users are not required.
1604  *	Probably, code can be easily improved even more.
1605  */
1606 
1607 int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1608 		int flags, int *addr_len)
1609 {
1610 	struct tcp_sock *tp = tcp_sk(sk);
1611 	int copied = 0;
1612 	u32 peek_seq;
1613 	u32 *seq;
1614 	unsigned long used;
1615 	int err;
1616 	int target;		/* Read at least this many bytes */
1617 	long timeo;
1618 	struct task_struct *user_recv = NULL;
1619 	struct sk_buff *skb, *last;
1620 	u32 urg_hole = 0;
1621 
1622 	if (unlikely(flags & MSG_ERRQUEUE))
1623 		return inet_recv_error(sk, msg, len, addr_len);
1624 
1625 	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
1626 	    (sk->sk_state == TCP_ESTABLISHED))
1627 		sk_busy_loop(sk, nonblock);
1628 
1629 	lock_sock(sk);
1630 
1631 	err = -ENOTCONN;
1632 	if (sk->sk_state == TCP_LISTEN)
1633 		goto out;
1634 
1635 	timeo = sock_rcvtimeo(sk, nonblock);
1636 
1637 	/* Urgent data needs to be handled specially. */
1638 	if (flags & MSG_OOB)
1639 		goto recv_urg;
1640 
1641 	if (unlikely(tp->repair)) {
1642 		err = -EPERM;
1643 		if (!(flags & MSG_PEEK))
1644 			goto out;
1645 
1646 		if (tp->repair_queue == TCP_SEND_QUEUE)
1647 			goto recv_sndq;
1648 
1649 		err = -EINVAL;
1650 		if (tp->repair_queue == TCP_NO_QUEUE)
1651 			goto out;
1652 
1653 		/* 'common' recv queue MSG_PEEK-ing */
1654 	}
1655 
1656 	seq = &tp->copied_seq;
1657 	if (flags & MSG_PEEK) {
1658 		peek_seq = tp->copied_seq;
1659 		seq = &peek_seq;
1660 	}
1661 
1662 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1663 
1664 	do {
1665 		u32 offset;
1666 
1667 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1668 		if (tp->urg_data && tp->urg_seq == *seq) {
1669 			if (copied)
1670 				break;
1671 			if (signal_pending(current)) {
1672 				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1673 				break;
1674 			}
1675 		}
1676 
1677 		/* Next get a buffer. */
1678 
1679 		last = skb_peek_tail(&sk->sk_receive_queue);
1680 		skb_queue_walk(&sk->sk_receive_queue, skb) {
1681 			last = skb;
1682 			/* Now that we have two receive queues this
1683 			 * shouldn't happen.
1684 			 */
1685 			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1686 				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1687 				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1688 				 flags))
1689 				break;
1690 
1691 			offset = *seq - TCP_SKB_CB(skb)->seq;
1692 			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1693 				pr_err_once("%s: found a SYN, please report !\n", __func__);
1694 				offset--;
1695 			}
1696 			if (offset < skb->len)
1697 				goto found_ok_skb;
1698 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1699 				goto found_fin_ok;
1700 			WARN(!(flags & MSG_PEEK),
1701 			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1702 			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1703 		}
1704 
1705 		/* Well, if we have a backlog, try to process it now. */
1706 
1707 		if (copied >= target && !sk->sk_backlog.tail)
1708 			break;
1709 
1710 		if (copied) {
1711 			if (sk->sk_err ||
1712 			    sk->sk_state == TCP_CLOSE ||
1713 			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1714 			    !timeo ||
1715 			    signal_pending(current))
1716 				break;
1717 		} else {
1718 			if (sock_flag(sk, SOCK_DONE))
1719 				break;
1720 
1721 			if (sk->sk_err) {
1722 				copied = sock_error(sk);
1723 				break;
1724 			}
1725 
1726 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1727 				break;
1728 
1729 			if (sk->sk_state == TCP_CLOSE) {
1730 				if (!sock_flag(sk, SOCK_DONE)) {
1731 					/* This occurs when user tries to read
1732 					/* This occurs when the user tries to read
1733 					 * from a never-connected socket.
1734 					copied = -ENOTCONN;
1735 					break;
1736 				}
1737 				break;
1738 			}
1739 
1740 			if (!timeo) {
1741 				copied = -EAGAIN;
1742 				break;
1743 			}
1744 
1745 			if (signal_pending(current)) {
1746 				copied = sock_intr_errno(timeo);
1747 				break;
1748 			}
1749 		}
1750 
1751 		tcp_cleanup_rbuf(sk, copied);
1752 
1753 		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1754 			/* Install new reader */
1755 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1756 				user_recv = current;
1757 				tp->ucopy.task = user_recv;
1758 				tp->ucopy.msg = msg;
1759 			}
1760 
1761 			tp->ucopy.len = len;
1762 
1763 			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1764 				!(flags & (MSG_PEEK | MSG_TRUNC)));
1765 
1766 			/* Ugly... If prequeue is not empty, we have to
1767 			 * process it before releasing socket, otherwise
1768 			 * order will be broken at second iteration.
1769 			 * More elegant solution is required!!!
1770 			 *
1771 			 * Look: we have the following (pseudo)queues:
1772 			 *
1773 			 * 1. packets in flight
1774 			 * 2. backlog
1775 			 * 3. prequeue
1776 			 * 4. receive_queue
1777 			 *
1778 			 * Each queue can be processed only if the next ones
1779 			 * are empty. At this point we have empty receive_queue.
1780 			 * But prequeue _can_ be not empty after 2nd iteration,
1781 			 * when we jumped to start of loop because backlog
1782 			 * processing added something to receive_queue.
1783 			 * We cannot release_sock(), because backlog contains
1784 			 * packets that arrived _after_ the prequeued ones.
1785 			 *
1786 			 * In short, the algorithm is clear --- process all
1787 			 * the queues in order. We could do this more directly,
1788 			 * requeueing packets from the backlog to the prequeue if
1789 			 * it is not empty. That is more elegant, but eats cycles,
1790 			 * unfortunately.
1791 			 */
1792 			if (!skb_queue_empty(&tp->ucopy.prequeue))
1793 				goto do_prequeue;
1794 
1795 			/* __ Set realtime policy in scheduler __ */
1796 		}
1797 
1798 		if (copied >= target) {
1799 			/* Do not sleep, just process backlog. */
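			/* (release_sock() processes the accumulated socket
			 * backlog as a side effect; we retake the lock right
			 * away.)
			 */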
1800 			release_sock(sk);
1801 			lock_sock(sk);
1802 		} else {
1803 			sk_wait_data(sk, &timeo, last);
1804 		}
1805 
1806 		if (user_recv) {
1807 			int chunk;
1808 
1809 			/* __ Restore normal policy in scheduler __ */
1810 
1811 			chunk = len - tp->ucopy.len;
1812 			if (chunk != 0) {
1813 				NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1814 				len -= chunk;
1815 				copied += chunk;
1816 			}
1817 
1818 			if (tp->rcv_nxt == tp->copied_seq &&
1819 			    !skb_queue_empty(&tp->ucopy.prequeue)) {
1820 do_prequeue:
1821 				tcp_prequeue_process(sk);
1822 
1823 				chunk = len - tp->ucopy.len;
1824 				if (chunk != 0) {
1825 					NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1826 					len -= chunk;
1827 					copied += chunk;
1828 				}
1829 			}
1830 		}
1831 		if ((flags & MSG_PEEK) &&
1832 		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
1833 			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
1834 					    current->comm,
1835 					    task_pid_nr(current));
1836 			peek_seq = tp->copied_seq;
1837 		}
1838 		continue;
1839 
1840 	found_ok_skb:
1841 		/* Ok so how much can we use? */
1842 		used = skb->len - offset;
1843 		if (len < used)
1844 			used = len;
1845 
1846 		/* Do we have urgent data here? */
1847 		if (tp->urg_data) {
1848 			u32 urg_offset = tp->urg_seq - *seq;
1849 			if (urg_offset < used) {
1850 				if (!urg_offset) {
1851 					if (!sock_flag(sk, SOCK_URGINLINE)) {
1852 						++*seq;
1853 						urg_hole++;
1854 						offset++;
1855 						used--;
1856 						if (!used)
1857 							goto skip_copy;
1858 					}
1859 				} else
1860 					used = urg_offset;
1861 			}
1862 		}
1863 
1864 		if (!(flags & MSG_TRUNC)) {
1865 			err = skb_copy_datagram_msg(skb, offset, msg, used);
1866 			if (err) {
1867 				/* Exception. Bailout! */
1868 				if (!copied)
1869 					copied = -EFAULT;
1870 				break;
1871 			}
1872 		}
1873 
1874 		*seq += used;
1875 		copied += used;
1876 		len -= used;
1877 
1878 		tcp_rcv_space_adjust(sk);
1879 
1880 skip_copy:
1881 		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1882 			tp->urg_data = 0;
1883 			tcp_fast_path_check(sk);
1884 		}
1885 		if (used + offset < skb->len)
1886 			continue;
1887 
1888 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1889 			goto found_fin_ok;
1890 		if (!(flags & MSG_PEEK))
1891 			sk_eat_skb(sk, skb);
1892 		continue;
1893 
1894 	found_fin_ok:
1895 		/* Process the FIN. */
1896 		++*seq;
1897 		if (!(flags & MSG_PEEK))
1898 			sk_eat_skb(sk, skb);
1899 		break;
1900 	} while (len > 0);
1901 
1902 	if (user_recv) {
1903 		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1904 			int chunk;
1905 
1906 			tp->ucopy.len = copied > 0 ? len : 0;
1907 
1908 			tcp_prequeue_process(sk);
1909 
1910 			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1911 				NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1912 				len -= chunk;
1913 				copied += chunk;
1914 			}
1915 		}
1916 
1917 		tp->ucopy.task = NULL;
1918 		tp->ucopy.len = 0;
1919 	}
1920 
1921 	/* According to UNIX98, msg_name/msg_namelen are ignored
1922 	 * on a connected socket. I was just happy when I found this 8) --ANK
1923 	 */
1924 
1925 	/* Clean up data we have read: This will do ACK frames. */
1926 	tcp_cleanup_rbuf(sk, copied);
1927 
1928 	release_sock(sk);
1929 	return copied;
1930 
1931 out:
1932 	release_sock(sk);
1933 	return err;
1934 
1935 recv_urg:
1936 	err = tcp_recv_urg(sk, msg, len, flags);
1937 	goto out;
1938 
1939 recv_sndq:
1940 	err = tcp_peek_sndq(sk, msg, len);
1941 	goto out;
1942 }
1943 EXPORT_SYMBOL(tcp_recvmsg);
1944 
1945 void tcp_set_state(struct sock *sk, int state)
1946 {
1947 	int oldstate = sk->sk_state;
1948 
1949 	switch (state) {
1950 	case TCP_ESTABLISHED:
1951 		if (oldstate != TCP_ESTABLISHED)
1952 			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1953 		break;
1954 
1955 	case TCP_CLOSE:
1956 		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1957 			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
1958 
1959 		sk->sk_prot->unhash(sk);
1960 		if (inet_csk(sk)->icsk_bind_hash &&
1961 		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1962 			inet_put_port(sk);
1963 		/* fall through */
1964 	default:
1965 		if (oldstate == TCP_ESTABLISHED)
1966 			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1967 	}
1968 
1969 	/* Change state AFTER socket is unhashed to avoid closed
1970 	 * socket sitting in hash tables.
1971 	 */
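	/* sk_state_store() pairs with the lockless sk_state_load() used by
	 * readers such as tcp_get_info() below.
	 */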
1972 	sk_state_store(sk, state);
1973 
1974 #ifdef STATE_TRACE
1975 	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
1976 #endif
1977 }
1978 EXPORT_SYMBOL_GPL(tcp_set_state);
1979 
1980 /*
1981  *	State processing on a close. This implements the state shift for
1982  *	sending our FIN frame. Note that we only send a FIN for some
1983  *	states. A shutdown() may have already sent the FIN, or we may be
1984  *	closed.
1985  */
1986 
1987 static const unsigned char new_state[16] = {
1988   /* current state:        new state:      action:	*/
1989   [0 /* (Invalid) */]	= TCP_CLOSE,
1990   [TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1991   [TCP_SYN_SENT]	= TCP_CLOSE,
1992   [TCP_SYN_RECV]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1993   [TCP_FIN_WAIT1]	= TCP_FIN_WAIT1,
1994   [TCP_FIN_WAIT2]	= TCP_FIN_WAIT2,
1995   [TCP_TIME_WAIT]	= TCP_CLOSE,
1996   [TCP_CLOSE]		= TCP_CLOSE,
1997   [TCP_CLOSE_WAIT]	= TCP_LAST_ACK  | TCP_ACTION_FIN,
1998   [TCP_LAST_ACK]	= TCP_LAST_ACK,
1999   [TCP_LISTEN]		= TCP_CLOSE,
2000   [TCP_CLOSING]		= TCP_CLOSING,
2001   [TCP_NEW_SYN_RECV]	= TCP_CLOSE,	/* should not happen ! */
2002 };
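
/* A worked example of the table above (a reading aid, not new behaviour):
 * close() on an ESTABLISHED socket yields TCP_FIN_WAIT1 | TCP_ACTION_FIN,
 * so tcp_close_state() below moves the socket to FIN-WAIT-1 and tells its
 * caller to transmit a FIN; CLOSE_WAIT yields LAST-ACK plus a FIN, while
 * SYN-SENT or LISTEN simply drop to CLOSE with nothing left to send.
 */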
2003 
2004 static int tcp_close_state(struct sock *sk)
2005 {
2006 	int next = (int)new_state[sk->sk_state];
2007 	int ns = next & TCP_STATE_MASK;
2008 
2009 	tcp_set_state(sk, ns);
2010 
2011 	return next & TCP_ACTION_FIN;
2012 }
2013 
2014 /*
2015  *	Shutdown the sending side of a connection. Much like close except
2016  *	that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
2017  */
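/* Note: a userspace shutdown(fd, SHUT_WR) arrives here via inet_shutdown()
 * with SEND_SHUTDOWN set in @how, so a plain SHUT_RD is a no-op below.
 */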
2018 
2019 void tcp_shutdown(struct sock *sk, int how)
2020 {
2021 	/*	We need to grab some memory, and put together a FIN,
2022 	 *	and then put it into the queue to be sent.
2023 	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2024 	 */
2025 	if (!(how & SEND_SHUTDOWN))
2026 		return;
2027 
2028 	/* If we've already sent a FIN, or it's a closed state, skip this. */
2029 	if ((1 << sk->sk_state) &
2030 	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2031 	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2032 		/* Clear out any half completed packets.  FIN if needed. */
2033 		if (tcp_close_state(sk))
2034 			tcp_send_fin(sk);
2035 	}
2036 }
2037 EXPORT_SYMBOL(tcp_shutdown);
2038 
2039 bool tcp_check_oom(struct sock *sk, int shift)
2040 {
2041 	bool too_many_orphans, out_of_socket_memory;
2042 
2043 	too_many_orphans = tcp_too_many_orphans(sk, shift);
2044 	out_of_socket_memory = tcp_out_of_memory(sk);
2045 
2046 	if (too_many_orphans)
2047 		net_info_ratelimited("too many orphaned sockets\n");
2048 	if (out_of_socket_memory)
2049 		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2050 	return too_many_orphans || out_of_socket_memory;
2051 }
2052 
2053 void tcp_close(struct sock *sk, long timeout)
2054 {
2055 	struct sk_buff *skb;
2056 	int data_was_unread = 0;
2057 	int state;
2058 
2059 	lock_sock(sk);
2060 	sk->sk_shutdown = SHUTDOWN_MASK;
2061 
2062 	if (sk->sk_state == TCP_LISTEN) {
2063 		tcp_set_state(sk, TCP_CLOSE);
2064 
2065 		/* Special case. */
2066 		inet_csk_listen_stop(sk);
2067 
2068 		goto adjudge_to_death;
2069 	}
2070 
2071 	/*  We need to flush the recv. buffs.  We do this only on the
2072 	 *  descriptor close, not protocol-sourced closes, because the
2073 	 *  reader process may not have drained the data yet!
2074 	 */
2075 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2076 		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2077 
2078 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2079 			len--;
2080 		data_was_unread += len;
2081 		__kfree_skb(skb);
2082 	}
2083 
2084 	sk_mem_reclaim(sk);
2085 
2086 	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
2087 	if (sk->sk_state == TCP_CLOSE)
2088 		goto adjudge_to_death;
2089 
2090 	/* As outlined in RFC 2525, section 2.17, we send a RST here because
2091 	 * data was lost. To witness the awful effects of the old behavior of
2092 	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
2093 	 * GET in an FTP client, suspend the process, wait for the client to
2094 	 * advertise a zero window, then kill -9 the FTP client, wheee...
2095 	 * Note: timeout is always zero in such a case.
2096 	 */
2097 	if (unlikely(tcp_sk(sk)->repair)) {
2098 		sk->sk_prot->disconnect(sk, 0);
2099 	} else if (data_was_unread) {
2100 		/* Unread data was tossed, zap the connection. */
2101 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2102 		tcp_set_state(sk, TCP_CLOSE);
2103 		tcp_send_active_reset(sk, sk->sk_allocation);
2104 	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2105 		/* Check zero linger _after_ checking for unread data. */
2106 		sk->sk_prot->disconnect(sk, 0);
2107 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2108 	} else if (tcp_close_state(sk)) {
2109 		/* We FIN if the application ate all the data before
2110 		 * zapping the connection.
2111 		 */
2112 
2113 		/* RED-PEN. Formally speaking, we have broken the TCP state
2114 		 * machine. State transitions:
2115 		 *
2116 		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
2117 		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
2118 		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
2119 		 *
2120 		 * are legal only when FIN has been sent (i.e. in window),
2121 		 * rather than queued out of window. Purists may complain.
2122 		 *
2123 		 * F.e. "RFC state" is ESTABLISHED,
2124 		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
2125 		 *
2126 		 * The visible deviations are that we sometimes
2127 		 * enter the time-wait state when it is not really required
2128 		 * (harmless), and do not send active resets when they are
2129 		 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
2130 		 * they look like CLOSING or LAST_ACK to Linux).
2131 		 * Probably, I missed some more corner cases.
2132 		 * 						--ANK
2133 		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2134 		 * in a single packet! (May consider it later but will
2135 		 * probably need API support or TCP_CORK SYN-ACK until
2136 		 * data is written and socket is closed.)
2137 		 */
2138 		tcp_send_fin(sk);
2139 	}
2140 
2141 	sk_stream_wait_close(sk, timeout);
2142 
2143 adjudge_to_death:
2144 	state = sk->sk_state;
2145 	sock_hold(sk);
2146 	sock_orphan(sk);
2147 
2148 	/* It is the last release_sock in its life. It will remove backlog. */
2149 	release_sock(sk);
2150 
2151 
2152 	/* Now socket is owned by kernel and we acquire BH lock
2153 	 * to finish close. No need to check for user refs.
2154 	 */
2155 	local_bh_disable();
2156 	bh_lock_sock(sk);
2157 	WARN_ON(sock_owned_by_user(sk));
2158 
2159 	percpu_counter_inc(sk->sk_prot->orphan_count);
2160 
2161 	/* Have we already been destroyed by a softirq or backlog? */
2162 	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2163 		goto out;
2164 
2165 	/*	This is a (useful) BSD violation of the RFC. There is a
2166 	 *	problem with TCP as specified, in that the other end could
2167 	 *	keep a socket open forever with no application left at this end.
2168 	 *	We use a 1 minute timeout (about the same as BSD) and then kill
2169 	 *	our end. If they send after that then tough - BUT it is long
2170 	 *	enough that we won't repeat the old "4*rto = almost no time"
2171 	 *	premature-reset mistake.
2172 	 *
2173 	 *	Nope, it was not a mistake. It is really desired behaviour,
2174 	 *	e.g. on HTTP servers, where such sockets are useless but
2175 	 *	consume significant resources. Let's do it with the special
2176 	 *	linger2 option.					--ANK
2177 	 */
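	/* tp->linger2 is set by the TCP_LINGER2 socket option handled in
	 * do_tcp_setsockopt() below: a negative value there means "reset
	 * instead of lingering", which is why linger2 < 0 sends an active
	 * reset here rather than entering FIN-WAIT-2/TIME-WAIT.
	 */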
2178 
2179 	if (sk->sk_state == TCP_FIN_WAIT2) {
2180 		struct tcp_sock *tp = tcp_sk(sk);
2181 		if (tp->linger2 < 0) {
2182 			tcp_set_state(sk, TCP_CLOSE);
2183 			tcp_send_active_reset(sk, GFP_ATOMIC);
2184 			__NET_INC_STATS(sock_net(sk),
2185 					LINUX_MIB_TCPABORTONLINGER);
2186 		} else {
2187 			const int tmo = tcp_fin_time(sk);
2188 
2189 			if (tmo > TCP_TIMEWAIT_LEN) {
2190 				inet_csk_reset_keepalive_timer(sk,
2191 						tmo - TCP_TIMEWAIT_LEN);
2192 			} else {
2193 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2194 				goto out;
2195 			}
2196 		}
2197 	}
2198 	if (sk->sk_state != TCP_CLOSE) {
2199 		sk_mem_reclaim(sk);
2200 		if (tcp_check_oom(sk, 0)) {
2201 			tcp_set_state(sk, TCP_CLOSE);
2202 			tcp_send_active_reset(sk, GFP_ATOMIC);
2203 			__NET_INC_STATS(sock_net(sk),
2204 					LINUX_MIB_TCPABORTONMEMORY);
2205 		}
2206 	}
2207 
2208 	if (sk->sk_state == TCP_CLOSE) {
2209 		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2210 		/* We could get here with a non-NULL req if the socket is
2211 		 * aborted (e.g., closed with unread data) before 3WHS
2212 		 * finishes.
2213 		 */
2214 		if (req)
2215 			reqsk_fastopen_remove(sk, req, false);
2216 		inet_csk_destroy_sock(sk);
2217 	}
2218 	/* Otherwise, socket is reprieved until protocol close. */
2219 
2220 out:
2221 	bh_unlock_sock(sk);
2222 	local_bh_enable();
2223 	sock_put(sk);
2224 }
2225 EXPORT_SYMBOL(tcp_close);
2226 
2227 /* These states need RST on ABORT according to RFC793 */
2228 
2229 static inline bool tcp_need_reset(int state)
2230 {
2231 	return (1 << state) &
2232 	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2233 		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2234 }
2235 
2236 int tcp_disconnect(struct sock *sk, int flags)
2237 {
2238 	struct inet_sock *inet = inet_sk(sk);
2239 	struct inet_connection_sock *icsk = inet_csk(sk);
2240 	struct tcp_sock *tp = tcp_sk(sk);
2241 	int err = 0;
2242 	int old_state = sk->sk_state;
2243 
2244 	if (old_state != TCP_CLOSE)
2245 		tcp_set_state(sk, TCP_CLOSE);
2246 
2247 	/* ABORT function of RFC793 */
2248 	if (old_state == TCP_LISTEN) {
2249 		inet_csk_listen_stop(sk);
2250 	} else if (unlikely(tp->repair)) {
2251 		sk->sk_err = ECONNABORTED;
2252 	} else if (tcp_need_reset(old_state) ||
2253 		   (tp->snd_nxt != tp->write_seq &&
2254 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2255 		/* The last check adjusts for the discrepancy between Linux
2256 		 * and RFC states.
2257 		 */
2258 		tcp_send_active_reset(sk, gfp_any());
2259 		sk->sk_err = ECONNRESET;
2260 	} else if (old_state == TCP_SYN_SENT)
2261 		sk->sk_err = ECONNRESET;
2262 
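	/* Tear the connection state down to a clean slate so the socket can
	 * be reused: purge every queue, stop the retransmit timers, and
	 * reset congestion control and cached receive-option state.
	 */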
2263 	tcp_clear_xmit_timers(sk);
2264 	__skb_queue_purge(&sk->sk_receive_queue);
2265 	tcp_write_queue_purge(sk);
2266 	skb_rbtree_purge(&tp->out_of_order_queue);
2267 
2268 	inet->inet_dport = 0;
2269 
2270 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2271 		inet_reset_saddr(sk);
2272 
2273 	sk->sk_shutdown = 0;
2274 	sock_reset_flag(sk, SOCK_DONE);
2275 	tp->srtt_us = 0;
2276 	tp->write_seq += tp->max_window + 2;
2277 	if (tp->write_seq == 0)
2278 		tp->write_seq = 1;
2279 	icsk->icsk_backoff = 0;
2280 	tp->snd_cwnd = 2;
2281 	icsk->icsk_probes_out = 0;
2282 	tp->packets_out = 0;
2283 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2284 	tp->snd_cwnd_cnt = 0;
2285 	tp->window_clamp = 0;
2286 	tcp_set_ca_state(sk, TCP_CA_Open);
2287 	tcp_clear_retrans(tp);
2288 	inet_csk_delack_init(sk);
2289 	tcp_init_send_head(sk);
2290 	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2291 	__sk_dst_reset(sk);
2292 
2293 	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2294 
2295 	sk->sk_error_report(sk);
2296 	return err;
2297 }
2298 EXPORT_SYMBOL(tcp_disconnect);
2299 
2300 static inline bool tcp_can_repair_sock(const struct sock *sk)
2301 {
2302 	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2303 		(sk->sk_state != TCP_LISTEN);
2304 }
2305 
2306 static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
2307 {
2308 	struct tcp_repair_window opt;
2309 
2310 	if (!tp->repair)
2311 		return -EPERM;
2312 
2313 	if (len != sizeof(opt))
2314 		return -EINVAL;
2315 
2316 	if (copy_from_user(&opt, optbuf, sizeof(opt)))
2317 		return -EFAULT;
2318 
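	/* Reject window settings that are internally inconsistent or that
	 * run ahead of what this end has already received and acknowledged.
	 */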
2319 	if (opt.max_window < opt.snd_wnd)
2320 		return -EINVAL;
2321 
2322 	if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
2323 		return -EINVAL;
2324 
2325 	if (after(opt.rcv_wup, tp->rcv_nxt))
2326 		return -EINVAL;
2327 
2328 	tp->snd_wl1	= opt.snd_wl1;
2329 	tp->snd_wnd	= opt.snd_wnd;
2330 	tp->max_window	= opt.max_window;
2331 
2332 	tp->rcv_wnd	= opt.rcv_wnd;
2333 	tp->rcv_wup	= opt.rcv_wup;
2334 
2335 	return 0;
2336 }
2337 
2338 static int tcp_repair_options_est(struct tcp_sock *tp,
2339 		struct tcp_repair_opt __user *optbuf, unsigned int len)
2340 {
2341 	struct tcp_repair_opt opt;
2342 
2343 	while (len >= sizeof(opt)) {
2344 		if (copy_from_user(&opt, optbuf, sizeof(opt)))
2345 			return -EFAULT;
2346 
2347 		optbuf++;
2348 		len -= sizeof(opt);
2349 
2350 		switch (opt.opt_code) {
2351 		case TCPOPT_MSS:
2352 			tp->rx_opt.mss_clamp = opt.opt_val;
2353 			break;
2354 		case TCPOPT_WINDOW:
2355 			{
2356 				u16 snd_wscale = opt.opt_val & 0xFFFF;
2357 				u16 rcv_wscale = opt.opt_val >> 16;
2358 
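				/* RFC 7323 (like RFC 1323 before it) caps the
				 * window scale shift count at 14.
				 */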
2359 				if (snd_wscale > 14 || rcv_wscale > 14)
2360 					return -EFBIG;
2361 
2362 				tp->rx_opt.snd_wscale = snd_wscale;
2363 				tp->rx_opt.rcv_wscale = rcv_wscale;
2364 				tp->rx_opt.wscale_ok = 1;
2365 			}
2366 			break;
2367 		case TCPOPT_SACK_PERM:
2368 			if (opt.opt_val != 0)
2369 				return -EINVAL;
2370 
2371 			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2372 			if (sysctl_tcp_fack)
2373 				tcp_enable_fack(tp);
2374 			break;
2375 		case TCPOPT_TIMESTAMP:
2376 			if (opt.opt_val != 0)
2377 				return -EINVAL;
2378 
2379 			tp->rx_opt.tstamp_ok = 1;
2380 			break;
2381 		}
2382 	}
2383 
2384 	return 0;
2385 }
2386 
2387 /*
2388  *	Socket option code for TCP.
2389  */
2390 static int do_tcp_setsockopt(struct sock *sk, int level,
2391 		int optname, char __user *optval, unsigned int optlen)
2392 {
2393 	struct tcp_sock *tp = tcp_sk(sk);
2394 	struct inet_connection_sock *icsk = inet_csk(sk);
2395 	struct net *net = sock_net(sk);
2396 	int val;
2397 	int err = 0;
2398 
2399 	/* These are data/string values, all the others are ints */
2400 	switch (optname) {
2401 	case TCP_CONGESTION: {
2402 		char name[TCP_CA_NAME_MAX];
2403 
2404 		if (optlen < 1)
2405 			return -EINVAL;
2406 
2407 		val = strncpy_from_user(name, optval,
2408 					min_t(long, TCP_CA_NAME_MAX-1, optlen));
2409 		if (val < 0)
2410 			return -EFAULT;
2411 		name[val] = 0;
2412 
2413 		lock_sock(sk);
2414 		err = tcp_set_congestion_control(sk, name);
2415 		release_sock(sk);
2416 		return err;
2417 	}
2418 	default:
2419 		/* All other options take an int value, handled below. */
2420 		break;
2421 	}
2422 
2423 	if (optlen < sizeof(int))
2424 		return -EINVAL;
2425 
2426 	if (get_user(val, (int __user *)optval))
2427 		return -EFAULT;
2428 
2429 	lock_sock(sk);
2430 
2431 	switch (optname) {
2432 	case TCP_MAXSEG:
2433 		/* Values greater than interface MTU won't take effect. However
2434 		 * at the point when this call is done we typically don't yet
2435 		 * know which interface is going to be used */
2436 		if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2437 			err = -EINVAL;
2438 			break;
2439 		}
2440 		tp->rx_opt.user_mss = val;
2441 		break;
2442 
2443 	case TCP_NODELAY:
2444 		if (val) {
2445 			/* TCP_NODELAY is weaker than TCP_CORK, so that
2446 			 * this option on corked socket is remembered, but
2447 			 * it is not activated until cork is cleared.
2448 			 *
2449 			 * However, when TCP_NODELAY is set we make
2450 			 * an explicit push, which overrides even TCP_CORK
2451 			 * for currently queued segments.
2452 			 */
2453 			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2454 			tcp_push_pending_frames(sk);
2455 		} else {
2456 			tp->nonagle &= ~TCP_NAGLE_OFF;
2457 		}
2458 		break;
2459 
2460 	case TCP_THIN_LINEAR_TIMEOUTS:
2461 		if (val < 0 || val > 1)
2462 			err = -EINVAL;
2463 		else
2464 			tp->thin_lto = val;
2465 		break;
2466 
2467 	case TCP_THIN_DUPACK:
2468 		if (val < 0 || val > 1)
2469 			err = -EINVAL;
2470 		else {
2471 			tp->thin_dupack = val;
2472 			if (tp->thin_dupack)
2473 				tcp_disable_early_retrans(tp);
2474 		}
2475 		break;
2476 
2477 	case TCP_REPAIR:
2478 		if (!tcp_can_repair_sock(sk))
2479 			err = -EPERM;
2480 		else if (val == 1) {
2481 			tp->repair = 1;
2482 			sk->sk_reuse = SK_FORCE_REUSE;
2483 			tp->repair_queue = TCP_NO_QUEUE;
2484 		} else if (val == 0) {
2485 			tp->repair = 0;
2486 			sk->sk_reuse = SK_NO_REUSE;
2487 			tcp_send_window_probe(sk);
2488 		} else
2489 			err = -EINVAL;
2490 
2491 		break;
2492 
2493 	case TCP_REPAIR_QUEUE:
2494 		if (!tp->repair)
2495 			err = -EPERM;
2496 		else if (val < TCP_QUEUES_NR)
2497 			tp->repair_queue = val;
2498 		else
2499 			err = -EINVAL;
2500 		break;
2501 
2502 	case TCP_QUEUE_SEQ:
2503 		if (sk->sk_state != TCP_CLOSE)
2504 			err = -EPERM;
2505 		else if (tp->repair_queue == TCP_SEND_QUEUE)
2506 			tp->write_seq = val;
2507 		else if (tp->repair_queue == TCP_RECV_QUEUE)
2508 			tp->rcv_nxt = val;
2509 		else
2510 			err = -EINVAL;
2511 		break;
2512 
2513 	case TCP_REPAIR_OPTIONS:
2514 		if (!tp->repair)
2515 			err = -EINVAL;
2516 		else if (sk->sk_state == TCP_ESTABLISHED)
2517 			err = tcp_repair_options_est(tp,
2518 					(struct tcp_repair_opt __user *)optval,
2519 					optlen);
2520 		else
2521 			err = -EPERM;
2522 		break;
2523 
2524 	case TCP_CORK:
2525 		/* When set, this indicates to always queue non-full frames.
2526 		 * Later the user clears this option and we transmit
2527 		 * any pending partial frames in the queue.  This is
2528 		 * meant to be used alongside sendfile() to get properly
2529 		 * filled frames when the user (for example) must write
2530 		 * out headers with a write() call first and then use
2531 		 * sendfile to send out the data parts.
2532 		 *
2533 		 * TCP_CORK can be set together with TCP_NODELAY and it is
2534 		 * stronger than TCP_NODELAY.
2535 		 */
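		/* A minimal userspace sketch of the pattern described above
		 * (illustrative only; fd, hdr, hdr_len, file_fd and file_len
		 * are placeholders):
		 *
		 *	int on = 1, off = 0;
		 *
		 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
		 *	write(fd, hdr, hdr_len);
		 *	sendfile(fd, file_fd, NULL, file_len);
		 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
		 *
		 * Clearing TCP_CORK at the end is what flushes the final
		 * partial frame, as explained above.
		 */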
2536 		if (val) {
2537 			tp->nonagle |= TCP_NAGLE_CORK;
2538 		} else {
2539 			tp->nonagle &= ~TCP_NAGLE_CORK;
2540 			if (tp->nonagle&TCP_NAGLE_OFF)
2541 				tp->nonagle |= TCP_NAGLE_PUSH;
2542 			tcp_push_pending_frames(sk);
2543 		}
2544 		break;
2545 
2546 	case TCP_KEEPIDLE:
2547 		if (val < 1 || val > MAX_TCP_KEEPIDLE)
2548 			err = -EINVAL;
2549 		else {
2550 			tp->keepalive_time = val * HZ;
2551 			if (sock_flag(sk, SOCK_KEEPOPEN) &&
2552 			    !((1 << sk->sk_state) &
2553 			      (TCPF_CLOSE | TCPF_LISTEN))) {
2554 				u32 elapsed = keepalive_time_elapsed(tp);
2555 				if (tp->keepalive_time > elapsed)
2556 					elapsed = tp->keepalive_time - elapsed;
2557 				else
2558 					elapsed = 0;
2559 				inet_csk_reset_keepalive_timer(sk, elapsed);
2560 			}
2561 		}
2562 		break;
2563 	case TCP_KEEPINTVL:
2564 		if (val < 1 || val > MAX_TCP_KEEPINTVL)
2565 			err = -EINVAL;
2566 		else
2567 			tp->keepalive_intvl = val * HZ;
2568 		break;
2569 	case TCP_KEEPCNT:
2570 		if (val < 1 || val > MAX_TCP_KEEPCNT)
2571 			err = -EINVAL;
2572 		else
2573 			tp->keepalive_probes = val;
2574 		break;
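	/* A minimal userspace sketch of the three keepalive knobs above
	 * (illustrative only; fd is assumed to be a connected TCP socket):
	 *
	 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
	 *
	 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
	 *
	 * The timers only run once SOCK_KEEPOPEN is set via SO_KEEPALIVE;
	 * the TCP_* options merely tune idle time, probe interval and count.
	 */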
2575 	case TCP_SYNCNT:
2576 		if (val < 1 || val > MAX_TCP_SYNCNT)
2577 			err = -EINVAL;
2578 		else
2579 			icsk->icsk_syn_retries = val;
2580 		break;
2581 
2582 	case TCP_SAVE_SYN:
2583 		if (val < 0 || val > 1)
2584 			err = -EINVAL;
2585 		else
2586 			tp->save_syn = val;
2587 		break;
2588 
2589 	case TCP_LINGER2:
2590 		if (val < 0)
2591 			tp->linger2 = -1;
2592 		else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
2593 			tp->linger2 = 0;
2594 		else
2595 			tp->linger2 = val * HZ;
2596 		break;
2597 
2598 	case TCP_DEFER_ACCEPT:
2599 		/* Translate value in seconds to number of retransmits */
2600 		icsk->icsk_accept_queue.rskq_defer_accept =
2601 			secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2602 					TCP_RTO_MAX / HZ);
2603 		break;
2604 
2605 	case TCP_WINDOW_CLAMP:
2606 		if (!val) {
2607 			if (sk->sk_state != TCP_CLOSE) {
2608 				err = -EINVAL;
2609 				break;
2610 			}
2611 			tp->window_clamp = 0;
2612 		} else
2613 			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2614 						SOCK_MIN_RCVBUF / 2 : val;
2615 		break;
2616 
2617 	case TCP_QUICKACK:
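		/* 0 re-enables delayed-ACK "pingpong" mode.  A non-zero value
		 * enables quickack and may push out an already scheduled ACK;
		 * if that value is even, pingpong mode is switched back on
		 * once the ACK has been forced, making the quickack one-shot.
		 */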
2618 		if (!val) {
2619 			icsk->icsk_ack.pingpong = 1;
2620 		} else {
2621 			icsk->icsk_ack.pingpong = 0;
2622 			if ((1 << sk->sk_state) &
2623 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2624 			    inet_csk_ack_scheduled(sk)) {
2625 				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2626 				tcp_cleanup_rbuf(sk, 1);
2627 				if (!(val & 1))
2628 					icsk->icsk_ack.pingpong = 1;
2629 			}
2630 		}
2631 		break;
2632 
2633 #ifdef CONFIG_TCP_MD5SIG
2634 	case TCP_MD5SIG:
2635 		/* Read the IP->Key mappings from userspace */
2636 		err = tp->af_specific->md5_parse(sk, optval, optlen);
2637 		break;
2638 #endif
2639 	case TCP_USER_TIMEOUT:
2640 		/* Cap the max time in ms TCP will retry or probe the window
2641 		 * before giving up and aborting (ETIMEDOUT) a connection.
2642 		 */
2643 		if (val < 0)
2644 			err = -EINVAL;
2645 		else
2646 			icsk->icsk_user_timeout = msecs_to_jiffies(val);
2647 		break;
2648 
2649 	case TCP_FASTOPEN:
2650 		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2651 		    TCPF_LISTEN))) {
2652 			tcp_fastopen_init_key_once(true);
2653 
2654 			fastopen_queue_tune(sk, val);
2655 		} else {
2656 			err = -EINVAL;
2657 		}
2658 		break;
2659 	case TCP_TIMESTAMP:
2660 		if (!tp->repair)
2661 			err = -EPERM;
2662 		else
2663 			tp->tsoffset = val - tcp_time_stamp;
2664 		break;
2665 	case TCP_REPAIR_WINDOW:
2666 		err = tcp_repair_set_window(tp, optval, optlen);
2667 		break;
2668 	case TCP_NOTSENT_LOWAT:
2669 		tp->notsent_lowat = val;
2670 		sk->sk_write_space(sk);
2671 		break;
2672 	default:
2673 		err = -ENOPROTOOPT;
2674 		break;
2675 	}
2676 
2677 	release_sock(sk);
2678 	return err;
2679 }
2680 
2681 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2682 		   unsigned int optlen)
2683 {
2684 	const struct inet_connection_sock *icsk = inet_csk(sk);
2685 
2686 	if (level != SOL_TCP)
2687 		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2688 						     optval, optlen);
2689 	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2690 }
2691 EXPORT_SYMBOL(tcp_setsockopt);
2692 
2693 #ifdef CONFIG_COMPAT
2694 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2695 			  char __user *optval, unsigned int optlen)
2696 {
2697 	if (level != SOL_TCP)
2698 		return inet_csk_compat_setsockopt(sk, level, optname,
2699 						  optval, optlen);
2700 	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2701 }
2702 EXPORT_SYMBOL(compat_tcp_setsockopt);
2703 #endif
2704 
2705 /* Return information about state of tcp endpoint in API format. */
2706 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2707 {
2708 	const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
2709 	const struct inet_connection_sock *icsk = inet_csk(sk);
2710 	u32 now = tcp_time_stamp, intv;
2711 	u64 rate64;
2712 	bool slow;
2713 	u32 rate;
2714 
2715 	memset(info, 0, sizeof(*info));
2716 	if (sk->sk_type != SOCK_STREAM)
2717 		return;
2718 
2719 	info->tcpi_state = sk_state_load(sk);
2720 
2721 	/* Report meaningful fields for all TCP states, including listeners */
2722 	rate = READ_ONCE(sk->sk_pacing_rate);
2723 	rate64 = rate != ~0U ? rate : ~0ULL;
2724 	info->tcpi_pacing_rate = rate64;
2725 
2726 	rate = READ_ONCE(sk->sk_max_pacing_rate);
2727 	rate64 = rate != ~0U ? rate : ~0ULL;
2728 	info->tcpi_max_pacing_rate = rate64;
2729 
2730 	info->tcpi_reordering = tp->reordering;
2731 	info->tcpi_snd_cwnd = tp->snd_cwnd;
2732 
2733 	if (info->tcpi_state == TCP_LISTEN) {
2734 		/* Listener aliased fields:
2735 		 * tcpi_unacked -> Number of children ready for accept()
2736 		 * tcpi_sacked  -> max backlog
2737 		 */
2738 		info->tcpi_unacked = sk->sk_ack_backlog;
2739 		info->tcpi_sacked = sk->sk_max_ack_backlog;
2740 		return;
2741 	}
2742 	info->tcpi_ca_state = icsk->icsk_ca_state;
2743 	info->tcpi_retransmits = icsk->icsk_retransmits;
2744 	info->tcpi_probes = icsk->icsk_probes_out;
2745 	info->tcpi_backoff = icsk->icsk_backoff;
2746 
2747 	if (tp->rx_opt.tstamp_ok)
2748 		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2749 	if (tcp_is_sack(tp))
2750 		info->tcpi_options |= TCPI_OPT_SACK;
2751 	if (tp->rx_opt.wscale_ok) {
2752 		info->tcpi_options |= TCPI_OPT_WSCALE;
2753 		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2754 		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2755 	}
2756 
2757 	if (tp->ecn_flags & TCP_ECN_OK)
2758 		info->tcpi_options |= TCPI_OPT_ECN;
2759 	if (tp->ecn_flags & TCP_ECN_SEEN)
2760 		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
2761 	if (tp->syn_data_acked)
2762 		info->tcpi_options |= TCPI_OPT_SYN_DATA;
2763 
2764 	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2765 	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2766 	info->tcpi_snd_mss = tp->mss_cache;
2767 	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2768 
2769 	info->tcpi_unacked = tp->packets_out;
2770 	info->tcpi_sacked = tp->sacked_out;
2771 
2772 	info->tcpi_lost = tp->lost_out;
2773 	info->tcpi_retrans = tp->retrans_out;
2774 	info->tcpi_fackets = tp->fackets_out;
2775 
2776 	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2777 	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2778 	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2779 
2780 	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2781 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2782 	info->tcpi_rtt = tp->srtt_us >> 3;
2783 	info->tcpi_rttvar = tp->mdev_us >> 2;
2784 	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2785 	info->tcpi_advmss = tp->advmss;
2786 
2787 	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2788 	info->tcpi_rcv_space = tp->rcvq_space.space;
2789 
2790 	info->tcpi_total_retrans = tp->total_retrans;
2791 
2792 	slow = lock_sock_fast(sk);
2793 
2794 	info->tcpi_bytes_acked = tp->bytes_acked;
2795 	info->tcpi_bytes_received = tp->bytes_received;
2796 	info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
2797 
2798 	unlock_sock_fast(sk, slow);
2799 
2800 	info->tcpi_segs_out = tp->segs_out;
2801 	info->tcpi_segs_in = tp->segs_in;
2802 
2803 	info->tcpi_min_rtt = tcp_min_rtt(tp);
2804 	info->tcpi_data_segs_in = tp->data_segs_in;
2805 	info->tcpi_data_segs_out = tp->data_segs_out;
2806 
2807 	info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
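	/* Export the delivery rate in bytes per second: delivered segments
	 * times the cached MSS, scaled by the sampled interval.
	 */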
2808 	rate = READ_ONCE(tp->rate_delivered);
2809 	intv = READ_ONCE(tp->rate_interval_us);
2810 	if (rate && intv) {
2811 		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
2812 		do_div(rate64, intv);
2813 		info->tcpi_delivery_rate = rate64;
2814 	}
2815 }
2816 EXPORT_SYMBOL_GPL(tcp_get_info);
2817 
2818 static int do_tcp_getsockopt(struct sock *sk, int level,
2819 		int optname, char __user *optval, int __user *optlen)
2820 {
2821 	struct inet_connection_sock *icsk = inet_csk(sk);
2822 	struct tcp_sock *tp = tcp_sk(sk);
2823 	struct net *net = sock_net(sk);
2824 	int val, len;
2825 
2826 	if (get_user(len, optlen))
2827 		return -EFAULT;
2828 
2829 	len = min_t(unsigned int, len, sizeof(int));
2830 
2831 	if (len < 0)
2832 		return -EINVAL;
2833 
2834 	switch (optname) {
2835 	case TCP_MAXSEG:
2836 		val = tp->mss_cache;
2837 		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2838 			val = tp->rx_opt.user_mss;
2839 		if (tp->repair)
2840 			val = tp->rx_opt.mss_clamp;
2841 		break;
2842 	case TCP_NODELAY:
2843 		val = !!(tp->nonagle&TCP_NAGLE_OFF);
2844 		break;
2845 	case TCP_CORK:
2846 		val = !!(tp->nonagle&TCP_NAGLE_CORK);
2847 		break;
2848 	case TCP_KEEPIDLE:
2849 		val = keepalive_time_when(tp) / HZ;
2850 		break;
2851 	case TCP_KEEPINTVL:
2852 		val = keepalive_intvl_when(tp) / HZ;
2853 		break;
2854 	case TCP_KEEPCNT:
2855 		val = keepalive_probes(tp);
2856 		break;
2857 	case TCP_SYNCNT:
2858 		val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
2859 		break;
2860 	case TCP_LINGER2:
2861 		val = tp->linger2;
2862 		if (val >= 0)
2863 			val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
2864 		break;
2865 	case TCP_DEFER_ACCEPT:
2866 		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
2867 				      TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
2868 		break;
2869 	case TCP_WINDOW_CLAMP:
2870 		val = tp->window_clamp;
2871 		break;
2872 	case TCP_INFO: {
2873 		struct tcp_info info;
2874 
2875 		if (get_user(len, optlen))
2876 			return -EFAULT;
2877 
2878 		tcp_get_info(sk, &info);
2879 
2880 		len = min_t(unsigned int, len, sizeof(info));
2881 		if (put_user(len, optlen))
2882 			return -EFAULT;
2883 		if (copy_to_user(optval, &info, len))
2884 			return -EFAULT;
2885 		return 0;
2886 	}
2887 	case TCP_CC_INFO: {
2888 		const struct tcp_congestion_ops *ca_ops;
2889 		union tcp_cc_info info;
2890 		size_t sz = 0;
2891 		int attr;
2892 
2893 		if (get_user(len, optlen))
2894 			return -EFAULT;
2895 
2896 		ca_ops = icsk->icsk_ca_ops;
2897 		if (ca_ops && ca_ops->get_info)
2898 			sz = ca_ops->get_info(sk, ~0U, &attr, &info);
2899 
2900 		len = min_t(unsigned int, len, sz);
2901 		if (put_user(len, optlen))
2902 			return -EFAULT;
2903 		if (copy_to_user(optval, &info, len))
2904 			return -EFAULT;
2905 		return 0;
2906 	}
2907 	case TCP_QUICKACK:
2908 		val = !icsk->icsk_ack.pingpong;
2909 		break;
2910 
2911 	case TCP_CONGESTION:
2912 		if (get_user(len, optlen))
2913 			return -EFAULT;
2914 		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2915 		if (put_user(len, optlen))
2916 			return -EFAULT;
2917 		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2918 			return -EFAULT;
2919 		return 0;
2920 
2921 	case TCP_THIN_LINEAR_TIMEOUTS:
2922 		val = tp->thin_lto;
2923 		break;
2924 	case TCP_THIN_DUPACK:
2925 		val = tp->thin_dupack;
2926 		break;
2927 
2928 	case TCP_REPAIR:
2929 		val = tp->repair;
2930 		break;
2931 
2932 	case TCP_REPAIR_QUEUE:
2933 		if (tp->repair)
2934 			val = tp->repair_queue;
2935 		else
2936 			return -EINVAL;
2937 		break;
2938 
2939 	case TCP_REPAIR_WINDOW: {
2940 		struct tcp_repair_window opt;
2941 
2942 		if (get_user(len, optlen))
2943 			return -EFAULT;
2944 
2945 		if (len != sizeof(opt))
2946 			return -EINVAL;
2947 
2948 		if (!tp->repair)
2949 			return -EPERM;
2950 
2951 		opt.snd_wl1	= tp->snd_wl1;
2952 		opt.snd_wnd	= tp->snd_wnd;
2953 		opt.max_window	= tp->max_window;
2954 		opt.rcv_wnd	= tp->rcv_wnd;
2955 		opt.rcv_wup	= tp->rcv_wup;
2956 
2957 		if (copy_to_user(optval, &opt, len))
2958 			return -EFAULT;
2959 		return 0;
2960 	}
2961 	case TCP_QUEUE_SEQ:
2962 		if (tp->repair_queue == TCP_SEND_QUEUE)
2963 			val = tp->write_seq;
2964 		else if (tp->repair_queue == TCP_RECV_QUEUE)
2965 			val = tp->rcv_nxt;
2966 		else
2967 			return -EINVAL;
2968 		break;
2969 
2970 	case TCP_USER_TIMEOUT:
2971 		val = jiffies_to_msecs(icsk->icsk_user_timeout);
2972 		break;
2973 
2974 	case TCP_FASTOPEN:
2975 		val = icsk->icsk_accept_queue.fastopenq.max_qlen;
2976 		break;
2977 
2978 	case TCP_TIMESTAMP:
2979 		val = tcp_time_stamp + tp->tsoffset;
2980 		break;
2981 	case TCP_NOTSENT_LOWAT:
2982 		val = tp->notsent_lowat;
2983 		break;
2984 	case TCP_SAVE_SYN:
2985 		val = tp->save_syn;
2986 		break;
2987 	case TCP_SAVED_SYN: {
2988 		if (get_user(len, optlen))
2989 			return -EFAULT;
2990 
2991 		lock_sock(sk);
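		/* tp->saved_syn[0] holds the length of the saved headers;
		 * the header bytes themselves start at saved_syn + 1.
		 */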
2992 		if (tp->saved_syn) {
2993 			if (len < tp->saved_syn[0]) {
2994 				if (put_user(tp->saved_syn[0], optlen)) {
2995 					release_sock(sk);
2996 					return -EFAULT;
2997 				}
2998 				release_sock(sk);
2999 				return -EINVAL;
3000 			}
3001 			len = tp->saved_syn[0];
3002 			if (put_user(len, optlen)) {
3003 				release_sock(sk);
3004 				return -EFAULT;
3005 			}
3006 			if (copy_to_user(optval, tp->saved_syn + 1, len)) {
3007 				release_sock(sk);
3008 				return -EFAULT;
3009 			}
3010 			tcp_saved_syn_free(tp);
3011 			release_sock(sk);
3012 		} else {
3013 			release_sock(sk);
3014 			len = 0;
3015 			if (put_user(len, optlen))
3016 				return -EFAULT;
3017 		}
3018 		return 0;
3019 	}
3020 	default:
3021 		return -ENOPROTOOPT;
3022 	}
3023 
3024 	if (put_user(len, optlen))
3025 		return -EFAULT;
3026 	if (copy_to_user(optval, &val, len))
3027 		return -EFAULT;
3028 	return 0;
3029 }
3030 
3031 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
3032 		   int __user *optlen)
3033 {
3034 	struct inet_connection_sock *icsk = inet_csk(sk);
3035 
3036 	if (level != SOL_TCP)
3037 		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
3038 						     optval, optlen);
3039 	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3040 }
3041 EXPORT_SYMBOL(tcp_getsockopt);
3042 
3043 #ifdef CONFIG_COMPAT
3044 int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
3045 			  char __user *optval, int __user *optlen)
3046 {
3047 	if (level != SOL_TCP)
3048 		return inet_csk_compat_getsockopt(sk, level, optname,
3049 						  optval, optlen);
3050 	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3051 }
3052 EXPORT_SYMBOL(compat_tcp_getsockopt);
3053 #endif
3054 
3055 #ifdef CONFIG_TCP_MD5SIG
3056 static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
3057 static DEFINE_MUTEX(tcp_md5sig_mutex);
3058 static bool tcp_md5sig_pool_populated = false;
3059 
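/* Populate the per-cpu MD5 pool: one shared "md5" ahash transform plus a
 * per-cpu scratch buffer and ahash request.  On any allocation failure we
 * return without setting tcp_md5sig_pool_populated, so a later call to
 * tcp_alloc_md5sig_pool() retries and reuses whatever was already set up.
 */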
3060 static void __tcp_alloc_md5sig_pool(void)
3061 {
3062 	struct crypto_ahash *hash;
3063 	int cpu;
3064 
3065 	hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
3066 	if (IS_ERR(hash))
3067 		return;
3068 
3069 	for_each_possible_cpu(cpu) {
3070 		void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
3071 		struct ahash_request *req;
3072 
3073 		if (!scratch) {
3074 			scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
3075 					       sizeof(struct tcphdr),
3076 					       GFP_KERNEL,
3077 					       cpu_to_node(cpu));
3078 			if (!scratch)
3079 				return;
3080 			per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
3081 		}
3082 		if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
3083 			continue;
3084 
3085 		req = ahash_request_alloc(hash, GFP_KERNEL);
3086 		if (!req)
3087 			return;
3088 
3089 		ahash_request_set_callback(req, 0, NULL, NULL);
3090 
3091 		per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
3092 	}
3093 	/* before setting tcp_md5sig_pool_populated, we must commit all writes
3094 	 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
3095 	 */
3096 	smp_wmb();
3097 	tcp_md5sig_pool_populated = true;
3098 }
3099 
3100 bool tcp_alloc_md5sig_pool(void)
3101 {
3102 	if (unlikely(!tcp_md5sig_pool_populated)) {
3103 		mutex_lock(&tcp_md5sig_mutex);
3104 
3105 		if (!tcp_md5sig_pool_populated)
3106 			__tcp_alloc_md5sig_pool();
3107 
3108 		mutex_unlock(&tcp_md5sig_mutex);
3109 	}
3110 	return tcp_md5sig_pool_populated;
3111 }
3112 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3113 
3114 
3115 /**
3116  *	tcp_get_md5sig_pool - get md5sig_pool for this user
3117  *
3118  *	We use a percpu structure, so if we succeed, we exit with preemption
3119  *	and BH disabled, to make sure another thread or a softirq handler
3120  *	won't try to get the same context.
3121  */
3122 struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3123 {
3124 	local_bh_disable();
3125 
3126 	if (tcp_md5sig_pool_populated) {
3127 		/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
3128 		smp_rmb();
3129 		return this_cpu_ptr(&tcp_md5sig_pool);
3130 	}
3131 	local_bh_enable();
3132 	return NULL;
3133 }
3134 EXPORT_SYMBOL(tcp_get_md5sig_pool);
3135 
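/* Feed the skb payload into the MD5 hash: first the linear head beyond
 * @header_len, then every page fragment, then any skbs on the frag list.
 */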
3136 int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3137 			  const struct sk_buff *skb, unsigned int header_len)
3138 {
3139 	struct scatterlist sg;
3140 	const struct tcphdr *tp = tcp_hdr(skb);
3141 	struct ahash_request *req = hp->md5_req;
3142 	unsigned int i;
3143 	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
3144 					   skb_headlen(skb) - header_len : 0;
3145 	const struct skb_shared_info *shi = skb_shinfo(skb);
3146 	struct sk_buff *frag_iter;
3147 
3148 	sg_init_table(&sg, 1);
3149 
3150 	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
3151 	ahash_request_set_crypt(req, &sg, NULL, head_data_len);
3152 	if (crypto_ahash_update(req))
3153 		return 1;
3154 
3155 	for (i = 0; i < shi->nr_frags; ++i) {
3156 		const struct skb_frag_struct *f = &shi->frags[i];
3157 		unsigned int offset = f->page_offset;
3158 		struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
3159 
3160 		sg_set_page(&sg, page, skb_frag_size(f),
3161 			    offset_in_page(offset));
3162 		ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
3163 		if (crypto_ahash_update(req))
3164 			return 1;
3165 	}
3166 
3167 	skb_walk_frags(skb, frag_iter)
3168 		if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3169 			return 1;
3170 
3171 	return 0;
3172 }
3173 EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3174 
3175 int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
3176 {
3177 	struct scatterlist sg;
3178 
3179 	sg_init_one(&sg, key->key, key->keylen);
3180 	ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen);
3181 	return crypto_ahash_update(hp->md5_req);
3182 }
3183 EXPORT_SYMBOL(tcp_md5_hash_key);
3184 
3185 #endif
3186 
3187 void tcp_done(struct sock *sk)
3188 {
3189 	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3190 
3191 	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3192 		TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3193 
3194 	tcp_set_state(sk, TCP_CLOSE);
3195 	tcp_clear_xmit_timers(sk);
3196 	if (req)
3197 		reqsk_fastopen_remove(sk, req, false);
3198 
3199 	sk->sk_shutdown = SHUTDOWN_MASK;
3200 
3201 	if (!sock_flag(sk, SOCK_DEAD))
3202 		sk->sk_state_change(sk);
3203 	else
3204 		inet_csk_destroy_sock(sk);
3205 }
3206 EXPORT_SYMBOL_GPL(tcp_done);
3207 
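/* Forcibly terminate a socket with the given error.  The inet_diag
 * SOCK_DESTROY operation (e.g. "ss -K") is the expected caller, so the
 * function also has to cope with request sockets that are not yet full.
 */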
3208 int tcp_abort(struct sock *sk, int err)
3209 {
3210 	if (!sk_fullsock(sk)) {
3211 		if (sk->sk_state == TCP_NEW_SYN_RECV) {
3212 			struct request_sock *req = inet_reqsk(sk);
3213 
3214 			local_bh_disable();
3215 			inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
3216 							  req);
3217 			local_bh_enable();
3218 			return 0;
3219 		}
3220 		return -EOPNOTSUPP;
3221 	}
3222 
3223 	/* Don't race with userspace socket closes such as tcp_close. */
3224 	lock_sock(sk);
3225 
3226 	if (sk->sk_state == TCP_LISTEN) {
3227 		tcp_set_state(sk, TCP_CLOSE);
3228 		inet_csk_listen_stop(sk);
3229 	}
3230 
3231 	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
3232 	local_bh_disable();
3233 	bh_lock_sock(sk);
3234 
3235 	if (!sock_flag(sk, SOCK_DEAD)) {
3236 		sk->sk_err = err;
3237 		/* This barrier is coupled with smp_rmb() in tcp_poll() */
3238 		smp_wmb();
3239 		sk->sk_error_report(sk);
3240 		if (tcp_need_reset(sk->sk_state))
3241 			tcp_send_active_reset(sk, GFP_ATOMIC);
3242 		tcp_done(sk);
3243 	}
3244 
3245 	bh_unlock_sock(sk);
3246 	local_bh_enable();
3247 	release_sock(sk);
3248 	return 0;
3249 }
3250 EXPORT_SYMBOL_GPL(tcp_abort);
3251 
3252 extern struct tcp_congestion_ops tcp_reno;
3253 
3254 static __initdata unsigned long thash_entries;
3255 static int __init set_thash_entries(char *str)
3256 {
3257 	ssize_t ret;
3258 
3259 	if (!str)
3260 		return 0;
3261 
3262 	ret = kstrtoul(str, 0, &thash_entries);
3263 	if (ret)
3264 		return 0;
3265 
3266 	return 1;
3267 }
3268 __setup("thash_entries=", set_thash_entries);
3269 
3270 static void __init tcp_init_mem(void)
3271 {
3272 	unsigned long limit = nr_free_buffer_pages() / 16;
3273 
3274 	limit = max(limit, 128UL);
3275 	sysctl_tcp_mem[0] = limit / 4 * 3;		/* 4.68 % */
3276 	sysctl_tcp_mem[1] = limit;			/* 6.25 % */
3277 	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;	/* 9.37 % */
3278 }
3279 
3280 void __init tcp_init(void)
3281 {
3282 	int max_rshare, max_wshare, cnt;
3283 	unsigned long limit;
3284 	unsigned int i;
3285 
3286 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
3287 		     FIELD_SIZEOF(struct sk_buff, cb));
3288 
3289 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
3290 	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
3291 	tcp_hashinfo.bind_bucket_cachep =
3292 		kmem_cache_create("tcp_bind_bucket",
3293 				  sizeof(struct inet_bind_bucket), 0,
3294 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3295 
3296 	/* Size and allocate the main established and bind bucket
3297 	 * hash tables.
3298 	 *
3299 	 * The methodology is similar to that of the buffer cache.
3300 	 */
3301 	tcp_hashinfo.ehash =
3302 		alloc_large_system_hash("TCP established",
3303 					sizeof(struct inet_ehash_bucket),
3304 					thash_entries,
3305 					17, /* one slot per 128 KB of memory */
3306 					0,
3307 					NULL,
3308 					&tcp_hashinfo.ehash_mask,
3309 					0,
3310 					thash_entries ? 0 : 512 * 1024);
3311 	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
3312 		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
3313 
3314 	if (inet_ehash_locks_alloc(&tcp_hashinfo))
3315 		panic("TCP: failed to alloc ehash_locks");
3316 	tcp_hashinfo.bhash =
3317 		alloc_large_system_hash("TCP bind",
3318 					sizeof(struct inet_bind_hashbucket),
3319 					tcp_hashinfo.ehash_mask + 1,
3320 					17, /* one slot per 128 KB of memory */
3321 					0,
3322 					&tcp_hashinfo.bhash_size,
3323 					NULL,
3324 					0,
3325 					64 * 1024);
3326 	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
3327 	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
3328 		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
3329 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
3330 	}
3331 
3332 
3333 	cnt = tcp_hashinfo.ehash_mask + 1;
3334 
3335 	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3336 	sysctl_tcp_max_orphans = cnt / 2;
3337 	sysctl_max_syn_backlog = max(128, cnt / 256);
3338 
3339 	tcp_init_mem();
3340 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
3341 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
3342 	max_wshare = min(4UL*1024*1024, limit);
3343 	max_rshare = min(6UL*1024*1024, limit);
3344 
3345 	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3346 	sysctl_tcp_wmem[1] = 16*1024;
3347 	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
3348 
3349 	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3350 	sysctl_tcp_rmem[1] = 87380;
3351 	sysctl_tcp_rmem[2] = max(87380, max_rshare);
3352 
3353 	pr_info("Hash tables configured (established %u bind %u)\n",
3354 		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3355 
3356 	tcp_metrics_init();
3357 	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
3358 	tcp_tasklet_init();
3359 }
3360