/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#ifdef CONFIG_SYSCTL
#define SYNC_INIT 0 /* let the user enable it */
#else
#define SYNC_INIT 1
#endif

int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_abort_on_overflow;

struct inet_timewait_death_row tcp_death_row = {
	.sysctl_max_tw_buckets = NR_FILE * 2,
	.period		= TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
	.death_lock	= __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
	.hashinfo	= &tcp_hashinfo,
	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
					    (unsigned long)&tcp_death_row),
	.twkill_work	= __WORK_INITIALIZER(tcp_death_row.twkill_work,
					     inet_twdr_twkill_work,
					     &tcp_death_row),
	/* Short-time timewait calendar */
	.twcal_hand	= -1,
	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
					    (unsigned long)&tcp_death_row),
};

EXPORT_SYMBOL_GPL(tcp_death_row);

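/*
 * RFC 793-style acceptability check used below for segments hitting
 * TIME-WAIT and SYN-RECV minisockets: accept the segment if it starts
 * exactly at the left window edge, if any part of it overlaps the
 * receive window [s_win, e_win), or if it is a zero-length segment
 * sitting exactly at the right edge.
 */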
static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
	if (seq == s_win)
		return 1;
	if (after(end_seq, s_win) && before(seq, e_win))
		return 1;
	return (seq == e_win && seq == end_seq);
}

/*
 * * The main purpose of TIME-WAIT state is to close the connection gracefully
 *   when one of the ends sits in LAST-ACK or CLOSING retransmitting FIN
 *   (and, probably, a tail of data) and one or more of our ACKs are lost.
 * * What is the TIME-WAIT timeout? It is associated with the maximal packet
 *   lifetime in the internet, which leads to the wrong conclusion that
 *   it is set to catch "old duplicate segments" wandering out of their path.
 *   That is not quite correct. This timeout is calculated so that it exceeds
 *   the maximal retransmission timeout by enough to allow the loss of one
 *   (or more) segments sent by the peer and of our ACKs. This time may be
 *   calculated from the RTO.
 * * When a TIME-WAIT socket receives an RST, it means that the other end
 *   has finally closed and we are allowed to kill TIME-WAIT too.
 * * The second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with these semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
 * When you compare it to the RFCs, please read the section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
 * is _not_ stateless. This means that, strictly speaking, we must
 * spinlock it. I do not want to! Well, the probability of misbehaviour
 * is ridiculously low and, it seems, we could use some mb() tricks
 * to avoid misreading sequence numbers, states etc.  --ANK
 */
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
			   const struct tcphdr *th)
{
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
	struct tcp_options_received tmp_opt;
	int paws_reject = 0;

	tmp_opt.saw_tstamp = 0;
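	/* A data offset larger than sizeof(struct tcphdr) (5 words) means TCP
	 * options are present; only bother parsing them when we also have a
	 * stored timestamp to compare against for PAWS.
	 */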
	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
		tcp_parse_options(skb, &tmp_opt, 0);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
			paws_reject = tcp_paws_check(&tmp_opt, th->rst);
		}
	}

	if (tw->tw_substate == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		/* Out of window, send ACK */
		if (paws_reject ||
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   tcptw->tw_rcv_nxt,
				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
			return TCP_TW_ACK;

		if (th->rst)
			goto kill;

		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
			goto kill_with_rst;

		/* Dup ACK? */
		if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			inet_twsk_put(tw);
			return TCP_TW_SUCCESS;
		}

		/* New data or FIN. If new data arrives after a half-duplex
		 * close, reset.
		 */
		if (!th->fin ||
		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
			inet_twsk_deschedule(tw, &tcp_death_row);
			inet_twsk_put(tw);
			return TCP_TW_RST;
		}

		/* FIN arrived, enter true time-wait state. */
		tw->tw_substate	  = TCP_TIME_WAIT;
		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent_stamp = xtime.tv_sec;
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
		}

		/* I am ashamed, but I failed to make this more elegant.
		 * Yes, it is a direct reference to IP, which is impossible
		 * to generalize to IPv6. Given that IPv6 does not understand
		 * recycling in any case, this is not a big problem in
		 * practice. --ANK */
		if (tw->tw_family == AF_INET &&
		    tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
		    tcp_v4_tw_remember_stamp(tw))
			inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
					   TCP_TIMEWAIT_LEN);
		else
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
					   TCP_TIMEWAIT_LEN);
		return TCP_TW_ACK;
	}

	/*
	 *	Now real TIME-WAIT state.
	 *
	 *	RFC 1122:
	 *	"When a connection is [...] on TIME-WAIT state [...]
	 *	[a TCP] MAY accept a new SYN from the remote TCP to
	 *	reopen the connection directly, if it:
	 *
	 *	(1)  assigns its initial sequence number for the new
	 *	connection to be larger than the largest sequence
	 *	number it used on the previous connection incarnation,
	 *	and
	 *
	 *	(2)  returns to TIME-WAIT state if the SYN turns out
	 *	to be an old duplicate".
	 */

	if (!paws_reject &&
	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
		/* An in-window segment; it may only be a reset or a bare ACK. */

		if (th->rst) {
			/* This is TIME_WAIT assassination, in two flavors.
			 * Oh well... nobody has a sufficient solution to this
			 * protocol bug yet.
			 */
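			/* With the default sysctl_tcp_rfc1337 == 0 we let the
			 * RST kill the bucket; a non-zero value ignores RSTs
			 * in TIME-WAIT, as RFC 1337 recommends.
			 */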
			if (sysctl_tcp_rfc1337 == 0) {
kill:
				inet_twsk_deschedule(tw, &tcp_death_row);
				inet_twsk_put(tw);
				return TCP_TW_SUCCESS;
			}
		}
		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
				   TCP_TIMEWAIT_LEN);

		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
			tcptw->tw_ts_recent_stamp = xtime.tv_sec;
		}

		inet_twsk_put(tw);
		return TCP_TW_SUCCESS;
	}

	/* Out of window segment.

	   All such segments are ACKed immediately.

	   The only exception is a new SYN. We accept it if it is not an
	   old duplicate and we are not in danger of being killed by
	   delayed old duplicates. The RFC check (that it carries a newer
	   sequence number) works only at rates < 40 Mbit/sec. However,
	   if PAWS works, it is reliable, and moreover we may even relax
	   the silly seq-space cutoff.

	   RED-PEN: we violate the main RFC requirement: if this SYN turns
	   out to be an old duplicate (i.e. we receive an RST in reply to
	   the SYN-ACK), we must return the socket to time-wait state.
	   It is not good, but not fatal yet.
	 */

	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
	     (tmp_opt.saw_tstamp &&
	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
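		/* Choose an ISN a good distance beyond tw_snd_nxt so it is
		 * larger than the sequence numbers used by the previous
		 * incarnation, per the RFC 1122 rule quoted above. A value
		 * of zero means "no ISN override" further down the receive
		 * path, so step over it.
		 */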
		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
		if (isn == 0)
			isn++;
		TCP_SKB_CB(skb)->when = isn;
		return TCP_TW_SYN;
	}

	if (paws_reject)
		NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);

	if (!th->rst) {
		/* In this case we must reset the TIMEWAIT timer.
		 *
		 * If it is an ACKless SYN, it may be either an old duplicate
		 * or a new good SYN with a random sequence number < rcv_nxt.
		 * Do not reschedule in the latter case.
		 */
		if (paws_reject || th->ack)
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
					   TCP_TIMEWAIT_LEN);

		/* Send ACK. Note, we do not put the bucket,
		 * it will be released by the caller.
		 */
		return TCP_TW_ACK;
	}
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}

/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
	struct inet_timewait_sock *tw = NULL;
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_sock *tp = tcp_sk(sk);
	int recycle_ok = 0;

	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
		recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);

	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
		tw = inet_twsk_alloc(sk, state);

	if (tw != NULL) {
		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
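		/* (RTO << 2) - (RTO >> 1) is 3.5 * RTO: long enough to cover
		 * the peer's retransmissions when the bucket can be recycled
		 * early (see the scheduling below).
		 */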
		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);

		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
		tcptw->tw_snd_nxt	= tp->snd_nxt;
		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
		if (tw->tw_family == PF_INET6) {
			struct ipv6_pinfo *np = inet6_sk(sk);
			struct inet6_timewait_sock *tw6;

			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
			tw6 = inet6_twsk((struct sock *)tw);
			ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
			ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
			tw->tw_ipv6only = np->ipv6only;
		}
#endif
		/* Linkage updates. */
		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);

		/* Get the TIME_WAIT timeout firing. */
		if (timeo < rto)
			timeo = rto;

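		/* With tw_recycle the peer's timestamps let us catch old
		 * duplicates, so ~3.5 RTO is enough; otherwise hold the
		 * bucket for the full TCP_TIMEWAIT_LEN (60 seconds).
		 */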
		if (recycle_ok) {
			tw->tw_timeout = rto;
		} else {
			tw->tw_timeout = TCP_TIMEWAIT_LEN;
			if (state == TCP_TIME_WAIT)
				timeo = TCP_TIMEWAIT_LEN;
		}

		inet_twsk_schedule(tw, &tcp_death_row, timeo,
				   TCP_TIMEWAIT_LEN);
		inet_twsk_put(tw);
	} else {
		/* Sorry, if we're out of memory, just CLOSE this
		 * socket up.  We've got bigger problems than
		 * non-graceful socket closings.
		 */
		if (net_ratelimit())
			printk(KERN_INFO "TCP: time wait bucket table overflow\n");
	}

	tcp_update_metrics(sk);
	tcp_done(sk);
}

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could avoid a lot of memory writes here: the tp of the
 * listening socket already contains all the necessary default parameters.
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);

	if (newsk != NULL) {
		const struct inet_request_sock *ireq = inet_rsk(req);
		struct tcp_request_sock *treq = tcp_rsk(req);
		struct inet_connection_sock *newicsk = inet_csk(newsk);
		struct tcp_sock *newtp;

		/* Now set up the tcp_sock. */
		newtp = tcp_sk(newsk);
		newtp->pred_flags = 0;
		newtp->rcv_nxt = treq->rcv_isn + 1;
		newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1;

		tcp_prequeue_init(newtp);

		tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);

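		/* No RTT sample yet: a zero srtt makes the first measurement
		 * seed the estimator, and the RTO starts from the conservative
		 * TCP_TIMEOUT_INIT.
		 */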
		newtp->srtt = 0;
		newtp->mdev = TCP_TIMEOUT_INIT;
		newicsk->icsk_rto = TCP_TIMEOUT_INIT;

		newtp->packets_out = 0;
		newtp->left_out = 0;
		newtp->retrans_out = 0;
		newtp->sacked_out = 0;
		newtp->fackets_out = 0;
		newtp->snd_ssthresh = 0x7fffffff;

		/* So many TCP implementations out there (incorrectly) count the
		 * initial SYN frame in their delayed-ACK and congestion control
		 * algorithms that we must have the following bandaid to talk
		 * efficiently to them.  -DaveM
		 */
		newtp->snd_cwnd = 2;
		newtp->snd_cwnd_cnt = 0;
		newtp->bytes_acked = 0;

		newtp->frto_counter = 0;
		newtp->frto_highmark = 0;

		newicsk->icsk_ca_ops = &tcp_init_congestion_ops;

		tcp_set_ca_state(newsk, TCP_CA_Open);
		tcp_init_xmit_timers(newsk);
		skb_queue_head_init(&newtp->out_of_order_queue);
		newtp->rcv_wup = treq->rcv_isn + 1;
		newtp->write_seq = treq->snt_isn + 1;
		newtp->pushed_seq = newtp->write_seq;
		newtp->copied_seq = treq->rcv_isn + 1;

		newtp->rx_opt.saw_tstamp = 0;

		newtp->rx_opt.dsack = 0;
		newtp->rx_opt.eff_sacks = 0;

		newtp->rx_opt.num_sacks = 0;
		newtp->urg_data = 0;

		if (sock_flag(newsk, SOCK_KEEPOPEN))
			inet_csk_reset_keepalive_timer(newsk,
						       keepalive_time_when(newtp));

		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
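		/* Bit 0 of sack_ok records that SACK was negotiated; bit 1
		 * additionally enables FACK when the sysctl allows it.
		 */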
		if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
			if (sysctl_tcp_fack)
				newtp->rx_opt.sack_ok |= 2;
		}
		newtp->window_clamp = req->window_clamp;
		newtp->rcv_ssthresh = req->rcv_wnd;
		newtp->rcv_wnd = req->rcv_wnd;
		newtp->rx_opt.wscale_ok = ireq->wscale_ok;
		if (newtp->rx_opt.wscale_ok) {
			newtp->rx_opt.snd_wscale = ireq->snd_wscale;
			newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
		} else {
			newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
			newtp->window_clamp = min(newtp->window_clamp, 65535U);
		}
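		/* The skb here is the ACK completing the handshake, not the
		 * SYN, so its window field is already subject to the scale
		 * the peer negotiated (snd_wscale is 0 when scaling is off).
		 */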
		newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
		newtp->max_window = newtp->snd_wnd;

		if (newtp->rx_opt.tstamp_ok) {
			newtp->rx_opt.ts_recent = req->ts_recent;
			newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
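			/* Every segment will carry an aligned 12-byte timestamp
			 * option, so account for it in the fixed header length.
			 */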
			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		} else {
			newtp->rx_opt.ts_recent_stamp = 0;
			newtp->tcp_header_len = sizeof(struct tcphdr);
		}
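		/* If the handshake-completing segment carries data, seed the
		 * receiver-side MSS estimate used by the delayed-ACK machinery.
		 */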
		if (skb->len >= TCP_MIN_RCVMSS + newtp->tcp_header_len)
			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
		newtp->rx_opt.mss_clamp = req->mss;
		TCP_ECN_openreq_child(newtp, req);

		TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
	}
	return newsk;
}

/*
 *	Process an incoming packet for SYN_RECV sockets represented
 *	as a request_sock.
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req,
			   struct request_sock **prev)
{
	struct tcphdr *th = skb->h.th;
	u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	int paws_reject = 0;
	struct tcp_options_received tmp_opt;
	struct sock *child;

	tmp_opt.saw_tstamp = 0;
	if (th->doff > (sizeof(struct tcphdr) >> 2)) {
		tcp_parse_options(skb, &tmp_opt, 0);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent = req->ts_recent;
			/* We do not store the true timestamp, but it is not
			 * required; it can be estimated (approximately) from
			 * other data.
			 */
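			/* Each SYN-ACK retransmission roughly doubles the wait,
			 * so TCP_TIMEOUT_INIT shifted by req->retrans
			 * approximates how long ago ts_recent was recorded.
			 */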
			tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
			paws_reject = tcp_paws_check(&tmp_opt, th->rst);
		}
	}

	/* Check for a pure retransmitted SYN. */
	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
	    flg == TCP_FLAG_SYN &&
	    !paws_reject) {
		/*
		 * RFC 793 draws this case (incorrectly! it was fixed in
		 * RFC 1122) in figures 6 and 8, but the formal protocol
		 * description says NOTHING. To be more exact, it says that
		 * we should send an ACK, because this segment (at least, if
		 * it has no data) is out of window.
		 *
		 *  CONCLUSION: RFC 793 (even with RFC 1122) DOES NOT describe
		 *  the SYN-RECV state. All of its description is wrong; we
		 *  cannot believe it and should rely only on common sense
		 *  and implementation experience.
		 *
		 * Enforce the "SYN-ACK" according to figures 8 and 6 of
		 * RFC 793, as fixed by RFC 1122.
		 */
		req->rsk_ops->rtx_syn_ack(sk, req, NULL);
		return NULL;
	}

	/* The following reproduces the section "SEGMENT ARRIVES"
	   for the SYN-RECEIVED state of RFC 793.
	   It is broken, but it fails only when SYNs are crossed.

	   You would think that SYN crossing is impossible here, since
	   we should have a SYN_SENT socket (from connect()) on our end,
	   but this is not true if the crossed SYNs were sent to both
	   ends by a malicious third party.  We must defend against this,
	   and to do that we first verify the ACK (as per RFC 793, page
	   36) and reset if it is invalid.  Is this a true full defense?
	   To convince ourselves, let us consider a way in which the ACK
	   test can still pass in this 'malicious crossed SYNs' case.
	   The malicious sender sends identical SYNs (and thus identical
	   sequence numbers) to both A and B:

		A: gets SYN, seq=7
		B: gets SYN, seq=7

	   By our good fortune, both A and B select the same initial
	   send sequence number of seven :-)

		A: sends SYN|ACK, seq=7, ack_seq=8
		B: sends SYN|ACK, seq=7, ack_seq=8

	   So we are now A eating this SYN|ACK, and the ACK test passes.
	   So does the sequence test; the SYN is truncated, and thus we
	   consider it a bare ACK.

	   If icsk->icsk_accept_queue.rskq_defer_accept is set, we silently
	   drop this bare ACK.  Otherwise, we create an established
	   connection.  Both ends (listening sockets) accept the new incoming
	   connection and try to talk to each other. 8-)

	   Note: This case is both harmless and rare.  The possibility is
	   about the same as us discovering intelligent life on another
	   planet tomorrow.

	   But in general, we should (the RFC lies!) accept the ACK of the
	   SYN-ACK both here and in tcp_rcv_state_process().
	   tcp_rcv_state_process() does not, hence we do not either.

	   Note that the case is absolutely generic:
	   we cannot optimize anything here without
	   violating the protocol. All the checks must be made
	   before the attempt to create a socket.
	 */

	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
	 *                  and the incoming segment acknowledges something not yet
	 *                  sent (the segment carries an unacceptable ACK) ...
	 *                  a reset is sent."
	 *
	 * Invalid ACK: a reset will be sent by the listening socket.
	 */
	if ((flg & TCP_FLAG_ACK) &&
	    (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
		return sk;

	/* Also, it would not be a bad idea to check rcv_tsecr, which is
	 * essentially an ACK extension; too-early or too-late values
	 * should cause a reset in unsynchronized states.
	 */

	/* RFC793: "first check sequence number". */

	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
					  tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
		/* Out of window: send ACK and drop. */
		if (!(flg & TCP_FLAG_RST))
			req->rsk_ops->send_ack(skb, req);
		if (paws_reject)
			NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
		return NULL;
	}

	/* In sequence, PAWS is OK. */

	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
		req->ts_recent = tmp_opt.rcv_tsval;

	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
		/* Truncate SYN, it is out of window starting
		   at tcp_rsk(req)->rcv_isn + 1. */
		flg &= ~TCP_FLAG_SYN;
	}

	/* RFC793: "second check the RST bit" and
	 *	   "fourth, check the SYN bit"
	 */
	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
		TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
		goto embryonic_reset;
	}

	/* ACK sequence verified above, just make sure ACK is
	 * set.  If ACK not set, just silently drop the packet.
	 */
	if (!(flg & TCP_FLAG_ACK))
		return NULL;

	/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
	if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

	/* OK, the ACK is valid, create the big socket and
	 * feed this segment to it. It will repeat all
	 * the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
	 * ESTABLISHED STATE. If it is dropped after the
	 * socket is created, expect trouble.
	 */
	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
							 req, NULL);
	if (child == NULL)
		goto listen_overflow;

	inet_csk_reqsk_queue_unlink(sk, req, prev);
	inet_csk_reqsk_queue_removed(sk, req);

	inet_csk_reqsk_queue_add(sk, req, child);
	return child;

listen_overflow:
	if (!sysctl_tcp_abort_on_overflow) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

embryonic_reset:
	NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
	if (!(flg & TCP_FLAG_RST))
		req->rsk_ops->send_reset(skb);

	inet_csk_reqsk_queue_drop(sk, req, prev);
	return NULL;
}

/*
 * Process the segment on the new socket right away if it is not locked
 * by the user; otherwise queue it on the new socket's backlog and let
 * the owner handle it later.
 */

int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb)
{
	int ret = 0;
	int state = child->sk_state;

	if (!sock_owned_by_user(child)) {
		ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);

		/* Wakeup parent, send SIGIO */
		if (state == TCP_SYN_RECV && child->sk_state != state)
			parent->sk_data_ready(parent, 0);
	} else {
		/* Alas, it is possible again, because we do the lookup
		 * in the main socket hash table and the lock on the
		 * listening socket no longer protects us.
		 */
		sk_add_backlog(child, skb);
	}

	bh_unlock_sock(child);
	sock_put(child);
	return ret;
}

EXPORT_SYMBOL(tcp_check_req);
EXPORT_SYMBOL(tcp_child_process);
EXPORT_SYMBOL(tcp_create_openreq_child);
EXPORT_SYMBOL(tcp_timewait_state_process);