xref: /linux/net/ipv4/tcp_minisocks.c (revision c537b994505099b7197e7d3125b942ecbcc51eb6)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *		Florian La Roche, <flla@stud.uni-sb.de>
15  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *		Matthew Dillon, <dillon@apollo.west.oic.com>
19  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *		Jorge Cwik, <jorge@laser.satlink.net>
21  */
22 
23 #include <linux/mm.h>
24 #include <linux/module.h>
25 #include <linux/sysctl.h>
26 #include <linux/workqueue.h>
27 #include <net/tcp.h>
28 #include <net/inet_common.h>
29 #include <net/xfrm.h>
30 
31 #ifdef CONFIG_SYSCTL
32 #define SYNC_INIT 0 /* let the user enable it */
33 #else
34 #define SYNC_INIT 1
35 #endif
36 
37 int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;
38 int sysctl_tcp_abort_on_overflow __read_mostly;
39 
40 struct inet_timewait_death_row tcp_death_row = {
41 	.sysctl_max_tw_buckets = NR_FILE * 2,
42 	.period		= TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
43 	.death_lock	= __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
44 	.hashinfo	= &tcp_hashinfo,
45 	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
46 					    (unsigned long)&tcp_death_row),
47 	.twkill_work	= __WORK_INITIALIZER(tcp_death_row.twkill_work,
48 					     inet_twdr_twkill_work),
49 /* Short-time timewait calendar */
50 
51 	.twcal_hand	= -1,
52 	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
53 					    (unsigned long)&tcp_death_row),
54 };
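
/*
 * Editor's note (not part of the original file): tcp_death_row drives two
 * timers. tw_timer (inet_twdr_hangman) fires every
 * TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS and reaps one slot of sockets
 * waiting out the full TIME-WAIT period, while twcal_timer
 * (inet_twdr_twcal_tick) is the finer-grained "calendar" used for the
 * shortened timeouts handed out when tw_recycle is enabled.
 */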
55 
56 EXPORT_SYMBOL_GPL(tcp_death_row);
57 
58 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
59 {
60 	if (seq == s_win)
61 		return 1;
62 	if (after(end_seq, s_win) && before(seq, e_win))
63 		return 1;
64 	return (seq == e_win && seq == end_seq);
65 }
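
/*
 * Editor's illustration (not part of the original file): with rcv_nxt = 100
 * and rcv_wnd = 10, i.e. s_win = 100 and e_win = 110:
 *
 *	tcp_in_window(100, 100, 100, 110) == 1	(bare ACK at the window edge)
 *	tcp_in_window( 90,  95, 100, 110) == 0	(entirely old data)
 *	tcp_in_window( 95, 105, 100, 110) == 1	(segment overlaps the window)
 */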
66 
67 /*
68  * * The main purpose of TIME-WAIT is to close the connection gracefully
69  *   when one end sits in LAST-ACK or CLOSING, retransmitting its FIN (and,
70  *   probably, a tail of data), while one or more of our ACKs are lost.
71  * * What is the TIME-WAIT timeout? It is associated with the maximal
72  *   packet lifetime in the internet, which leads to the wrong conclusion
73  *   that it exists to catch "old duplicate segments" wandering off their
74  *   path. Not quite: the timeout is chosen to exceed the maximal
75  *   retransmission timeout by enough to tolerate losing one (or more)
76  *   segments sent by the peer plus our ACKs; it may be derived from the RTO.
77  * * When a TIME-WAIT socket receives an RST, it means that the other end
78  *   has finally closed and we are allowed to kill TIME-WAIT too.
79  * * The second purpose of TIME-WAIT is catching old duplicate segments.
80  *   Certainly this is pure paranoia, but if we burden TIME-WAIT with these
81  *   semantics, we MUST NOT kill the TIME-WAIT state on RSTs.
82  * * If we invented a cleverer way to catch duplicates (e.g. based on
83  *   PAWS), we could truncate TIME-WAIT to several RTOs.
84  *
85  * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
86  * When you compare it to the RFCs, please read the section SEGMENT
87  * ARRIVES from the very beginning.
88  *
89  * NOTE. With recycling (and later with fin-wait-2) the TW bucket is _not_
90  * stateless, so strictly speaking we ought to spinlock it. I do not want
91  * to! The probability of misbehaviour is ridiculously low and, it seems,
92  * we could use some mb() tricks to avoid misreading sequence numbers,
93  * states etc.  --ANK
94  */
95 enum tcp_tw_status
96 tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
97 			   const struct tcphdr *th)
98 {
99 	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
100 	struct tcp_options_received tmp_opt;
101 	int paws_reject = 0;
102 
103 	tmp_opt.saw_tstamp = 0;
104 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
105 		tcp_parse_options(skb, &tmp_opt, 0);
106 
107 		if (tmp_opt.saw_tstamp) {
108 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
109 			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
110 			paws_reject = tcp_paws_check(&tmp_opt, th->rst);
111 		}
112 	}
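
	/* Editor's note: paws_reject means this segment's timestamp is older
	 * than the last one seen from this peer and that value was refreshed
	 * recently enough (roughly within the 24-day PAWS window) to still be
	 * trusted; such segments are treated as old duplicates below.
	 */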
113 
114 	if (tw->tw_substate == TCP_FIN_WAIT2) {
115 		/* Just repeat all the checks of tcp_rcv_state_process() */
116 
117 		/* Out of window, send ACK */
118 		if (paws_reject ||
119 		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
120 				   tcptw->tw_rcv_nxt,
121 				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
122 			return TCP_TW_ACK;
123 
124 		if (th->rst)
125 			goto kill;
126 
127 		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
128 			goto kill_with_rst;
129 
130 		/* Dup ACK? */
131 		if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
132 		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
133 			inet_twsk_put(tw);
134 			return TCP_TW_SUCCESS;
135 		}
136 
137 		/* New data or FIN. If new data arrive after half-duplex close,
138 		 * reset.
139 		 */
140 		if (!th->fin ||
141 		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
142 kill_with_rst:
143 			inet_twsk_deschedule(tw, &tcp_death_row);
144 			inet_twsk_put(tw);
145 			return TCP_TW_RST;
146 		}
147 
148 		/* FIN arrived, enter true time-wait state. */
149 		tw->tw_substate	  = TCP_TIME_WAIT;
150 		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
151 		if (tmp_opt.saw_tstamp) {
152 			tcptw->tw_ts_recent_stamp = xtime.tv_sec;
153 			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
154 		}
155 
156 		/* I am ashamed, but I failed to make this more elegant.
157 		 * Yes, it is a direct reference to IP, which is impossible
158 		 * to generalize to IPv6. Given that IPv6 does not
159 		 * understand recycling in any case, it is not a big
160 		 * problem in practice. --ANK */
161 		if (tw->tw_family == AF_INET &&
162 		    tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
163 		    tcp_v4_tw_remember_stamp(tw))
164 			inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
165 					   TCP_TIMEWAIT_LEN);
166 		else
167 			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
168 					   TCP_TIMEWAIT_LEN);
169 		return TCP_TW_ACK;
170 	}
171 
172 	/*
173 	 *	Now real TIME-WAIT state.
174 	 *
175 	 *	RFC 1122:
176 	 *	"When a connection is [...] on TIME-WAIT state [...]
177 	 *	[a TCP] MAY accept a new SYN from the remote TCP to
178 	 *	reopen the connection directly, if it:
179 	 *
180 	 *	(1)  assigns its initial sequence number for the new
181 	 *	connection to be larger than the largest sequence
182 	 *	number it used on the previous connection incarnation,
183 	 *	and
184 	 *
185 	 *	(2)  returns to TIME-WAIT state if the SYN turns out
186 	 *	to be an old duplicate".
187 	 */
188 
189 	if (!paws_reject &&
190 	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
191 	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
192 		/* In-window segment; it may only be a reset or a bare ack. */
193 
194 		if (th->rst) {
195 			/* This is TIME_WAIT assassination, in two flavors.
196 			 * Oh well... nobody has a sufficient solution to this
197 			 * protocol bug yet.
198 			 */
199 			if (sysctl_tcp_rfc1337 == 0) {
200 kill:
201 				inet_twsk_deschedule(tw, &tcp_death_row);
202 				inet_twsk_put(tw);
203 				return TCP_TW_SUCCESS;
204 			}
205 		}
206 		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
207 				   TCP_TIMEWAIT_LEN);
208 
209 		if (tmp_opt.saw_tstamp) {
210 			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
211 			tcptw->tw_ts_recent_stamp = xtime.tv_sec;
212 		}
213 
214 		inet_twsk_put(tw);
215 		return TCP_TW_SUCCESS;
216 	}
217 
218 	/* Out-of-window segment.
219 
220 	   All such segments are ACKed immediately.
221 
222 	   The only exception is a new SYN. We accept it if it is not an
223 	   old duplicate and we are not in danger of being killed by
224 	   delayed old duplicates. The RFC check -- that it carries a
225 	   newer sequence number -- only works at rates below 40 Mbit/sec.
226 	   However, if PAWS works, it is reliable, and moreover we may
227 	   even relax the silly sequence-space cutoff.
228 
229 	   RED-PEN: we violate the main RFC requirement: if this SYN turns
230 	   out to be an old duplicate (i.e. we receive an RST in reply to
231 	   our SYN-ACK), we must return the socket to TIME-WAIT state.
232 	   That is not good, but not fatal yet.
233 	 */
234 
235 	if (th->syn && !th->rst && !th->ack && !paws_reject &&
236 	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
237 	     (tmp_opt.saw_tstamp &&
238 	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
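		/* Editor's note: the ISN offered to the new incarnation is
		 * pushed above anything the old one could have sent
		 * (snd_nxt plus the largest unscaled window), which is the
		 * RFC 1122 reopen rule quoted earlier; 0 is skipped,
		 * apparently because the caller treats a zero ->when as
		 * "no ISN override".
		 */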
239 		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
240 		if (isn == 0)
241 			isn++;
242 		TCP_SKB_CB(skb)->when = isn;
243 		return TCP_TW_SYN;
244 	}
245 
246 	if (paws_reject)
247 		NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
248 
249 	if (!th->rst) {
250 		/* In this case we must reset the TIMEWAIT timer.
251 		 *
252 		 * If it is ACKless SYN it may be both old duplicate
253 		 * and new good SYN with random sequence number <rcv_nxt.
254 		 * Do not reschedule in the last case.
255 		 */
256 		if (paws_reject || th->ack)
257 			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
258 					   TCP_TIMEWAIT_LEN);
259 
260 		/* Send ACK. Note, we do not put the bucket,
261 		 * it will be released by caller.
262 		 */
263 		return TCP_TW_ACK;
264 	}
265 	inet_twsk_put(tw);
266 	return TCP_TW_SUCCESS;
267 }
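
/*
 * Editor's sketch (assumed caller behaviour, cf. the do_time_wait path in
 * tcp_v4_rcv(), not part of the original file): the return value tells the
 * caller what to emit before it drops its reference to the bucket, roughly
 *
 *	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
 *	case TCP_TW_SYN:	... hand the SYN to a matching listener ...
 *	case TCP_TW_ACK:	tcp_v4_timewait_ack(sk, skb); break;
 *	case TCP_TW_RST:	... send a reset ...; break;
 *	case TCP_TW_SUCCESS:	break;
 *	}
 *
 * Only TCP_TW_SYN lets the segment live on.
 */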
268 
269 /*
270  * Move a socket to time-wait or dead fin-wait-2 state.
271  */
272 void tcp_time_wait(struct sock *sk, int state, int timeo)
273 {
274 	struct inet_timewait_sock *tw = NULL;
275 	const struct inet_connection_sock *icsk = inet_csk(sk);
276 	const struct tcp_sock *tp = tcp_sk(sk);
277 	int recycle_ok = 0;
278 
279 	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
280 		recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
281 
282 	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
283 		tw = inet_twsk_alloc(sk, state);
284 
285 	if (tw != NULL) {
286 		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
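		/* Editor's note: (rto << 2) - (rto >> 1) below is 3.5 * RTO,
		 * the shortened TIME-WAIT period used when timestamps let us
		 * recycle the bucket early (see the discussion above
		 * tcp_timewait_state_process()).
		 */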
287 		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
288 
289 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
290 		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
291 		tcptw->tw_snd_nxt	= tp->snd_nxt;
292 		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
293 		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
294 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
295 
296 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
297 		if (tw->tw_family == PF_INET6) {
298 			struct ipv6_pinfo *np = inet6_sk(sk);
299 			struct inet6_timewait_sock *tw6;
300 
301 			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
302 			tw6 = inet6_twsk((struct sock *)tw);
303 			ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
304 			ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
305 			tw->tw_ipv6only = np->ipv6only;
306 		}
307 #endif
308 
309 #ifdef CONFIG_TCP_MD5SIG
310 		/*
311 		 * The timewait bucket does not have the key DB from the
312 		 * sock structure. We just make a quick copy of the
313 		 * md5 key being used (if indeed we are using one)
314 		 * so the timewait ack generating code has the key.
315 		 */
316 		do {
317 			struct tcp_md5sig_key *key;
318 			memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key));
319 			tcptw->tw_md5_keylen = 0;
320 			key = tp->af_specific->md5_lookup(sk, sk);
321 			if (key != NULL) {
322 				memcpy(&tcptw->tw_md5_key, key->key, key->keylen);
323 				tcptw->tw_md5_keylen = key->keylen;
324 				if (tcp_alloc_md5sig_pool() == NULL)
325 					BUG();
326 			}
327 		} while (0);
328 #endif
329 
330 		/* Linkage updates. */
331 		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
332 
333 		/* Get the TIME_WAIT timeout firing. */
334 		if (timeo < rto)
335 			timeo = rto;
336 
337 		if (recycle_ok) {
338 			tw->tw_timeout = rto;
339 		} else {
340 			tw->tw_timeout = TCP_TIMEWAIT_LEN;
341 			if (state == TCP_TIME_WAIT)
342 				timeo = TCP_TIMEWAIT_LEN;
343 		}
344 
345 		inet_twsk_schedule(tw, &tcp_death_row, timeo,
346 				   TCP_TIMEWAIT_LEN);
347 		inet_twsk_put(tw);
348 	} else {
349 		/* Sorry, if we're out of memory, just CLOSE this
350 		 * socket up.  We've got bigger problems than
351 		 * non-graceful socket closings.
352 		 */
353 		LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");
354 	}
355 
356 	tcp_update_metrics(sk);
357 	tcp_done(sk);
358 }
359 
360 void tcp_twsk_destructor(struct sock *sk)
361 {
362 #ifdef CONFIG_TCP_MD5SIG
363 	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
364 	if (twsk->tw_md5_keylen)
365 		tcp_put_md5sig_pool();
366 #endif
367 }
368 
369 EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
370 
371 /* This is not only more efficient than what we used to do, it eliminates
372  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
373  *
374  * Actually, we could save a lot of memory writes here: the tp of the
375  * listening socket contains all the necessary default parameters.
376  */
377 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
378 {
379 	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
380 
381 	if (newsk != NULL) {
382 		const struct inet_request_sock *ireq = inet_rsk(req);
383 		struct tcp_request_sock *treq = tcp_rsk(req);
384 		struct inet_connection_sock *newicsk = inet_csk(newsk);
385 		struct tcp_sock *newtp;
386 
387 		/* Now setup tcp_sock */
388 		newtp = tcp_sk(newsk);
389 		newtp->pred_flags = 0;
390 		newtp->rcv_nxt = treq->rcv_isn + 1;
391 		newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1;
392 
393 		tcp_prequeue_init(newtp);
394 
395 		tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
396 
397 		newtp->srtt = 0;
398 		newtp->mdev = TCP_TIMEOUT_INIT;
399 		newicsk->icsk_rto = TCP_TIMEOUT_INIT;
400 
401 		newtp->packets_out = 0;
402 		newtp->left_out = 0;
403 		newtp->retrans_out = 0;
404 		newtp->sacked_out = 0;
405 		newtp->fackets_out = 0;
406 		newtp->snd_ssthresh = 0x7fffffff;
407 
408 		/* So many TCP implementations out there (incorrectly) count the
409 		 * initial SYN frame in their delayed-ACK and congestion control
410 		 * algorithms that we must have the following bandaid to talk
411 		 * efficiently to them.  -DaveM
412 		 */
413 		newtp->snd_cwnd = 2;
414 		newtp->snd_cwnd_cnt = 0;
415 		newtp->bytes_acked = 0;
416 
417 		newtp->frto_counter = 0;
418 		newtp->frto_highmark = 0;
419 
420 		newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
421 
422 		tcp_set_ca_state(newsk, TCP_CA_Open);
423 		tcp_init_xmit_timers(newsk);
424 		skb_queue_head_init(&newtp->out_of_order_queue);
425 		newtp->rcv_wup = treq->rcv_isn + 1;
426 		newtp->write_seq = treq->snt_isn + 1;
427 		newtp->pushed_seq = newtp->write_seq;
428 		newtp->copied_seq = treq->rcv_isn + 1;
429 
430 		newtp->rx_opt.saw_tstamp = 0;
431 
432 		newtp->rx_opt.dsack = 0;
433 		newtp->rx_opt.eff_sacks = 0;
434 
435 		newtp->rx_opt.num_sacks = 0;
436 		newtp->urg_data = 0;
437 
438 		if (sock_flag(newsk, SOCK_KEEPOPEN))
439 			inet_csk_reset_keepalive_timer(newsk,
440 						       keepalive_time_when(newtp));
441 
442 		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
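		/* Editor's note: rx_opt.sack_ok doubles as a small bit mask
		 * here: bit 0 means SACK was negotiated, and bit 1 (the
		 * "|= 2" below) enables the FACK heuristics on top of it.
		 */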
443 		if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
444 			if (sysctl_tcp_fack)
445 				newtp->rx_opt.sack_ok |= 2;
446 		}
447 		newtp->window_clamp = req->window_clamp;
448 		newtp->rcv_ssthresh = req->rcv_wnd;
449 		newtp->rcv_wnd = req->rcv_wnd;
450 		newtp->rx_opt.wscale_ok = ireq->wscale_ok;
451 		if (newtp->rx_opt.wscale_ok) {
452 			newtp->rx_opt.snd_wscale = ireq->snd_wscale;
453 			newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
454 		} else {
455 			newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
456 			newtp->window_clamp = min(newtp->window_clamp, 65535U);
457 		}
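		/* Editor's note: the window in this skb may be scaled because
		 * it is the ACK completing the handshake, not a SYN; RFC 1323
		 * only forbids scaling the window field carried in SYNs.
		 */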
458 		newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
459 		newtp->max_window = newtp->snd_wnd;
460 
461 		if (newtp->rx_opt.tstamp_ok) {
462 			newtp->rx_opt.ts_recent = req->ts_recent;
463 			newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
464 			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
465 		} else {
466 			newtp->rx_opt.ts_recent_stamp = 0;
467 			newtp->tcp_header_len = sizeof(struct tcphdr);
468 		}
469 #ifdef CONFIG_TCP_MD5SIG
470 		newtp->md5sig_info = NULL;	/*XXX*/
471 		if (newtp->af_specific->md5_lookup(sk, newsk))
472 			newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
473 #endif
474 		if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
475 			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
476 		newtp->rx_opt.mss_clamp = req->mss;
477 		TCP_ECN_openreq_child(newtp, req);
478 
479 		TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
480 	}
481 	return newsk;
482 }
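
/*
 * Editor's note: this helper is invoked from the address-family specific
 * ->syn_recv_sock() implementations (e.g. tcp_v4_syn_recv_sock()), which
 * then fill in the IPv4/IPv6 specific fields and hash the new socket.
 */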
483 
484 /*
485  *	Process an incoming packet for SYN_RECV sockets represented
486  *	as a request_sock.
487  */
488 
489 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
490 			   struct request_sock *req,
491 			   struct request_sock **prev)
492 {
493 	struct tcphdr *th = skb->h.th;
494 	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
495 	int paws_reject = 0;
496 	struct tcp_options_received tmp_opt;
497 	struct sock *child;
498 
499 	tmp_opt.saw_tstamp = 0;
500 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
501 		tcp_parse_options(skb, &tmp_opt, 0);
502 
503 		if (tmp_opt.saw_tstamp) {
504 			tmp_opt.ts_recent = req->ts_recent;
505 			/* We do not store the true stamp, but it is not required;
506 			 * it can be estimated (approximately)
507 			 * from other data.
508 			 */
509 			tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
510 			paws_reject = tcp_paws_check(&tmp_opt, th->rst);
511 		}
512 	}
513 
514 	/* Check for pure retransmitted SYN. */
515 	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
516 	    flg == TCP_FLAG_SYN &&
517 	    !paws_reject) {
518 		/*
519 		 * RFC 793 draws this case (incorrectly! it was fixed in
520 		 * RFC 1122) in figure 6 and figure 8, but the formal
521 		 * protocol description says NOTHING.
522 		 * To be more exact, it says that we should send an ACK,
523 		 * because this segment (at least, if it has no data)
524 		 * is out of window.
525 		 *
526 		 *  CONCLUSION: RFC 793 (even with RFC 1122) DOES NOT
527 		 *  describe the SYN-RECV state. The whole description
528 		 *  is wrong; we cannot believe it and should rely
529 		 *  only on common sense and implementation
530 		 *  experience.
531 		 *
532 		 * Enforce "SYN-ACK" according to figures 8 and 6
533 		 * of RFC 793, as fixed by RFC 1122.
534 		 */
535 		req->rsk_ops->rtx_syn_ack(sk, req, NULL);
536 		return NULL;
537 	}
538 
539 	/* What follows reproduces the section "SEGMENT ARRIVES"
540 	   for the SYN-RECEIVED state of RFC 793.
541 	   It is broken; the only case in which it actually fails,
542 	   however, is when SYNs are crossed.
543 
544 	   You would think that SYN crossing is impossible here, since
545 	   we should have a SYN_SENT socket (from connect()) on our end,
546 	   but this is not true if the crossed SYNs were sent to both
547 	   ends by a malicious third party.  We must defend against this,
548 	   and to do that we first verify the ACK (as per RFC793, page
549 	   36) and reset if it is invalid.  Is this a true full defense?
550 	   To convince ourselves, let us consider a way in which the ACK
551 	   test can still pass in this 'malicious crossed SYNs' case.
552 	   Malicious sender sends identical SYNs (and thus identical sequence
553 	   numbers) to both A and B:
554 
555 		A: gets SYN, seq=7
556 		B: gets SYN, seq=7
557 
558 	   By our good fortune, both A and B select the same initial
559 	   send sequence number of seven :-)
560 
561 		A: sends SYN|ACK, seq=7, ack_seq=8
562 		B: sends SYN|ACK, seq=7, ack_seq=8
563 
564 	   So we are now A eating this SYN|ACK, ACK test passes.  So
565 	   does sequence test, SYN is truncated, and thus we consider
566 	   it a bare ACK.
567 
568 	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
569 	   bare ACK.  Otherwise, we create an established connection.  Both
570 	   ends (listening sockets) accept the new incoming connection and try
571 	   to talk to each other. 8-)
572 
573 	   Note: this case is both harmless and rare. The probability is about
574 	   the same as us discovering intelligent life on another planet tomorrow.
575 
576 	   But generally we should (the RFC lies!) accept an ACK of our
577 	   SYN-ACK both here and in tcp_rcv_state_process();
578 	   tcp_rcv_state_process() does not, hence we do not either.
579 
580 	   Note that the case is absolutely generic:
581 	   we cannot optimize anything here without
582 	   violating the protocol. All the checks must be made
583 	   before attempting to create the socket.
584 	 */
585 
586 	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
587 	 *                  and the incoming segment acknowledges something not yet
588 	 *                  sent (the segment carries an unacceptable ACK) ...
589 	 *                  a reset is sent."
590 	 *
591 	 * Invalid ACK: reset will be sent by listening socket
592 	 */
593 	if ((flg & TCP_FLAG_ACK) &&
594 	    (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
595 		return sk;
596 
597 	/* Also, it would not be a bad idea to check rcv_tsecr, which is
598 	 * essentially an ACK extension; too-early or too-late values
599 	 * should cause a reset in unsynchronized states.
600 	 */
601 
602 	/* RFC793: "first check sequence number". */
603 
604 	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
605 					  tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
606 		/* Out of window: send ACK and drop. */
607 		if (!(flg & TCP_FLAG_RST))
608 			req->rsk_ops->send_ack(skb, req);
609 		if (paws_reject)
610 			NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
611 		return NULL;
612 	}
613 
614 	/* In sequence, PAWS is OK. */
615 
616 	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
617 		req->ts_recent = tmp_opt.rcv_tsval;
618 
619 	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
620 		/* Truncate SYN, it is out of window starting
621 		   at tcp_rsk(req)->rcv_isn + 1. */
622 		flg &= ~TCP_FLAG_SYN;
623 	}
624 
625 	/* RFC793: "second check the RST bit" and
626 	 *	   "fourth, check the SYN bit"
627 	 */
628 	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
629 		TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
630 		goto embryonic_reset;
631 	}
632 
633 	/* ACK sequence verified above, just make sure ACK is
634 	 * set.  If ACK not set, just silently drop the packet.
635 	 */
636 	if (!(flg & TCP_FLAG_ACK))
637 		return NULL;
638 
639 	/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
640 	if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
641 	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
642 		inet_rsk(req)->acked = 1;
643 		return NULL;
644 	}
645 
646 	/* OK, ACK is valid, create big socket and
647 	 * feed this segment to it. It will repeat all
648 	 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
649 	 * ESTABLISHED STATE. If it will be dropped after
650 	 * socket is created, wait for troubles.
651 	 */
652 	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
653 							  req, NULL);
654 	if (child == NULL)
655 		goto listen_overflow;
656 #ifdef CONFIG_TCP_MD5SIG
657 	else {
658 		/* Copy over the MD5 key from the original socket */
659 		struct tcp_md5sig_key *key;
660 		struct tcp_sock *tp = tcp_sk(sk);
661 		key = tp->af_specific->md5_lookup(sk, child);
662 		if (key != NULL) {
663 			/*
664 			 * We're using one, so create a matching key on the
665 			 * newsk structure. If we fail to get memory then we
666 			 * end up not copying the key across. Shucks.
667 			 */
668 			char *newkey = kmemdup(key->key, key->keylen,
669 					       GFP_ATOMIC);
670 			if (newkey) {
671 				if (!tcp_alloc_md5sig_pool())
672 					BUG();
673 				tp->af_specific->md5_add(child, child,
674 							 newkey,
675 							 key->keylen);
676 			}
677 		}
678 	}
679 #endif
680 
681 	inet_csk_reqsk_queue_unlink(sk, req, prev);
682 	inet_csk_reqsk_queue_removed(sk, req);
683 
684 	inet_csk_reqsk_queue_add(sk, req, child);
685 	return child;
686 
687 listen_overflow:
688 	if (!sysctl_tcp_abort_on_overflow) {
689 		inet_rsk(req)->acked = 1;
690 		return NULL;
691 	}
692 
693 embryonic_reset:
694 	NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
695 	if (!(flg & TCP_FLAG_RST))
696 		req->rsk_ops->send_reset(sk, skb);
697 
698 	inet_csk_reqsk_queue_drop(sk, req, prev);
699 	return NULL;
700 }
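
/*
 * Editor's note on the three possible outcomes of tcp_check_req():
 *	- a new child socket: the handshake completed and the caller feeds
 *	  this same segment to the child;
 *	- NULL: the segment was consumed or silently dropped (retransmitted
 *	  SYN, out-of-window segment, deferred bare ACK, listen overflow);
 *	- the listening socket itself: the ACK was invalid and the caller is
 *	  expected to send a reset.
 */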
701 
702 /*
703  * Queue segment on the new socket if the new socket is active,
704  * otherwise we just shortcircuit this and continue with
705  * the new socket.
706  */
707 
708 int tcp_child_process(struct sock *parent, struct sock *child,
709 		      struct sk_buff *skb)
710 {
711 	int ret = 0;
712 	int state = child->sk_state;
713 
714 	if (!sock_owned_by_user(child)) {
715 		ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
716 
717 		/* Wakeup parent, send SIGIO */
718 		if (state == TCP_SYN_RECV && child->sk_state != state)
719 			parent->sk_data_ready(parent, 0);
720 	} else {
721 		/* Alas, it is possible again, because we do the lookup
722 		 * in the main socket hash table and the lock on the
723 		 * listening socket no longer protects us.
724 		 */
725 		sk_add_backlog(child, skb);
726 	}
727 
728 	bh_unlock_sock(child);
729 	sock_put(child);
730 	return ret;
731 }
732 
733 EXPORT_SYMBOL(tcp_check_req);
734 EXPORT_SYMBOL(tcp_child_process);
735 EXPORT_SYMBOL(tcp_create_openreq_child);
736 EXPORT_SYMBOL(tcp_timewait_state_process);
737