xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_time_wait.c (revision 359db861fd14071f8a25831efe3bf3790980d071)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, Joyent Inc. All rights reserved.
25  */
26 
27 /*
28  * This file contains functions related to TCP time wait processing.  Also
29  * refer to the time wait handling comments in tcp_impl.h.
30  */
31 
32 #include <sys/types.h>
33 #include <sys/strsun.h>
34 #include <sys/squeue_impl.h>
35 #include <sys/squeue.h>
36 #include <sys/callo.h>
37 
38 #include <inet/common.h>
39 #include <inet/ip.h>
40 #include <inet/tcp.h>
41 #include <inet/tcp_impl.h>
42 #include <inet/tcp_cluster.h>
43 
44 static void	tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *);
45 
46 /*
47  * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
48  * Running it every 5 seconds seems to give the best results.
 *
 * The value is an hrtime_t expressed in nanoseconds (5 * NANOSEC).  In this
 * file it is used in two places: tcp_time_wait_append() uses it as the
 * callout delay when a localhost connection has to start the collector
 * timer, and tcp_time_wait_collector() uses it as the lower bound on the
 * delay when it re-arms itself, so the collector never reschedules itself
 * sooner than this.
49  */
50 #define	TCP_TIME_WAIT_DELAY ((hrtime_t)5 * NANOSEC)
51 
52 /*
53  * Remove a connection from the list of detached TIME_WAIT connections.
54  * It returns B_FALSE if it can't remove the connection from the list
55  * as the connection has already been removed from the list due to an
56  * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
57  */
58 boolean_t
59 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
60 {
61 	boolean_t	locked = B_FALSE;
62 
63 	if (tcp_time_wait == NULL) {
64 		tcp_time_wait = *((tcp_squeue_priv_t **)
65 		    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
66 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
67 		locked = B_TRUE;
68 	} else {
69 		ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
70 	}
71 
72 	/* 0 means that the tcp_t has not been added to the time wait list. */
73 	if (tcp->tcp_time_wait_expire == 0) {
74 		ASSERT(tcp->tcp_time_wait_next == NULL);
75 		ASSERT(tcp->tcp_time_wait_prev == NULL);
76 		if (locked)
77 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
78 		return (B_FALSE);
79 	}
80 	ASSERT(TCP_IS_DETACHED(tcp));
81 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
82 
83 	if (tcp == tcp_time_wait->tcp_time_wait_head) {
84 		ASSERT(tcp->tcp_time_wait_prev == NULL);
85 		tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
86 		if (tcp_time_wait->tcp_time_wait_head != NULL) {
87 			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
88 			    NULL;
89 		} else {
90 			tcp_time_wait->tcp_time_wait_tail = NULL;
91 		}
92 	} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
93 		ASSERT(tcp->tcp_time_wait_next == NULL);
94 		tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
95 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
96 		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
97 	} else {
98 		ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
99 		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
100 		tcp->tcp_time_wait_prev->tcp_time_wait_next =
101 		    tcp->tcp_time_wait_next;
102 		tcp->tcp_time_wait_next->tcp_time_wait_prev =
103 		    tcp->tcp_time_wait_prev;
104 	}
105 	tcp->tcp_time_wait_next = NULL;
106 	tcp->tcp_time_wait_prev = NULL;
107 	tcp->tcp_time_wait_expire = 0;
108 
109 	if (locked)
110 		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
111 	return (B_TRUE);
112 }
113 
114 /* Constants used for fast checking of a localhost address */
/*
 * IPv4_LH_MASK selects three of the four address bytes and IPv4_LOCALHOST is
 * the value those bytes must have.  On big-endian the pair selects the three
 * most significant bytes and requires 0x7f, 0x00, 0x00, i.e. 127.0.0.x.  The
 * little-endian pair is the byte-swapped equivalent, which presumably means
 * conn_laddr_v4 is held in network byte order -- TODO confirm.  Note that
 * this matches 127.0.0.0/24 only, not all of 127.0.0.0/8.
 */
115 #if defined(_BIG_ENDIAN)
116 #define	IPv4_LOCALHOST	0x7f000000U
117 #define	IPv4_LH_MASK	0xffffff00U
118 #else
119 #define	IPv4_LOCALHOST	0x0000007fU
120 #define	IPv4_LH_MASK	0x00ffffffU
121 #endif
122 
/*
 * IS_LOCAL_HOST(x) is true when the local address of tcp_t x is a localhost
 * address: for an IPv4 connection the masked conn_laddr_v4 must equal
 * IPv4_LOCALHOST, for an IPv6 connection conn_laddr_v6 must satisfy
 * IN6_IS_ADDR_LOOPBACK().  The argument is evaluated several times, so it
 * must not have side effects.
 */
123 #define	IS_LOCAL_HOST(x)	( \
124 	((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \
125 	((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \
126 	((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
127 	IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))
128 
129 /*
130  * Add a connection to the list of detached TIME_WAIT connections
131  * and set its time to expire.
132  */
133 void
134 tcp_time_wait_append(tcp_t *tcp)
135 {
136 	tcp_stack_t	*tcps = tcp->tcp_tcps;
137 	squeue_t	*sqp = tcp->tcp_connp->conn_sqp;
138 	tcp_squeue_priv_t *tcp_time_wait =
139 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
140 	hrtime_t firetime = 0;
141 
142 	tcp_timers_stop(tcp);
143 
144 	/* Freed above */
145 	ASSERT(tcp->tcp_timer_tid == 0);
146 	ASSERT(tcp->tcp_ack_tid == 0);
147 
148 	/* must have happened at the time of detaching the tcp */
149 	ASSERT(tcp->tcp_ptpahn == NULL);
150 	ASSERT(tcp->tcp_flow_stopped == 0);
151 	ASSERT(tcp->tcp_time_wait_next == NULL);
152 	ASSERT(tcp->tcp_time_wait_prev == NULL);
153 	ASSERT(tcp->tcp_time_wait_expire == 0);
154 	ASSERT(tcp->tcp_listener == NULL);
155 
156 	tcp->tcp_time_wait_expire = ddi_get_lbolt64();
157 	if (IS_LOCAL_HOST(tcp)) {
158 		/*
159 		 * This is the fastpath for handling localhost connections.
160 		 * Since we don't have to worry about packets on the localhost
161 		 * showing up after a long network delay, we want to expire
162 		 * these quickly so the port range on the localhost doesn't
163 		 * get starved by short-running, local apps.
164 		 *
165 		 * Leave tcp_time_wait_expire at the current time. This
166 		 * essentially means the connection is expired now and it will
167 		 * clean up the next time tcp_time_wait_collector runs.  We set
168 		 * firetime to use a short delay so that if we have to start a
169 		 * tcp_time_wait_collector thread below, it runs soon instead
170 		 * of after a delay of time_wait_interval. firetime being set
171 		 * to a non-0 value is also our indicator that we should add
172 		 * this connection to the head of the time wait list (since we
173 		 * are already expired) so that its sure to get cleaned up on
174 		 * the next run of tcp_time_wait_collector (which expects the
175 		 * entries to appear in time-order and stops when it hits the
176 		 * first non-expired entry).
177 		 */
178 		firetime = TCP_TIME_WAIT_DELAY;
179 	} else {
180 		/*
181 		 * Since tcp_time_wait_expire is lbolt64, it should not wrap
182 		 * around in practice.  Hence it cannot be 0.  Note that zero
183 		 * means that the tcp_t is not in the TIME_WAIT list.
184 		 */
185 		tcp->tcp_time_wait_expire += MSEC_TO_TICK(
186 		    tcps->tcps_time_wait_interval);
187 	}
188 
189 	ASSERT(TCP_IS_DETACHED(tcp));
190 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
191 	ASSERT(tcp->tcp_time_wait_next == NULL);
192 	ASSERT(tcp->tcp_time_wait_prev == NULL);
193 	TCP_DBGSTAT(tcps, tcp_time_wait);
194 
195 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
196 	if (tcp_time_wait->tcp_time_wait_head == NULL) {
197 		ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
198 		tcp_time_wait->tcp_time_wait_head = tcp;
199 
200 		/*
201 		 * Even if the list was empty before, there may be a timer
202 		 * running since a tcp_t can be removed from the list
203 		 * in other places, such as tcp_clean_death().  So check if
204 		 * a timer is needed.
205 		 */
206 		if (tcp_time_wait->tcp_time_wait_tid == 0) {
207 			if (firetime == 0)
208 				firetime = (hrtime_t)
209 				    (tcps->tcps_time_wait_interval + 1) *
210 				    MICROSEC;
211 
212 			tcp_time_wait->tcp_time_wait_tid =
213 			    timeout_generic(CALLOUT_NORMAL,
214 			    tcp_time_wait_collector, sqp, firetime,
215 			    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
216 		}
217 		tcp_time_wait->tcp_time_wait_tail = tcp;
218 	} else {
219 		/*
220 		 * The list is not empty, so a timer must be running.  If not,
221 		 * tcp_time_wait_collector() must be running on this
222 		 * tcp_time_wait list at the same time.
223 		 */
224 		ASSERT(tcp_time_wait->tcp_time_wait_tid != 0 ||
225 		    tcp_time_wait->tcp_time_wait_running);
226 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
227 		ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
228 		    TCPS_TIME_WAIT);
229 
230 		if (firetime == 0) {
231 			/* add at end */
232 			tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next =
233 			    tcp;
234 			tcp->tcp_time_wait_prev =
235 			    tcp_time_wait->tcp_time_wait_tail;
236 			tcp_time_wait->tcp_time_wait_tail = tcp;
237 		} else {
238 			/* add at head */
239 			tcp->tcp_time_wait_next =
240 			    tcp_time_wait->tcp_time_wait_head;
241 			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
242 			    tcp;
243 			tcp_time_wait->tcp_time_wait_head = tcp;
244 		}
245 	}
246 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
247 }
248 
249 /*
250  * Wrapper to call tcp_close_detached() via squeue to clean up TIME-WAIT
251  * tcp_t.  Used in tcp_time_wait_collector().
252  */
253 /* ARGSUSED */
254 static void
255 tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
256 {
257 	conn_t	*connp = (conn_t *)arg;
258 	tcp_t	*tcp = connp->conn_tcp;
259 
260 	ASSERT(tcp != NULL);
261 	if (tcp->tcp_state == TCPS_CLOSED) {
262 		return;
263 	}
264 
265 	ASSERT((connp->conn_family == AF_INET &&
266 	    connp->conn_ipversion == IPV4_VERSION) ||
267 	    (connp->conn_family == AF_INET6 &&
268 	    (connp->conn_ipversion == IPV4_VERSION ||
269 	    connp->conn_ipversion == IPV6_VERSION)));
270 	ASSERT(!tcp->tcp_listener);
271 
272 	ASSERT(TCP_IS_DETACHED(tcp));
273 
274 	/*
275 	 * Because they have no upstream client to rebind or tcp_close()
276 	 * them later, we axe the connection here and now.
277 	 */
278 	tcp_close_detached(tcp);
279 }
280 
281 /*
282  * Blows away all tcps whose TIME_WAIT has expired. List traversal
283  * is done forwards from the head.
284  * This walks all stack instances since
285  * tcp_time_wait remains global across all stacks.
286  */
287 /* ARGSUSED */
288 void
289 tcp_time_wait_collector(void *arg)
290 {
291 	tcp_t *tcp;
292 	int64_t now;
293 	mblk_t *mp;
294 	conn_t *connp;
295 	kmutex_t *lock;
296 	boolean_t removed;
297 	extern void (*cl_inet_disconnect)(netstackid_t, uint8_t, sa_family_t,
298 	    uint8_t *, in_port_t, uint8_t *, in_port_t, void *);
299 
300 	squeue_t *sqp = (squeue_t *)arg;
301 	tcp_squeue_priv_t *tcp_time_wait =
302 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
303 
304 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
305 	tcp_time_wait->tcp_time_wait_tid = 0;
306 #ifdef DEBUG
307 	tcp_time_wait->tcp_time_wait_running = B_TRUE;
308 #endif
309 
310 	if (tcp_time_wait->tcp_free_list != NULL &&
311 	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
312 		TCP_G_STAT(tcp_freelist_cleanup);
313 		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
314 			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
315 			tcp->tcp_time_wait_next = NULL;
316 			tcp_time_wait->tcp_free_list_cnt--;
317 			ASSERT(tcp->tcp_tcps == NULL);
318 			CONN_DEC_REF(tcp->tcp_connp);
319 		}
320 		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
321 	}
322 
323 	/*
324 	 * In order to reap time waits reliably, we should use a
325 	 * source of time that is not adjustable by the user -- hence
326 	 * the call to ddi_get_lbolt64().
327 	 */
328 	now = ddi_get_lbolt64();
329 	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
330 		/*
331 		 * lbolt64 should not wrap around in practice...  So we can
332 		 * do a direct comparison.
333 		 */
334 		if (now < tcp->tcp_time_wait_expire)
335 			break;
336 
337 		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
338 		ASSERT(removed);
339 
340 		connp = tcp->tcp_connp;
341 		ASSERT(connp->conn_fanout != NULL);
342 		lock = &connp->conn_fanout->connf_lock;
343 		/*
344 		 * This is essentially a TW reclaim fast path optimization for
345 		 * performance where the timewait collector checks under the
346 		 * fanout lock (so that no one else can get access to the
347 		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
348 		 * the classifier hash list. If ref count is indeed 2, we can
349 		 * just remove the conn under the fanout lock and avoid
350 		 * cleaning up the conn under the squeue, provided that
351 		 * clustering callbacks are not enabled. If clustering is
352 		 * enabled, we need to make the clustering callback before
353 		 * setting the CONDEMNED flag and after dropping all locks and
354 		 * so we forego this optimization and fall back to the slow
355 		 * path. Also please see the comments in tcp_closei_local
356 		 * regarding the refcnt logic.
357 		 *
358 		 * Since we are holding the tcp_time_wait_lock, its better
359 		 * not to block on the fanout_lock because other connections
360 		 * can't add themselves to time_wait list. So we do a
361 		 * tryenter instead of mutex_enter.
362 		 */
363 		if (mutex_tryenter(lock)) {
364 			mutex_enter(&connp->conn_lock);
365 			if ((connp->conn_ref == 2) &&
366 			    (cl_inet_disconnect == NULL)) {
367 				ipcl_hash_remove_locked(connp,
368 				    connp->conn_fanout);
369 				/*
370 				 * Set the CONDEMNED flag now itself so that
371 				 * the refcnt cannot increase due to any
372 				 * walker.
373 				 */
374 				connp->conn_state_flags |= CONN_CONDEMNED;
375 				mutex_exit(lock);
376 				mutex_exit(&connp->conn_lock);
377 				if (tcp_time_wait->tcp_free_list_cnt <
378 				    tcp_free_list_max_cnt) {
379 					/* Add to head of tcp_free_list */
380 					mutex_exit(
381 					    &tcp_time_wait->tcp_time_wait_lock);
382 					tcp_cleanup(tcp);
383 					ASSERT(connp->conn_latch == NULL);
384 					ASSERT(connp->conn_policy == NULL);
385 					ASSERT(tcp->tcp_tcps == NULL);
386 					ASSERT(connp->conn_netstack == NULL);
387 
388 					mutex_enter(
389 					    &tcp_time_wait->tcp_time_wait_lock);
390 					tcp->tcp_time_wait_next =
391 					    tcp_time_wait->tcp_free_list;
392 					tcp_time_wait->tcp_free_list = tcp;
393 					tcp_time_wait->tcp_free_list_cnt++;
394 					continue;
395 				} else {
396 					/* Do not add to tcp_free_list */
397 					mutex_exit(
398 					    &tcp_time_wait->tcp_time_wait_lock);
399 					tcp_bind_hash_remove(tcp);
400 					ixa_cleanup(tcp->tcp_connp->conn_ixa);
401 					tcp_ipsec_cleanup(tcp);
402 					CONN_DEC_REF(tcp->tcp_connp);
403 				}
404 			} else {
405 				CONN_INC_REF_LOCKED(connp);
406 				mutex_exit(lock);
407 				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
408 				mutex_exit(&connp->conn_lock);
409 				/*
410 				 * We can reuse the closemp here since conn has
411 				 * detached (otherwise we wouldn't even be in
412 				 * time_wait list). tcp_closemp_used can safely
413 				 * be changed without taking a lock as no other
414 				 * thread can concurrently access it at this
415 				 * point in the connection lifecycle.
416 				 */
417 
418 				if (tcp->tcp_closemp.b_prev == NULL)
419 					tcp->tcp_closemp_used = B_TRUE;
420 				else
421 					cmn_err(CE_PANIC,
422 					    "tcp_timewait_collector: "
423 					    "concurrent use of tcp_closemp: "
424 					    "connp %p tcp %p\n", (void *)connp,
425 					    (void *)tcp);
426 
427 				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
428 				mp = &tcp->tcp_closemp;
429 				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
430 				    tcp_timewait_close, connp, NULL,
431 				    SQ_FILL, SQTAG_TCP_TIMEWAIT);
432 			}
433 		} else {
434 			mutex_enter(&connp->conn_lock);
435 			CONN_INC_REF_LOCKED(connp);
436 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
437 			mutex_exit(&connp->conn_lock);
438 			/*
439 			 * We can reuse the closemp here since conn has
440 			 * detached (otherwise we wouldn't even be in
441 			 * time_wait list). tcp_closemp_used can safely
442 			 * be changed without taking a lock as no other
443 			 * thread can concurrently access it at this
444 			 * point in the connection lifecycle.
445 			 */
446 
447 			if (tcp->tcp_closemp.b_prev == NULL)
448 				tcp->tcp_closemp_used = B_TRUE;
449 			else
450 				cmn_err(CE_PANIC, "tcp_timewait_collector: "
451 				    "concurrent use of tcp_closemp: "
452 				    "connp %p tcp %p\n", (void *)connp,
453 				    (void *)tcp);
454 
455 			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
456 			mp = &tcp->tcp_closemp;
457 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
458 			    tcp_timewait_close, connp, NULL,
459 			    SQ_FILL, SQTAG_TCP_TIMEWAIT);
460 		}
461 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
462 	}
463 
464 	if (tcp_time_wait->tcp_free_list != NULL)
465 		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
466 
467 	/*
468 	 * If the time wait list is not empty and there is no timer running,
469 	 * restart it.
470 	 */
471 	if ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL &&
472 	    tcp_time_wait->tcp_time_wait_tid == 0) {
473 		hrtime_t firetime;
474 
475 		/* shouldn't be necessary, but just in case */
476 		if (tcp->tcp_time_wait_expire < now)
477 			tcp->tcp_time_wait_expire = now;
478 
479 		firetime = TICK_TO_NSEC(tcp->tcp_time_wait_expire - now);
480 		/* This ensures that we won't wake up too often. */
481 		firetime = MAX(TCP_TIME_WAIT_DELAY, firetime);
482 		tcp_time_wait->tcp_time_wait_tid =
483 		    timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector,
484 		    sqp, firetime, CALLOUT_TCP_RESOLUTION,
485 		    CALLOUT_FLAG_ROUNDUP);
486 	}
487 #ifdef DEBUG
488 	tcp_time_wait->tcp_time_wait_running = B_FALSE;
489 #endif
490 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
491 }
492 
493 /*
494  * tcp_time_wait_processing() handles processing of incoming packets when
495  * the tcp_t is in the TIME_WAIT state.
496  *
497  * A TIME_WAIT tcp_t that has an associated open TCP end point (not in
498  * detached state) is never put on the time wait list.
 *
 * Arguments:
 *	tcp	the connection the segment was matched to
 *	mp	the received segment; it is always consumed, either by
 *		freemsg() at the "done" label or by being handed to
 *		tcp_reinput() when a new SYN takes over the connection
 *	seg_seq	sequence number of the segment
 *	seg_ack	acknowledgment number of the segment
 *	seg_len	length of the segment; trimmed below as duplicate and
 *		out-of-window parts are discarded
 *	tcpha	TCP header of the segment
 *	ira	receive attributes, used only when the segment is
 *		re-classified and re-input
 *
 * Outline:
 *	- when timestamps are in use, drop segments that lack one and answer
 *	  segments that fail the PAWS check with an ACK
 *	- trim data that precedes tcp_rnxt; a duplicate FIN restarts the
 *	  TIME_WAIT period and is ACKed
 *	- a SYN beyond the receive window kills the old connection via
 *	  tcp_clean_death() and is re-input so it can start a new one
 *	- trim data beyond the receive window
 *	- an acceptable RST kills the connection; any other SYN is answered
 *	  with RST|ACK
 *	- otherwise send an ACK if TH_ACK_NEEDED was set along the way
499  */
500 void
501 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
502     uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira)
503 {
504 	int32_t		bytes_acked;
505 	int32_t		gap;
506 	int32_t		rgap;
507 	tcp_opt_t	tcpopt;
508 	uint_t		flags;
509 	uint32_t	new_swnd = 0;
510 	conn_t		*nconnp;
511 	conn_t		*connp = tcp->tcp_connp;
512 	tcp_stack_t	*tcps = tcp->tcp_tcps;
513 
514 	BUMP_LOCAL(tcp->tcp_ibsegs);
515 	DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
516 
	/*
	 * flags starts out as the low 8 bits of the header flags; the
	 * internal TH_ACK_NEEDED marker is OR'ed in further down.  The
	 * advertised window is scaled by tcp_snd_ws unless the segment
	 * carries SYN.
	 */
517 	flags = (unsigned int)tcpha->tha_flags & 0xFF;
518 	new_swnd = ntohs(tcpha->tha_win) <<
519 	    ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
520 
	/*
	 * tcpopt is only filled in when timestamps are in use on this
	 * connection and the segment is not an RST.  The tcp_ts_recent
	 * update further down is guarded by the same condition.
	 */
521 	if (tcp->tcp_snd_ts_ok && !(tcpha->tha_flags & TH_RST)) {
522 		int options;
523 		if (tcp->tcp_snd_sack_ok)
524 			tcpopt.tcp = tcp;
525 		else
526 			tcpopt.tcp = NULL;
527 		options = tcp_parse_options(tcpha, &tcpopt);
528 		if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
			/* Timestamp expected but missing: drop silently. */
529 			DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
530 			goto done;
531 		} else if (!tcp_paws_check(tcp, &tcpopt)) {
			/* PAWS check failed: ACK and drop the segment. */
532 			tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt,
533 			    TH_ACK);
534 			goto done;
535 		}
536 	}
	/*
	 * gap < 0 means the segment starts before tcp_rnxt, i.e. it carries
	 * data we have already received.  rgap < 0 means the segment extends
	 * past the end of the receive window.
	 */
537 	gap = seg_seq - tcp->tcp_rnxt;
538 	rgap = tcp->tcp_rwnd - (gap + seg_len);
539 	if (gap < 0) {
540 		TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
541 		TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
542 		    (seg_len > -gap ? -gap : seg_len));
		/* Drop the duplicate part; what remains may be <= 0. */
543 		seg_len += gap;
544 		if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
545 			if (flags & TH_RST) {
546 				goto done;
547 			}
548 			if ((flags & TH_FIN) && seg_len == -1) {
549 				/*
550 				 * When TCP receives a duplicate FIN in
551 				 * TIME_WAIT state, restart the 2 MSL timer.
552 				 * See page 73 in RFC 793. Make sure this TCP
553 				 * is already on the TIME_WAIT list. If not,
554 				 * just restart the timer.
555 				 */
556 				if (TCP_IS_DETACHED(tcp)) {
					/*
					 * Re-appending gives the entry a
					 * fresh expire time.
					 */
557 					if (tcp_time_wait_remove(tcp, NULL) ==
558 					    B_TRUE) {
559 						tcp_time_wait_append(tcp);
560 						TCP_DBGSTAT(tcps,
561 						    tcp_rput_time_wait);
562 					}
563 				} else {
564 					ASSERT(tcp != NULL);
565 					TCP_TIMER_RESTART(tcp,
566 					    tcps->tcps_time_wait_interval);
567 				}
568 				tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
569 				    tcp->tcp_rnxt, TH_ACK);
570 				goto done;
571 			}
			/* Entirely duplicate segment: just ACK it. */
572 			flags |=  TH_ACK_NEEDED;
573 			seg_len = 0;
574 			goto process_ack;
575 		}
576 
577 		/* Fix seg_seq, and chew the gap off the front. */
578 		seg_seq = tcp->tcp_rnxt;
579 	}
580 
	/*
	 * A SYN that starts beyond tcp_rnxt and does not fit in the receive
	 * window is treated as an attempt to open a new connection on the
	 * same address/port pair: the old connection is killed and the SYN
	 * is re-input.
	 */
581 	if ((flags & TH_SYN) && gap > 0 && rgap < 0) {
582 		/*
583 		 * Make sure that when we accept the connection, pick
584 		 * an ISS greater than (tcp_snxt + tcp_iss_incr/2) for the
585 		 * old connection.
586 		 *
587 		 * The next ISS generated is equal to tcp_iss_incr_extra
588 		 * + tcp_iss_incr/2 + other components depending on the
589 		 * value of tcp_strong_iss.  We pre-calculate the new
590 		 * ISS here and compare with tcp_snxt to determine if
591 		 * we need to make adjustment to tcp_iss_incr_extra.
592 		 *
593 		 * The above calculation is ugly and is a
594 		 * waste of CPU cycles...
595 		 */
596 		uint32_t new_iss = tcps->tcps_iss_incr_extra;
597 		int32_t adj;
		/* Fetched now; tcp must not be used after tcp_clean_death(). */
598 		ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
599 
600 		switch (tcps->tcps_strong_iss) {
601 		case 2: {
602 			/* Add time and MD5 components. */
603 			uint32_t answer[4];
604 			struct {
605 				uint32_t ports;
606 				in6_addr_t src;
607 				in6_addr_t dst;
608 			} arg;
609 			MD5_CTX context;
610 
			/* Work on a private copy of the keyed MD5 context. */
611 			mutex_enter(&tcps->tcps_iss_key_lock);
612 			context = tcps->tcps_iss_key;
613 			mutex_exit(&tcps->tcps_iss_key_lock);
614 			arg.ports = connp->conn_ports;
615 			/* We use MAPPED addresses in tcp_iss_init */
616 			arg.src = connp->conn_laddr_v6;
617 			arg.dst = connp->conn_faddr_v6;
618 			MD5Update(&context, (uchar_t *)&arg,
619 			    sizeof (arg));
620 			MD5Final((uchar_t *)answer, &context);
			/* Fold the 128-bit digest into 32 bits. */
621 			answer[0] ^= answer[1] ^ answer[2] ^ answer[3];
622 			new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0];
623 			break;
624 		}
625 		case 1:
626 			/* Add time component and min random (i.e. 1). */
627 			new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1;
628 			break;
629 		default:
630 			/* Add only time component. */
631 			new_iss += (uint32_t)gethrestime_sec() *
632 			    tcps->tcps_iss_incr;
633 			break;
634 		}
635 		if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) {
636 			/*
637 			 * New ISS not guaranteed to be tcp_iss_incr/2
638 			 * ahead of the current tcp_snxt, so add the
639 			 * difference to tcp_iss_incr_extra.
640 			 */
641 			tcps->tcps_iss_incr_extra += adj;
642 		}
643 		/*
644 		 * If tcp_clean_death() can not perform the task now,
645 		 * drop the SYN packet and let the other side re-xmit.
646 		 * Otherwise pass the SYN packet back in, since the
647 		 * old tcp state has been cleaned up or freed.
648 		 */
649 		if (tcp_clean_death(tcp, 0) == -1)
650 			goto done;
651 		nconnp = ipcl_classify(mp, ira, ipst);
652 		if (nconnp != NULL) {
653 			TCP_STAT(tcps, tcp_time_wait_syn_success);
654 			/* Drops ref on nconnp */
655 			tcp_reinput(nconnp, mp, ira, ipst);
			/* mp now belongs to tcp_reinput(); do not free it. */
656 			return;
657 		}
658 		goto done;
659 	}
660 
661 	/*
662 	 * rgap is the amount of stuff received out of window.  A negative
663 	 * value is the amount out of window.
664 	 */
665 	if (rgap < 0) {
666 		TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
667 		TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
668 		/* Fix seg_len and make sure there is something left. */
669 		seg_len += rgap;
670 		if (seg_len <= 0) {
671 			if (flags & TH_RST) {
672 				goto done;
673 			}
674 			flags |=  TH_ACK_NEEDED;
675 			seg_len = 0;
676 			goto process_ack;
677 		}
678 	}
679 	/*
680 	 * Check whether we can update tcp_ts_recent. This test is from RFC
681 	 * 7323, section 5.3.
682 	 */
683 	if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) &&
684 	    TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
685 	    SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
686 		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
687 		tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
688 	}
689 
690 	if (seg_seq != tcp->tcp_rnxt && seg_len > 0) {
691 		/* Always ack out of order packets */
692 		flags |= TH_ACK_NEEDED;
693 		seg_len = 0;
694 	} else if (seg_len > 0) {
		/* In-order data for a closed end point: only counted. */
695 		TCPS_BUMP_MIB(tcps, tcpInClosed);
696 		TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
697 		TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
698 	}
699 	if (flags & TH_RST) {
		/* Acceptable RST: tear the connection down. */
700 		(void) tcp_clean_death(tcp, 0);
701 		goto done;
702 	}
703 	if (flags & TH_SYN) {
704 		tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
705 		    TH_RST|TH_ACK);
706 		/*
707 		 * Do not delete the TCP structure if it is in
708 		 * TIME_WAIT state.  Refer to RFC 1122, 4.2.2.13.
709 		 */
710 		goto done;
711 	}
712 process_ack:
713 	if (flags & TH_ACK) {
714 		bytes_acked = (int)(seg_ack - tcp->tcp_suna);
715 		if (bytes_acked <= 0) {
			/* Counted as a duplicate ACK only if nothing changed. */
716 			if (bytes_acked == 0 && seg_len == 0 &&
717 			    new_swnd == tcp->tcp_swnd)
718 				TCPS_BUMP_MIB(tcps, tcpInDupAck);
719 		} else {
720 			/* Acks something not sent */
721 			flags |= TH_ACK_NEEDED;
722 		}
723 	}
724 	if (flags & TH_ACK_NEEDED) {
725 		/*
726 		 * Time to send an ack for some reason.
727 		 */
728 		tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
729 		    tcp->tcp_rnxt, TH_ACK);
730 	}
731 done:
	/* Common exit for every path that still owns mp. */
732 	freemsg(mp);
733 }
734