xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_time_wait.c (revision cffcfaee1e6b29ef9ceb7d80e4e053ffd029906b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, Joyent Inc. All rights reserved.
25  */
26 
27 /*
28  * This file contains functions related to TCP time wait processing.  Also
29  * refer to the time wait handling comments in tcp_impl.h.
30  */
31 
32 #include <sys/types.h>
33 #include <sys/strsun.h>
34 #include <sys/squeue_impl.h>
35 #include <sys/squeue.h>
36 #include <sys/callo.h>
37 
38 #include <inet/common.h>
39 #include <inet/ip.h>
40 #include <inet/tcp.h>
41 #include <inet/tcp_impl.h>
42 #include <inet/tcp_cluster.h>
43 
44 static void	tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *);
45 
46 /*
47  * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
48  * Running it every 5 seconds seems to give the best results.
49  */
50 #define	TCP_TIME_WAIT_DELAY ((hrtime_t)5 * NANOSEC)
51 
52 /*
53  * Remove a connection from the list of detached TIME_WAIT connections.
54  * It returns B_FALSE if it can't remove the connection from the list
55  * as the connection has already been removed from the list due to an
56  * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
57  */
58 boolean_t
59 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
60 {
61 	boolean_t	locked = B_FALSE;
62 
63 	if (tcp_time_wait == NULL) {
64 		tcp_time_wait = *((tcp_squeue_priv_t **)
65 		    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
66 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
67 		locked = B_TRUE;
68 	} else {
69 		ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
70 	}
71 
72 	/* 0 means that the tcp_t has not been added to the time wait list. */
73 	if (tcp->tcp_time_wait_expire == 0) {
74 		ASSERT(tcp->tcp_time_wait_next == NULL);
75 		ASSERT(tcp->tcp_time_wait_prev == NULL);
76 		if (locked)
77 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
78 		return (B_FALSE);
79 	}
80 	ASSERT(TCP_IS_DETACHED(tcp));
81 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
82 
83 	if (tcp == tcp_time_wait->tcp_time_wait_head) {
84 		ASSERT(tcp->tcp_time_wait_prev == NULL);
85 		tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
86 		if (tcp_time_wait->tcp_time_wait_head != NULL) {
87 			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
88 			    NULL;
89 		} else {
90 			tcp_time_wait->tcp_time_wait_tail = NULL;
91 		}
92 	} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
93 		ASSERT(tcp->tcp_time_wait_next == NULL);
94 		tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
95 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
96 		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
97 	} else {
98 		ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
99 		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
100 		tcp->tcp_time_wait_prev->tcp_time_wait_next =
101 		    tcp->tcp_time_wait_next;
102 		tcp->tcp_time_wait_next->tcp_time_wait_prev =
103 		    tcp->tcp_time_wait_prev;
104 	}
105 	tcp->tcp_time_wait_next = NULL;
106 	tcp->tcp_time_wait_prev = NULL;
107 	tcp->tcp_time_wait_expire = 0;
108 
109 	if (locked)
110 		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
111 	return (B_TRUE);
112 }
113 
114 /* Constants used for fast checking of a localhost address */
115 #if defined(_BIG_ENDIAN)
116 #define	IPv4_LOCALHOST	0x7f000000U
117 #define	IPv4_LH_MASK	0xffffff00U
118 #else
119 #define	IPv4_LOCALHOST	0x0000007fU
120 #define	IPv4_LH_MASK	0x00ffffffU
121 #endif
122 
123 #define	IS_LOCAL_HOST(x)	( \
124 	((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \
125 	((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \
126 	((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
127 	IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))
128 
129 /*
130  * Add a connection to the list of detached TIME_WAIT connections
131  * and set its time to expire.
132  */
133 void
134 tcp_time_wait_append(tcp_t *tcp)
135 {
136 	tcp_stack_t	*tcps = tcp->tcp_tcps;
137 	squeue_t	*sqp = tcp->tcp_connp->conn_sqp;
138 	tcp_squeue_priv_t *tcp_time_wait =
139 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
140 	hrtime_t firetime = 0;
141 
142 	tcp_timers_stop(tcp);
143 
144 	/* Freed above */
145 	ASSERT(tcp->tcp_timer_tid == 0);
146 	ASSERT(tcp->tcp_ack_tid == 0);
147 
148 	/* must have happened at the time of detaching the tcp */
149 	ASSERT(tcp->tcp_ptpahn == NULL);
150 	ASSERT(tcp->tcp_flow_stopped == 0);
151 	ASSERT(tcp->tcp_time_wait_next == NULL);
152 	ASSERT(tcp->tcp_time_wait_prev == NULL);
153 	ASSERT(tcp->tcp_time_wait_expire == 0);
154 	ASSERT(tcp->tcp_listener == NULL);
155 
156 	tcp->tcp_time_wait_expire = ddi_get_lbolt64();
157 	if (IS_LOCAL_HOST(tcp)) {
158 		/*
159 		 * This is the fastpath for handling localhost connections.
160 		 * Since we don't have to worry about packets on the localhost
161 		 * showing up after a long network delay, we want to expire
162 		 * these quickly so the port range on the localhost doesn't
163 		 * get starved by short-running, local apps.
164 		 *
165 		 * Leave tcp_time_wait_expire at the current time. This
166 		 * essentially means the connection is expired now and it will
167 		 * clean up the next time tcp_time_wait_collector runs.  We set
168 		 * firetime to use a short delay so that if we have to start a
169 		 * tcp_time_wait_collector thread below, it runs soon instead
170 		 * of after a delay of time_wait_interval. firetime being set
171 		 * to a non-0 value is also our indicator that we should add
172 		 * this connection to the head of the time wait list (since we
173 		 * are already expired) so that its sure to get cleaned up on
174 		 * the next run of tcp_time_wait_collector (which expects the
175 		 * entries to appear in time-order and stops when it hits the
176 		 * first non-expired entry).
177 		 */
178 		firetime = TCP_TIME_WAIT_DELAY;
179 	} else {
180 		/*
181 		 * Since tcp_time_wait_expire is lbolt64, it should not wrap
182 		 * around in practice.  Hence it cannot be 0.  Note that zero
183 		 * means that the tcp_t is not in the TIME_WAIT list.
184 		 */
185 		tcp->tcp_time_wait_expire += MSEC_TO_TICK(
186 		    tcps->tcps_time_wait_interval);
187 	}
188 
189 	ASSERT(TCP_IS_DETACHED(tcp));
190 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
191 	ASSERT(tcp->tcp_time_wait_next == NULL);
192 	ASSERT(tcp->tcp_time_wait_prev == NULL);
193 	TCP_DBGSTAT(tcps, tcp_time_wait);
194 
195 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
196 	if (tcp_time_wait->tcp_time_wait_head == NULL) {
197 		ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
198 		tcp_time_wait->tcp_time_wait_head = tcp;
199 
200 		/*
201 		 * Even if the list was empty before, there may be a timer
202 		 * running since a tcp_t can be removed from the list
203 		 * in other places, such as tcp_clean_death().  So check if
204 		 * a timer is needed.
205 		 */
206 		if (tcp_time_wait->tcp_time_wait_tid == 0) {
207 			if (firetime == 0)
208 				firetime = (hrtime_t)
209 				    (tcps->tcps_time_wait_interval + 1) *
210 				    MICROSEC;
211 
212 			tcp_time_wait->tcp_time_wait_tid =
213 			    timeout_generic(CALLOUT_NORMAL,
214 			    tcp_time_wait_collector, sqp, firetime,
215 			    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
216 		}
217 		tcp_time_wait->tcp_time_wait_tail = tcp;
218 	} else {
219 		/*
220 		 * The list is not empty, so a timer must be running.  If not,
221 		 * tcp_time_wait_collector() must be running on this
222 		 * tcp_time_wait list at the same time.
223 		 */
224 		ASSERT(tcp_time_wait->tcp_time_wait_tid != 0 ||
225 		    tcp_time_wait->tcp_time_wait_running);
226 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
227 		ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
228 		    TCPS_TIME_WAIT);
229 
230 		if (firetime == 0) {
231 			/* add at end */
232 			tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next =
233 			    tcp;
234 			tcp->tcp_time_wait_prev =
235 			    tcp_time_wait->tcp_time_wait_tail;
236 			tcp_time_wait->tcp_time_wait_tail = tcp;
237 		} else {
238 			/* add at head */
239 			tcp->tcp_time_wait_next =
240 			    tcp_time_wait->tcp_time_wait_head;
241 			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
242 			    tcp;
243 			tcp_time_wait->tcp_time_wait_head = tcp;
244 		}
245 	}
246 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
247 }
248 
249 /*
250  * Wrapper to call tcp_close_detached() via squeue to clean up TIME-WAIT
251  * tcp_t.  Used in tcp_time_wait_collector().
252  */
253 /* ARGSUSED */
254 static void
255 tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
256 {
257 	conn_t	*connp = (conn_t *)arg;
258 	tcp_t	*tcp = connp->conn_tcp;
259 
260 	ASSERT(tcp != NULL);
261 	if (tcp->tcp_state == TCPS_CLOSED) {
262 		return;
263 	}
264 
265 	ASSERT((connp->conn_family == AF_INET &&
266 	    connp->conn_ipversion == IPV4_VERSION) ||
267 	    (connp->conn_family == AF_INET6 &&
268 	    (connp->conn_ipversion == IPV4_VERSION ||
269 	    connp->conn_ipversion == IPV6_VERSION)));
270 	ASSERT(!tcp->tcp_listener);
271 
272 	ASSERT(TCP_IS_DETACHED(tcp));
273 
274 	/*
275 	 * Because they have no upstream client to rebind or tcp_close()
276 	 * them later, we axe the connection here and now.
277 	 */
278 	tcp_close_detached(tcp);
279 }
280 
281 /*
282  * Blows away all tcps whose TIME_WAIT has expired. List traversal
283  * is done forwards from the head.
284  * This walks all stack instances since
285  * tcp_time_wait remains global across all stacks.
286  */
287 /* ARGSUSED */
288 void
289 tcp_time_wait_collector(void *arg)
290 {
291 	tcp_t *tcp;
292 	int64_t now;
293 	mblk_t *mp;
294 	conn_t *connp;
295 	kmutex_t *lock;
296 	boolean_t removed;
297 	extern void (*cl_inet_disconnect)(netstackid_t, uint8_t, sa_family_t,
298 	    uint8_t *, in_port_t, uint8_t *, in_port_t, void *);
299 
300 	squeue_t *sqp = (squeue_t *)arg;
301 	tcp_squeue_priv_t *tcp_time_wait =
302 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
303 
304 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
305 	tcp_time_wait->tcp_time_wait_tid = 0;
306 #ifdef DEBUG
307 	tcp_time_wait->tcp_time_wait_running = B_TRUE;
308 #endif
309 
310 	if (tcp_time_wait->tcp_free_list != NULL &&
311 	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
312 		TCP_G_STAT(tcp_freelist_cleanup);
313 		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
314 			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
315 			tcp->tcp_time_wait_next = NULL;
316 			tcp_time_wait->tcp_free_list_cnt--;
317 			ASSERT(tcp->tcp_tcps == NULL);
318 			CONN_DEC_REF(tcp->tcp_connp);
319 		}
320 		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
321 	}
322 
323 	/*
324 	 * In order to reap time waits reliably, we should use a
325 	 * source of time that is not adjustable by the user -- hence
326 	 * the call to ddi_get_lbolt64().
327 	 */
328 	now = ddi_get_lbolt64();
329 	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
330 		/*
331 		 * lbolt64 should not wrap around in practice...  So we can
332 		 * do a direct comparison.
333 		 */
334 		if (now < tcp->tcp_time_wait_expire)
335 			break;
336 
337 		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
338 		ASSERT(removed);
339 
340 		connp = tcp->tcp_connp;
341 		ASSERT(connp->conn_fanout != NULL);
342 		lock = &connp->conn_fanout->connf_lock;
343 		/*
344 		 * This is essentially a TW reclaim fast path optimization for
345 		 * performance where the timewait collector checks under the
346 		 * fanout lock (so that no one else can get access to the
347 		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
348 		 * the classifier hash list. If ref count is indeed 2, we can
349 		 * just remove the conn under the fanout lock and avoid
350 		 * cleaning up the conn under the squeue, provided that
351 		 * clustering callbacks are not enabled. If clustering is
352 		 * enabled, we need to make the clustering callback before
353 		 * setting the CONDEMNED flag and after dropping all locks and
354 		 * so we forego this optimization and fall back to the slow
355 		 * path. Also please see the comments in tcp_closei_local
356 		 * regarding the refcnt logic.
357 		 *
358 		 * Since we are holding the tcp_time_wait_lock, its better
359 		 * not to block on the fanout_lock because other connections
360 		 * can't add themselves to time_wait list. So we do a
361 		 * tryenter instead of mutex_enter.
362 		 */
363 		if (mutex_tryenter(lock)) {
364 			mutex_enter(&connp->conn_lock);
365 			if ((connp->conn_ref == 2) &&
366 			    (cl_inet_disconnect == NULL)) {
367 				ipcl_hash_remove_locked(connp,
368 				    connp->conn_fanout);
369 				/*
370 				 * Set the CONDEMNED flag now itself so that
371 				 * the refcnt cannot increase due to any
372 				 * walker.
373 				 */
374 				connp->conn_state_flags |= CONN_CONDEMNED;
375 				mutex_exit(lock);
376 				mutex_exit(&connp->conn_lock);
377 				if (tcp_time_wait->tcp_free_list_cnt <
378 				    tcp_free_list_max_cnt) {
379 					/* Add to head of tcp_free_list */
380 					mutex_exit(
381 					    &tcp_time_wait->tcp_time_wait_lock);
382 					tcp_cleanup(tcp);
383 					ASSERT(connp->conn_latch == NULL);
384 					ASSERT(connp->conn_policy == NULL);
385 					ASSERT(tcp->tcp_tcps == NULL);
386 					ASSERT(connp->conn_netstack == NULL);
387 
388 					mutex_enter(
389 					    &tcp_time_wait->tcp_time_wait_lock);
390 					tcp->tcp_time_wait_next =
391 					    tcp_time_wait->tcp_free_list;
392 					tcp_time_wait->tcp_free_list = tcp;
393 					tcp_time_wait->tcp_free_list_cnt++;
394 					continue;
395 				} else {
396 					/* Do not add to tcp_free_list */
397 					mutex_exit(
398 					    &tcp_time_wait->tcp_time_wait_lock);
399 					tcp_bind_hash_remove(tcp);
400 					ixa_cleanup(tcp->tcp_connp->conn_ixa);
401 					tcp_ipsec_cleanup(tcp);
402 					CONN_DEC_REF(tcp->tcp_connp);
403 				}
404 			} else {
405 				CONN_INC_REF_LOCKED(connp);
406 				mutex_exit(lock);
407 				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
408 				mutex_exit(&connp->conn_lock);
409 				/*
410 				 * We can reuse the closemp here since conn has
411 				 * detached (otherwise we wouldn't even be in
412 				 * time_wait list). tcp_closemp_used can safely
413 				 * be changed without taking a lock as no other
414 				 * thread can concurrently access it at this
415 				 * point in the connection lifecycle.
416 				 */
417 
418 				if (tcp->tcp_closemp.b_prev == NULL)
419 					tcp->tcp_closemp_used = B_TRUE;
420 				else
421 					cmn_err(CE_PANIC,
422 					    "tcp_timewait_collector: "
423 					    "concurrent use of tcp_closemp: "
424 					    "connp %p tcp %p\n", (void *)connp,
425 					    (void *)tcp);
426 
427 				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
428 				mp = &tcp->tcp_closemp;
429 				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
430 				    tcp_timewait_close, connp, NULL,
431 				    SQ_FILL, SQTAG_TCP_TIMEWAIT);
432 			}
433 		} else {
434 			mutex_enter(&connp->conn_lock);
435 			CONN_INC_REF_LOCKED(connp);
436 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
437 			mutex_exit(&connp->conn_lock);
438 			/*
439 			 * We can reuse the closemp here since conn has
440 			 * detached (otherwise we wouldn't even be in
441 			 * time_wait list). tcp_closemp_used can safely
442 			 * be changed without taking a lock as no other
443 			 * thread can concurrently access it at this
444 			 * point in the connection lifecycle.
445 			 */
446 
447 			if (tcp->tcp_closemp.b_prev == NULL)
448 				tcp->tcp_closemp_used = B_TRUE;
449 			else
450 				cmn_err(CE_PANIC, "tcp_timewait_collector: "
451 				    "concurrent use of tcp_closemp: "
452 				    "connp %p tcp %p\n", (void *)connp,
453 				    (void *)tcp);
454 
455 			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
456 			mp = &tcp->tcp_closemp;
457 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
458 			    tcp_timewait_close, connp, NULL,
459 			    SQ_FILL, SQTAG_TCP_TIMEWAIT);
460 		}
461 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
462 	}
463 
464 	if (tcp_time_wait->tcp_free_list != NULL)
465 		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
466 
467 	/*
468 	 * If the time wait list is not empty and there is no timer running,
469 	 * restart it.
470 	 */
471 	if ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL &&
472 	    tcp_time_wait->tcp_time_wait_tid == 0) {
473 		hrtime_t firetime;
474 
475 		/* shouldn't be necessary, but just in case */
476 		if (tcp->tcp_time_wait_expire < now)
477 			tcp->tcp_time_wait_expire = now;
478 
479 		firetime = TICK_TO_NSEC(tcp->tcp_time_wait_expire - now);
480 		/* This ensures that we won't wake up too often. */
481 		firetime = MAX(TCP_TIME_WAIT_DELAY, firetime);
482 		tcp_time_wait->tcp_time_wait_tid =
483 		    timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector,
484 		    sqp, firetime, CALLOUT_TCP_RESOLUTION,
485 		    CALLOUT_FLAG_ROUNDUP);
486 	}
487 #ifdef DEBUG
488 	tcp_time_wait->tcp_time_wait_running = B_FALSE;
489 #endif
490 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
491 }
492 
493 /*
494  * tcp_time_wait_processing() handles processing of incoming packets when
495  * the tcp_t is in the TIME_WAIT state.
496  *
497  * A TIME_WAIT tcp_t that has an associated open TCP end point (not in
498  * detached state) is never put on the time wait list.
499  */
500 void
501 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
502     uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira)
503 {
504 	int32_t		bytes_acked;
505 	int32_t		gap;
506 	int32_t		rgap;
507 	tcp_opt_t	tcpopt;
508 	uint_t		flags;
509 	uint32_t	new_swnd = 0;
510 	conn_t		*nconnp;
511 	conn_t		*connp = tcp->tcp_connp;
512 	tcp_stack_t	*tcps = tcp->tcp_tcps;
513 
514 	BUMP_LOCAL(tcp->tcp_ibsegs);
515 	DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
516 
517 	flags = (unsigned int)tcpha->tha_flags & 0xFF;
518 	new_swnd = ntohs(tcpha->tha_win) <<
519 	    ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
520 	if (tcp->tcp_snd_ts_ok) {
521 		if (!tcp_paws_check(tcp, tcpha, &tcpopt)) {
522 			tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
523 			    tcp->tcp_rnxt, TH_ACK);
524 			goto done;
525 		}
526 	}
527 	gap = seg_seq - tcp->tcp_rnxt;
528 	rgap = tcp->tcp_rwnd - (gap + seg_len);
529 	if (gap < 0) {
530 		TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
531 		TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
532 		    (seg_len > -gap ? -gap : seg_len));
533 		seg_len += gap;
534 		if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
535 			if (flags & TH_RST) {
536 				goto done;
537 			}
538 			if ((flags & TH_FIN) && seg_len == -1) {
539 				/*
540 				 * When TCP receives a duplicate FIN in
541 				 * TIME_WAIT state, restart the 2 MSL timer.
542 				 * See page 73 in RFC 793. Make sure this TCP
543 				 * is already on the TIME_WAIT list. If not,
544 				 * just restart the timer.
545 				 */
546 				if (TCP_IS_DETACHED(tcp)) {
547 					if (tcp_time_wait_remove(tcp, NULL) ==
548 					    B_TRUE) {
549 						tcp_time_wait_append(tcp);
550 						TCP_DBGSTAT(tcps,
551 						    tcp_rput_time_wait);
552 					}
553 				} else {
554 					ASSERT(tcp != NULL);
555 					TCP_TIMER_RESTART(tcp,
556 					    tcps->tcps_time_wait_interval);
557 				}
558 				tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
559 				    tcp->tcp_rnxt, TH_ACK);
560 				goto done;
561 			}
562 			flags |=  TH_ACK_NEEDED;
563 			seg_len = 0;
564 			goto process_ack;
565 		}
566 
567 		/* Fix seg_seq, and chew the gap off the front. */
568 		seg_seq = tcp->tcp_rnxt;
569 	}
570 
571 	if ((flags & TH_SYN) && gap > 0 && rgap < 0) {
572 		/*
573 		 * Make sure that when we accept the connection, pick
574 		 * an ISS greater than (tcp_snxt + tcp_iss_incr/2) for the
575 		 * old connection.
576 		 *
577 		 * The next ISS generated is equal to tcp_iss_incr_extra
578 		 * + tcp_iss_incr/2 + other components depending on the
579 		 * value of tcp_strong_iss.  We pre-calculate the new
580 		 * ISS here and compare with tcp_snxt to determine if
581 		 * we need to make adjustment to tcp_iss_incr_extra.
582 		 *
583 		 * The above calculation is ugly and is a
584 		 * waste of CPU cycles...
585 		 */
586 		uint32_t new_iss = tcps->tcps_iss_incr_extra;
587 		int32_t adj;
588 		ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
589 
590 		switch (tcps->tcps_strong_iss) {
591 		case 2: {
592 			/* Add time and MD5 components. */
593 			uint32_t answer[4];
594 			struct {
595 				uint32_t ports;
596 				in6_addr_t src;
597 				in6_addr_t dst;
598 			} arg;
599 			MD5_CTX context;
600 
601 			mutex_enter(&tcps->tcps_iss_key_lock);
602 			context = tcps->tcps_iss_key;
603 			mutex_exit(&tcps->tcps_iss_key_lock);
604 			arg.ports = connp->conn_ports;
605 			/* We use MAPPED addresses in tcp_iss_init */
606 			arg.src = connp->conn_laddr_v6;
607 			arg.dst = connp->conn_faddr_v6;
608 			MD5Update(&context, (uchar_t *)&arg,
609 			    sizeof (arg));
610 			MD5Final((uchar_t *)answer, &context);
611 			answer[0] ^= answer[1] ^ answer[2] ^ answer[3];
612 			new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0];
613 			break;
614 		}
615 		case 1:
616 			/* Add time component and min random (i.e. 1). */
617 			new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1;
618 			break;
619 		default:
620 			/* Add only time component. */
621 			new_iss += (uint32_t)gethrestime_sec() *
622 			    tcps->tcps_iss_incr;
623 			break;
624 		}
625 		if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) {
626 			/*
627 			 * New ISS not guaranteed to be tcp_iss_incr/2
628 			 * ahead of the current tcp_snxt, so add the
629 			 * difference to tcp_iss_incr_extra.
630 			 */
631 			tcps->tcps_iss_incr_extra += adj;
632 		}
633 		/*
634 		 * If tcp_clean_death() can not perform the task now,
635 		 * drop the SYN packet and let the other side re-xmit.
636 		 * Otherwise pass the SYN packet back in, since the
637 		 * old tcp state has been cleaned up or freed.
638 		 */
639 		if (tcp_clean_death(tcp, 0) == -1)
640 			goto done;
641 		nconnp = ipcl_classify(mp, ira, ipst);
642 		if (nconnp != NULL) {
643 			TCP_STAT(tcps, tcp_time_wait_syn_success);
644 			/* Drops ref on nconnp */
645 			tcp_reinput(nconnp, mp, ira, ipst);
646 			return;
647 		}
648 		goto done;
649 	}
650 
651 	/*
652 	 * rgap is the amount of stuff received out of window.  A negative
653 	 * value is the amount out of window.
654 	 */
655 	if (rgap < 0) {
656 		TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
657 		TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
658 		/* Fix seg_len and make sure there is something left. */
659 		seg_len += rgap;
660 		if (seg_len <= 0) {
661 			if (flags & TH_RST) {
662 				goto done;
663 			}
664 			flags |=  TH_ACK_NEEDED;
665 			seg_len = 0;
666 			goto process_ack;
667 		}
668 	}
669 	/*
670 	 * Check whether we can update tcp_ts_recent.  This test is
671 	 * NOT the one in RFC 1323 3.4.  It is from Braden, 1993, "TCP
672 	 * Extensions for High Performance: An Update", Internet Draft.
673 	 */
674 	if (tcp->tcp_snd_ts_ok &&
675 	    TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
676 	    SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
677 		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
678 		tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
679 	}
680 
681 	if (seg_seq != tcp->tcp_rnxt && seg_len > 0) {
682 		/* Always ack out of order packets */
683 		flags |= TH_ACK_NEEDED;
684 		seg_len = 0;
685 	} else if (seg_len > 0) {
686 		TCPS_BUMP_MIB(tcps, tcpInClosed);
687 		TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
688 		TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
689 	}
690 	if (flags & TH_RST) {
691 		(void) tcp_clean_death(tcp, 0);
692 		goto done;
693 	}
694 	if (flags & TH_SYN) {
695 		tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
696 		    TH_RST|TH_ACK);
697 		/*
698 		 * Do not delete the TCP structure if it is in
699 		 * TIME_WAIT state.  Refer to RFC 1122, 4.2.2.13.
700 		 */
701 		goto done;
702 	}
703 process_ack:
704 	if (flags & TH_ACK) {
705 		bytes_acked = (int)(seg_ack - tcp->tcp_suna);
706 		if (bytes_acked <= 0) {
707 			if (bytes_acked == 0 && seg_len == 0 &&
708 			    new_swnd == tcp->tcp_swnd)
709 				TCPS_BUMP_MIB(tcps, tcpInDupAck);
710 		} else {
711 			/* Acks something not sent */
712 			flags |= TH_ACK_NEEDED;
713 		}
714 	}
715 	if (flags & TH_ACK_NEEDED) {
716 		/*
717 		 * Time to send an ack for some reason.
718 		 */
719 		tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
720 		    tcp->tcp_rnxt, TH_ACK);
721 	}
722 done:
723 	freemsg(mp);
724 }
725