xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_time_wait.c (revision a2f04351e04971ab0879872d264d6038c156b860)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2016 Joyent, Inc.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  */
27 
28 /*
29  * This file contains functions related to TCP time wait processing.  Also
30  * refer to the time wait handling comments in tcp_impl.h.
31  */
32 
33 #include <sys/types.h>
34 #include <sys/strsun.h>
35 #include <sys/squeue_impl.h>
36 #include <sys/squeue.h>
37 #include <sys/callo.h>
38 
39 #include <inet/common.h>
40 #include <inet/ip.h>
41 #include <inet/tcp.h>
42 #include <inet/tcp_impl.h>
43 #include <inet/tcp_cluster.h>
44 
45 static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *);
46 
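/*
 * Map an expiration time, expressed in lbolt ticks, to its slot on the
 * TIME_WAIT timing wheel.  Each bucket covers TCP_TIME_WAIT_DELAY
 * milliseconds and the wheel wraps after TCP_TIME_WAIT_BUCKETS slots.
 * TW_BUCKET_NEXT steps to the following slot, wrapping around the wheel.
 */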
47 #define	TW_BUCKET(t)					\
48 	(((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS)
49 
50 #define	TW_BUCKET_NEXT(b)	(((b) + 1) % TCP_TIME_WAIT_BUCKETS)
51 
52 
53 /*
54  * Remove a connection from the list of detached TIME_WAIT connections.
55  * It returns B_FALSE if the connection has already been removed from the
56  * list (by an earlier call to tcp_time_wait_remove()) and therefore cannot
57  * be removed again; otherwise it returns B_TRUE.
58  */
59 boolean_t
60 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp)
61 {
62 	boolean_t	locked = B_FALSE;
63 
64 	if (tsp == NULL) {
65 		tsp = *((tcp_squeue_priv_t **)
66 		    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
67 		mutex_enter(&tsp->tcp_time_wait_lock);
68 		locked = B_TRUE;
69 	} else {
70 		ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
71 	}
72 
73 	/* 0 means that the tcp_t has not been added to the time wait list. */
74 	if (tcp->tcp_time_wait_expire == 0) {
75 		ASSERT(tcp->tcp_time_wait_next == NULL);
76 		ASSERT(tcp->tcp_time_wait_prev == NULL);
77 		if (locked)
78 			mutex_exit(&tsp->tcp_time_wait_lock);
79 		return (B_FALSE);
80 	}
81 	ASSERT(TCP_IS_DETACHED(tcp));
82 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
83 	ASSERT(tsp->tcp_time_wait_cnt > 0);
84 
85 	if (tcp->tcp_time_wait_next != NULL) {
86 		tcp->tcp_time_wait_next->tcp_time_wait_prev =
87 		    tcp->tcp_time_wait_prev;
88 	}
89 	if (tcp->tcp_time_wait_prev != NULL) {
90 		tcp->tcp_time_wait_prev->tcp_time_wait_next =
91 		    tcp->tcp_time_wait_next;
92 	} else {
93 		unsigned int bucket;
94 
95 		bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
96 		ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp);
97 		tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next;
98 	}
99 	tcp->tcp_time_wait_next = NULL;
100 	tcp->tcp_time_wait_prev = NULL;
101 	tcp->tcp_time_wait_expire = 0;
102 	tsp->tcp_time_wait_cnt--;
103 
104 	if (locked)
105 		mutex_exit(&tsp->tcp_time_wait_lock);
106 	return (B_TRUE);
107 }
108 
109 /* Constants used for fast checking of a localhost address */
110 #if defined(_BIG_ENDIAN)
111 #define	IPv4_LOCALHOST	0x7f000000U
112 #define	IPv4_LH_MASK	0xffffff00U
113 #else
114 #define	IPv4_LOCALHOST	0x0000007fU
115 #define	IPv4_LH_MASK	0x00ffffffU
116 #endif
117 
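/*
 * IS_LOCAL_HOST(x) evaluates true when the connection's local address is a
 * loopback address: anything in 127.0.0.0/8 for IPv4, or ::1 for IPv6.
 */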
118 #define	IS_LOCAL_HOST(x)	( \
119 	((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \
120 	((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \
121 	((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
122 	IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))
123 
124 
125 /*
126  * Add a connection to the list of detached TIME_WAIT connections
127  * and set its time to expire.
128  */
129 void
130 tcp_time_wait_append(tcp_t *tcp)
131 {
132 	tcp_stack_t	*tcps = tcp->tcp_tcps;
133 	squeue_t	*sqp = tcp->tcp_connp->conn_sqp;
134 	tcp_squeue_priv_t *tsp =
135 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
136 	int64_t		now, schedule;
137 	unsigned int	bucket;
138 
139 	tcp_timers_stop(tcp);
140 
141 	/* Freed above */
142 	ASSERT(tcp->tcp_timer_tid == 0);
143 	ASSERT(tcp->tcp_ack_tid == 0);
144 
145 	/* must have happened at the time of detaching the tcp */
146 	ASSERT(TCP_IS_DETACHED(tcp));
147 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
148 	ASSERT(tcp->tcp_ptpahn == NULL);
149 	ASSERT(tcp->tcp_flow_stopped == 0);
150 	ASSERT(tcp->tcp_time_wait_next == NULL);
151 	ASSERT(tcp->tcp_time_wait_prev == NULL);
152 	ASSERT(tcp->tcp_time_wait_expire == 0);
153 	ASSERT(tcp->tcp_listener == NULL);
154 
155 	TCP_DBGSTAT(tcps, tcp_time_wait);
156 	mutex_enter(&tsp->tcp_time_wait_lock);
157 
158 	/*
159 	 * Immediately expire loopback connections.  Since there is no worry
160 	 * about packets on the local host showing up after a long network
161 	 * delay, this is safe and allows much higher rates of connection churn
162 	 * for applications operating locally.
163 	 *
164 	 * This typically bypasses the tcp_free_list fast path due to squeue
165 	 * re-entry for the loopback close operation.
166 	 */
167 	if (tcp->tcp_loopback) {
168 		tcp_time_wait_purge(tcp, tsp);
169 		mutex_exit(&tsp->tcp_time_wait_lock);
170 		return;
171 	}
172 
173 	/*
174 	 * In order to reap TIME_WAITs reliably, we should use a source of time
175 	 * that is not adjustable by the user.  While it would be more accurate
176 	 * to grab this timestamp before (potentially) sleeping on the
177 	 * tcp_time_wait_lock, doing so complicates bucket addressing later.
178 	 */
179 	now = ddi_get_lbolt64();
180 
181 	/*
182 	 * Each squeue uses an arbitrary time offset when scheduling
183 	 * expiration timers.  This prevents the bucketing from forcing
184 	 * tcp_time_wait_collector to run in lockstep across squeues.
185 	 *
186 	 * This offset is (re)initialized when a new TIME_WAIT connection is
187 	 * added to an squeue which has no connections waiting to expire.
188 	 */
189 	if (tsp->tcp_time_wait_tid == 0) {
190 		ASSERT(tsp->tcp_time_wait_cnt == 0);
191 		tsp->tcp_time_wait_offset =
192 		    now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
193 	}
194 	now -= tsp->tcp_time_wait_offset;
195 
196 	/*
197 	 * Use the netstack-defined timeout, rounded up to the minimum
198 	 * time_wait_collector interval.
199 	 */
200 	schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval);
201 	tcp->tcp_time_wait_expire = schedule;
202 
203 	/*
204 	 * Append the connection into the appropriate bucket.
205 	 */
206 	bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
207 	tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket];
208 	tsp->tcp_time_wait_bucket[bucket] = tcp;
209 	if (tcp->tcp_time_wait_next != NULL) {
210 		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL);
211 		tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp;
212 	}
213 	tsp->tcp_time_wait_cnt++;
214 
215 	/*
216 	 * Round delay up to the nearest bucket boundary.
217 	 */
218 	schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
219 	schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
220 
221 	/*
222 	 * The newly inserted entry may require a tighter schedule for the
223 	 * expiration timer.
224 	 */
225 	if (schedule < tsp->tcp_time_wait_schedule) {
226 		callout_id_t old_tid = tsp->tcp_time_wait_tid;
227 
228 		tsp->tcp_time_wait_schedule = schedule;
229 		tsp->tcp_time_wait_tid =
230 		    timeout_generic(CALLOUT_NORMAL,
231 		    tcp_time_wait_collector, sqp,
232 		    TICK_TO_NSEC(schedule - now),
233 		    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
234 
235 		/*
236 		 * It is possible for the timer to fire before the untimeout
237 		 * action is able to complete.  In that case, the exclusion
238 		 * offered by the tcp_time_wait_collector_active flag will
239 		 * prevent multiple collector threads from processing records
240 		 * simultaneously from the same squeue.
241 		 */
242 		mutex_exit(&tsp->tcp_time_wait_lock);
243 		(void) untimeout_default(old_tid, 0);
244 		return;
245 	}
246 
247 	/*
248 	 * Start a fresh timer if none exists.
249 	 */
250 	if (tsp->tcp_time_wait_schedule == 0) {
251 		ASSERT(tsp->tcp_time_wait_tid == 0);
252 
253 		tsp->tcp_time_wait_schedule = schedule;
254 		tsp->tcp_time_wait_tid =
255 		    timeout_generic(CALLOUT_NORMAL,
256 		    tcp_time_wait_collector, sqp,
257 		    TICK_TO_NSEC(schedule - now),
258 		    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
259 	}
260 	mutex_exit(&tsp->tcp_time_wait_lock);
261 }
262 
263 /*
264  * Wrapper to call tcp_close_detached() via squeue to clean up TIME-WAIT
265  * tcp_t.  Used in tcp_time_wait_collector().
266  */
267 /* ARGSUSED */
268 static void
269 tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
270 {
271 	conn_t	*connp = (conn_t *)arg;
272 	tcp_t	*tcp = connp->conn_tcp;
273 
274 	ASSERT(tcp != NULL);
275 	if (tcp->tcp_state == TCPS_CLOSED) {
276 		return;
277 	}
278 
279 	ASSERT((connp->conn_family == AF_INET &&
280 	    connp->conn_ipversion == IPV4_VERSION) ||
281 	    (connp->conn_family == AF_INET6 &&
282 	    (connp->conn_ipversion == IPV4_VERSION ||
283 	    connp->conn_ipversion == IPV6_VERSION)));
284 	ASSERT(!tcp->tcp_listener);
285 
286 	ASSERT(TCP_IS_DETACHED(tcp));
287 
288 	/*
289 	 * Because they have no upstream client to rebind or tcp_close()
290 	 * them later, we axe the connection here and now.
291 	 */
292 	tcp_close_detached(tcp);
293 }
294 
295 
296 static void
297 tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp)
298 {
299 	mblk_t *mp;
300 	conn_t *connp = tcp->tcp_connp;
301 	kmutex_t *lock;
302 
303 	ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
304 	ASSERT(connp->conn_fanout != NULL);
305 
306 	lock = &connp->conn_fanout->connf_lock;
307 
308 	/*
309 	 * This is essentially a TIME_WAIT reclaim fast path optimization for
310 	 * performance: the connection is checked under the fanout lock (so
311 	 * that no one else can get access to the conn_t) to verify that the
312 	 * refcnt is 2 (one each for TCP and the classifier hash list).  If so,
313 	 * and if clustering callbacks are not enabled, the conn can be removed
314 	 * under the fanout lock, avoiding clean-up under the squeue.
315 	 *
316 	 * This optimization is forgone when clustering is enabled since the
317 	 * clustering callback must be made before setting the CONDEMNED flag
318 	 * and after dropping all locks.
319 	 *
320 	 * See the comments in tcp_closei_local for additional information
321 	 * regarding the refcnt logic.
322 	 */
323 	if (mutex_tryenter(lock)) {
324 		mutex_enter(&connp->conn_lock);
325 		if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) {
326 			ipcl_hash_remove_locked(connp, connp->conn_fanout);
327 			/*
328 			 * Set the CONDEMNED flag now itself so that the refcnt
329 			 * cannot increase due to any walker.
330 			 */
331 			connp->conn_state_flags |= CONN_CONDEMNED;
332 			mutex_exit(&connp->conn_lock);
333 			mutex_exit(lock);
334 			if (tsp->tcp_free_list_cnt < tcp_free_list_max_cnt) {
335 				/*
336 				 * Add to head of tcp_free_list
337 				 */
338 				tcp_cleanup(tcp);
339 				ASSERT(connp->conn_latch == NULL);
340 				ASSERT(connp->conn_policy == NULL);
341 				ASSERT(tcp->tcp_tcps == NULL);
342 				ASSERT(connp->conn_netstack == NULL);
343 
344 				tcp->tcp_time_wait_next = tsp->tcp_free_list;
345 				tcp->tcp_in_free_list = B_TRUE;
346 				tsp->tcp_free_list = tcp;
347 				tsp->tcp_free_list_cnt++;
348 			} else {
349 				/*
350 				 * Do not add to tcp_free_list
351 				 */
352 				tcp_bind_hash_remove(tcp);
353 				ixa_cleanup(tcp->tcp_connp->conn_ixa);
354 				tcp_ipsec_cleanup(tcp);
355 				CONN_DEC_REF(tcp->tcp_connp);
356 			}
357 
358 			/*
359 			 * With the fast-path complete, we can bail.
360 			 */
361 			return;
362 		} else {
363 			/*
364 			 * Fall back to slow path.
365 			 */
366 			CONN_INC_REF_LOCKED(connp);
367 			mutex_exit(&connp->conn_lock);
368 			mutex_exit(lock);
369 		}
370 	} else {
371 		CONN_INC_REF(connp);
372 	}
373 
374 	/*
375 	 * We can reuse the closemp here since conn has detached (otherwise we
376 	 * wouldn't even be in time_wait list). It is safe to change
377 	 * tcp_closemp_used without taking a lock as no other thread can
378 	 * concurrently access it at this point in the connection lifecycle.
379 	 */
380 	if (tcp->tcp_closemp.b_prev == NULL) {
381 		tcp->tcp_closemp_used = B_TRUE;
382 	} else {
383 		cmn_err(CE_PANIC,
384 		    "tcp_timewait_collector: concurrent use of tcp_closemp: "
385 		    "connp %p tcp %p\n", (void *)connp, (void *)tcp);
386 	}
387 
388 	TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
389 	mp = &tcp->tcp_closemp;
390 	mutex_exit(&tsp->tcp_time_wait_lock);
391 	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL,
392 	    SQ_FILL, SQTAG_TCP_TIMEWAIT);
393 	mutex_enter(&tsp->tcp_time_wait_lock);
394 }
395 
396 /*
397  * Purge any tcp_t instances associated with this squeue which have expired
398  * from the TIME_WAIT state.
399  */
400 void
401 tcp_time_wait_collector(void *arg)
402 {
403 	tcp_t *tcp;
404 	int64_t now, sched_active, sched_cur, sched_new;
405 	unsigned int idx;
406 
407 	squeue_t *sqp = (squeue_t *)arg;
408 	tcp_squeue_priv_t *tsp =
409 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
410 
411 	mutex_enter(&tsp->tcp_time_wait_lock);
412 
413 	/*
414 	 * Because of timer scheduling complexity and the fact that the
415 	 * tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is
416 	 * possible for multiple tcp_time_wait_collector threads to run against
417 	 * the same squeue.  This flag is used to exclude other collectors from
418 	 * the squeue during execution.
419 	 */
420 	if (tsp->tcp_time_wait_collector_active) {
421 		mutex_exit(&tsp->tcp_time_wait_lock);
422 		return;
423 	}
424 	tsp->tcp_time_wait_collector_active = B_TRUE;
425 
426 	/*
427 	 * After its assignment here, the value of sched_active must not be
428 	 * altered as it is used to validate the state of the
429 	 * tcp_time_wait_collector callout schedule for this squeue.
430 	 *
431 	 * The same does not hold true of sched_cur, which holds the timestamp
432 	 * of the bucket undergoing processing.  While it is initially equal to
433 	 * sched_active, certain conditions below can walk it forward,
434 	 * triggering the retry loop.
435 	 */
436 	sched_cur = sched_active = tsp->tcp_time_wait_schedule;
437 
438 	/*
439 	 * Purge the free list if necessary
440 	 */
441 	if (tsp->tcp_free_list != NULL) {
442 		TCP_G_STAT(tcp_freelist_cleanup);
443 		while ((tcp = tsp->tcp_free_list) != NULL) {
444 			tsp->tcp_free_list = tcp->tcp_time_wait_next;
445 			tcp->tcp_time_wait_next = NULL;
446 			tsp->tcp_free_list_cnt--;
447 			ASSERT(tcp->tcp_tcps == NULL);
448 			CONN_DEC_REF(tcp->tcp_connp);
449 		}
450 		ASSERT(tsp->tcp_free_list_cnt == 0);
451 	}
452 
453 	/*
454 	 * If there are no connections pending, clear timer-related state to be
455 	 * reinitialized by the next caller.
456 	 */
457 	if (tsp->tcp_time_wait_cnt == 0) {
458 		tsp->tcp_time_wait_offset = 0;
459 		tsp->tcp_time_wait_schedule = 0;
460 		tsp->tcp_time_wait_tid = 0;
461 		tsp->tcp_time_wait_collector_active = B_FALSE;
462 		mutex_exit(&tsp->tcp_time_wait_lock);
463 		return;
464 	}
465 
466 retry:
467 	/*
468 	 * Grab the bucket which we were scheduled to cleanse.
469 	 */
470 	idx = TW_BUCKET(sched_cur - 1);
471 	now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
472 	tcp = tsp->tcp_time_wait_bucket[idx];
473 
474 	while (tcp != NULL) {
475 		/*
476 		 * Since the bucket count is sized to prevent wrap-around
477 		 * during typical operation and timers are scheduled to process
478 		 * buckets with only expired connections, there is only one
479 		 * reason to encounter a connection expiring in the future:
480 		 * The tcp_time_wait_collector thread has been so delayed in
481 		 * its processing that connections have wrapped around the
482 		 * timing wheel into this bucket.
483 		 *
484 		 * In that case, the remaining entries in the bucket can be
485 		 * ignored since, being appended sequentially, they should all
486 		 * expire in the future.
487 		 */
488 		if (now < tcp->tcp_time_wait_expire) {
489 			break;
490 		}
491 
492 		/*
493 		 * Pull the connection out of the bucket.
494 		 */
495 		VERIFY(tcp_time_wait_remove(tcp, tsp));
496 
497 		/*
498 		 * Purge the connection.
499 		 *
500 		 * While tcp_time_wait_lock will be temporarily dropped as part
501 		 * of the process, there is no risk of the timer being
502 		 * (re)scheduled while the collector is running since a value
503 		 * corresponding to the past is left in tcp_time_wait_schedule.
504 		 */
505 		tcp_time_wait_purge(tcp, tsp);
506 
507 		/*
508 		 * Because tcp_time_wait_remove clears the tcp_time_wait_next
509 		 * field, the next item must be grabbed directly from the
510 		 * bucket itself.
511 		 */
512 		tcp = tsp->tcp_time_wait_bucket[idx];
513 	}
514 
515 	if (tsp->tcp_time_wait_cnt == 0) {
516 		/*
517 		 * There is no need for the collector to schedule a new
518 		 * timer if no pending items remain.  The timer state can be
519 		 * cleared only if it was untouched while the collector dropped
520 		 * its locks during tcp_time_wait_purge.
521 		 */
522 		if (tsp->tcp_time_wait_schedule == sched_active) {
523 			tsp->tcp_time_wait_offset = 0;
524 			tsp->tcp_time_wait_schedule = 0;
525 			tsp->tcp_time_wait_tid = 0;
526 		}
527 		tsp->tcp_time_wait_collector_active = B_FALSE;
528 		mutex_exit(&tsp->tcp_time_wait_lock);
529 		return;
530 	} else {
531 		unsigned int nidx;
532 
533 		/*
534 		 * Locate the next bucket containing entries.
535 		 */
536 		sched_new = sched_cur + MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
537 		nidx = TW_BUCKET_NEXT(idx);
538 		while (tsp->tcp_time_wait_bucket[nidx] == NULL) {
539 			if (nidx == idx) {
540 				break;
541 			}
542 			nidx = TW_BUCKET_NEXT(nidx);
543 			sched_new += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
544 		}
545 		ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL);
546 	}
547 
548 	/*
549 	 * It is possible that the system is under such dire load that between
550 	 * the timer scheduling and TIME_WAIT processing delay, execution
551 	 * overran the interval allocated to this bucket.
552 	 */
553 	now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
554 	if (sched_new <= now) {
555 		/*
556 		 * Attempt to right the situation by immediately performing a
557 		 * purge on the next bucket.  This loop will continue as needed
558 		 * until the schedule can be pushed out ahead of the clock.
559 		 */
560 		sched_cur = sched_new;
561 		DTRACE_PROBE3(tcp__time__wait__overrun,
562 		    tcp_squeue_priv_t *, tsp, int64_t, sched_new, int64_t, now);
563 		goto retry;
564 	}
565 
566 	/*
567 	 * Another thread may have snuck in to reschedule the timer while locks
568 	 * were dropped during tcp_time_wait_purge.  Defer to the running timer
569 	 * if that is the case.
570 	 */
571 	if (tsp->tcp_time_wait_schedule != sched_active) {
572 		tsp->tcp_time_wait_collector_active = B_FALSE;
573 		mutex_exit(&tsp->tcp_time_wait_lock);
574 		return;
575 	}
576 
577 	/*
578 	 * Schedule the next timer.
579 	 */
580 	tsp->tcp_time_wait_schedule = sched_new;
581 	tsp->tcp_time_wait_tid =
582 	    timeout_generic(CALLOUT_NORMAL,
583 	    tcp_time_wait_collector, sqp,
584 	    TICK_TO_NSEC(sched_new - now),
585 	    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
586 	tsp->tcp_time_wait_collector_active = B_FALSE;
587 	mutex_exit(&tsp->tcp_time_wait_lock);
588 }
589 
590 /*
591  * tcp_time_wait_processing() handles processing of incoming packets when
592  * the tcp_t is in the TIME_WAIT state.
593  *
594  * A TIME_WAIT tcp_t that has an associated open TCP end point (not in
595  * detached state) is never put on the time wait list.
596  */
597 void
598 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
599     uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira)
600 {
601 	int32_t		bytes_acked;
602 	int32_t		gap;
603 	int32_t		rgap;
604 	tcp_opt_t	tcpopt;
605 	uint_t		flags;
606 	uint32_t	new_swnd = 0;
607 	conn_t		*nconnp;
608 	conn_t		*connp = tcp->tcp_connp;
609 	tcp_stack_t	*tcps = tcp->tcp_tcps;
610 
611 	TCPS_BUMP_MIB(tcps, tcpHCInSegs);
612 	DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
613 
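	/*
	 * Extract the TCP flags and the peer's advertised window.  The send
	 * window scale is not applied to a segment carrying SYN, since window
	 * scaling only takes effect after the handshake (RFC 7323).
	 */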
614 	flags = (unsigned int)tcpha->tha_flags & 0xFF;
615 	new_swnd = ntohs(tcpha->tha_win) <<
616 	    ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
617 
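	/*
	 * A zero- or one-byte segment whose sequence number is one below
	 * rcv_nxt is treated as a keepalive probe; such segments are exempted
	 * from the timestamp/PAWS check that follows.
	 */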
618 	boolean_t keepalive = (seg_len == 0 || seg_len == 1) &&
619 	    (seg_seq + 1 == tcp->tcp_rnxt);
620 	if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) && !keepalive) {
621 		int options;
622 		if (tcp->tcp_snd_sack_ok)
623 			tcpopt.tcp = tcp;
624 		else
625 			tcpopt.tcp = NULL;
626 		options = tcp_parse_options(tcpha, &tcpopt);
627 		if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
628 			DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
629 			goto done;
630 		} else if (!tcp_paws_check(tcp, &tcpopt)) {
631 			tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt,
632 			    TH_ACK);
633 			goto done;
634 		}
635 	}
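	/*
	 * gap is how far the segment starts beyond (or, if negative, before)
	 * the next expected sequence number; rgap is the receive window space
	 * left over once the segment has been accounted for.
	 */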
636 	gap = seg_seq - tcp->tcp_rnxt;
637 	rgap = tcp->tcp_rwnd - (gap + seg_len);
638 	if (gap < 0) {
639 		TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
640 		TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
641 		    (seg_len > -gap ? -gap : seg_len));
642 		seg_len += gap;
643 		if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
644 			if (flags & TH_RST) {
645 				goto done;
646 			}
647 			if ((flags & TH_FIN) && seg_len == -1) {
648 				/*
649 				 * When TCP receives a duplicate FIN in
650 				 * TIME_WAIT state, restart the 2 MSL timer.
651 				 * See page 73 in RFC 793. Make sure this TCP
652 				 * is already on the TIME_WAIT list. If not,
653 				 * just restart the timer.
654 				 */
655 				if (TCP_IS_DETACHED(tcp)) {
656 					if (tcp_time_wait_remove(tcp, NULL) ==
657 					    B_TRUE) {
658 						tcp_time_wait_append(tcp);
659 						TCP_DBGSTAT(tcps,
660 						    tcp_rput_time_wait);
661 					}
662 				} else {
663 					ASSERT(tcp != NULL);
664 					TCP_TIMER_RESTART(tcp,
665 					    tcps->tcps_time_wait_interval);
666 				}
667 				tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
668 				    tcp->tcp_rnxt, TH_ACK);
669 				goto done;
670 			}
671 			flags |=  TH_ACK_NEEDED;
672 			seg_len = 0;
673 			goto process_ack;
674 		}
675 
676 		/* Fix seg_seq, and chew the gap off the front. */
677 		seg_seq = tcp->tcp_rnxt;
678 	}
679 
680 	if ((flags & TH_SYN) && gap > 0 && rgap < 0) {
681 		/*
682 		 * Make sure that when we accept the connection, we pick
683 		 * an ISS greater than (tcp_snxt + tcp_iss_incr/2) for the
684 		 * old connection.
685 		 *
686 		 * The next ISS generated is equal to tcp_iss_incr_extra
687 		 * + tcp_iss_incr/2 + other components depending on the
688 		 * value of tcp_strong_iss.  We pre-calculate the new
689 		 * ISS here and compare with tcp_snxt to determine if
690 		 * we need to make adjustment to tcp_iss_incr_extra.
691 		 *
692 		 * The above calculation is ugly and is a
693 		 * waste of CPU cycles...
694 		 */
695 		uint32_t new_iss = tcps->tcps_iss_incr_extra;
696 		int32_t adj;
697 		ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
698 
699 		switch (tcps->tcps_strong_iss) {
700 		case 2: {
701 			/* Add time and MD5 components. */
702 			uint32_t answer[4];
703 			struct {
704 				uint32_t ports;
705 				in6_addr_t src;
706 				in6_addr_t dst;
707 			} arg;
708 			MD5_CTX context;
709 
710 			mutex_enter(&tcps->tcps_iss_key_lock);
711 			context = tcps->tcps_iss_key;
712 			mutex_exit(&tcps->tcps_iss_key_lock);
713 			arg.ports = connp->conn_ports;
714 			/* We use MAPPED addresses in tcp_iss_init */
715 			arg.src = connp->conn_laddr_v6;
716 			arg.dst = connp->conn_faddr_v6;
717 			MD5Update(&context, (uchar_t *)&arg,
718 			    sizeof (arg));
719 			MD5Final((uchar_t *)answer, &context);
720 			answer[0] ^= answer[1] ^ answer[2] ^ answer[3];
721 			new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0];
722 			break;
723 		}
724 		case 1:
725 			/* Add time component and min random (i.e. 1). */
726 			new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1;
727 			break;
728 		default:
729 			/* Add only time component. */
730 			new_iss += (uint32_t)gethrestime_sec() *
731 			    tcps->tcps_iss_incr;
732 			break;
733 		}
734 		if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) {
735 			/*
736 			 * New ISS not guaranteed to be tcp_iss_incr/2
737 			 * ahead of the current tcp_snxt, so add the
738 			 * difference to tcp_iss_incr_extra.
739 			 */
740 			tcps->tcps_iss_incr_extra += adj;
741 		}
742 		/*
743 		 * If tcp_clean_death() can not perform the task now,
744 		 * drop the SYN packet and let the other side re-xmit.
745 		 * Otherwise pass the SYN packet back in, since the
746 		 * old tcp state has been cleaned up or freed.
747 		 */
748 		if (tcp_clean_death(tcp, 0) == -1)
749 			goto done;
750 		nconnp = ipcl_classify(mp, ira, ipst);
751 		if (nconnp != NULL) {
752 			TCP_STAT(tcps, tcp_time_wait_syn_success);
753 			/* Drops ref on nconnp */
754 			tcp_reinput(nconnp, mp, ira, ipst);
755 			return;
756 		}
757 		goto done;
758 	}
759 
760 	/*
761 	 * rgap is the receive window space remaining after this segment.  A
762 	 * negative value means part of the segment lies outside the window.
763 	 */
764 	if (rgap < 0) {
765 		TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
766 		TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
767 		/* Fix seg_len and make sure there is something left. */
768 		seg_len += rgap;
769 		if (seg_len <= 0) {
770 			if (flags & TH_RST) {
771 				goto done;
772 			}
773 			flags |=  TH_ACK_NEEDED;
774 			seg_len = 0;
775 			goto process_ack;
776 		}
777 	}
778 	/*
779 	 * Check whether we can update tcp_ts_recent. This test is from RFC
780 	 * 7323, section 5.3.
781 	 */
782 	if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) &&
783 	    TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
784 	    SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
785 		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
786 		tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
787 	}
788 
789 	if (seg_seq != tcp->tcp_rnxt && seg_len > 0) {
790 		/* Always ack out of order packets */
791 		flags |= TH_ACK_NEEDED;
792 		seg_len = 0;
793 	} else if (seg_len > 0) {
794 		TCPS_BUMP_MIB(tcps, tcpInClosed);
795 		TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
796 		TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
797 		tcp->tcp_cs.tcp_in_data_inorder_segs++;
798 		tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
799 	}
800 	if (flags & TH_RST) {
801 		(void) tcp_clean_death(tcp, 0);
802 		goto done;
803 	}
804 	if (flags & TH_SYN) {
805 		tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
806 		    TH_RST|TH_ACK);
807 		/*
808 		 * Do not delete the TCP structure if it is in
809 		 * TIME_WAIT state.  Refer to RFC 1122, 4.2.2.13.
810 		 */
811 		goto done;
812 	}
813 process_ack:
814 	if (flags & TH_ACK) {
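		/*
		 * In TIME_WAIT everything we sent (including the FIN) has
		 * already been acknowledged, so an ACK advancing past
		 * tcp_suna can only be acking data that was never sent.
		 */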
815 		bytes_acked = (int)(seg_ack - tcp->tcp_suna);
816 		if (bytes_acked <= 0) {
817 			if (bytes_acked == 0 && seg_len == 0 &&
818 			    new_swnd == tcp->tcp_swnd)
819 				TCPS_BUMP_MIB(tcps, tcpInDupAck);
820 		} else {
821 			/* Acks something not sent */
822 			flags |= TH_ACK_NEEDED;
823 		}
824 	}
825 	if (flags & TH_ACK_NEEDED) {
826 		/*
827 		 * Time to send an ack for some reason.
828 		 */
829 		tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
830 		    tcp->tcp_rnxt, TH_ACK);
831 	}
832 done:
833 	freemsg(mp);
834 }
835