xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_timers.c (revision e8d80663e4f91871f843bb8ad9108dc0b76dfcf3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2011 Joyent, Inc.  All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/strlog.h>
30 #include <sys/strsun.h>
31 #include <sys/squeue_impl.h>
32 #include <sys/squeue.h>
33 #include <sys/callo.h>
34 #include <sys/strsubr.h>
35 
36 #include <inet/common.h>
37 #include <inet/ip.h>
38 #include <inet/ip_ire.h>
39 #include <inet/ip_rts.h>
40 #include <inet/tcp.h>
41 #include <inet/tcp_impl.h>
42 
43 /*
44  * Implementation of TCP Timers.
45  * =============================
46  *
47  * INTERFACE:
48  *
49  * There are two basic functions and one macro dealing with tcp timers:
50  *
51  *	timeout_id_t	tcp_timeout(connp, func, time)
52  * 	clock_t		tcp_timeout_cancel(connp, timeout_id)
53  *	TCP_TIMER_RESTART(tcp, intvl)
54  *
55  * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
56  * after 'time' milliseconds. The function called by timeout() must adhere to
57  * the same restrictions as a driver soft interrupt handler - it must not sleep
58  * or call other functions that might sleep. The value returned is the opaque
59  * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
60  * cancel the request. The call to tcp_timeout() may fail in which case it
61  * returns zero. This is different from the timeout(9F) function which never
62  * fails.
63  *
64  * The call-back function 'func' always receives 'connp' as its single
65  * argument. It is always executed in the squeue corresponding to the tcp
66  * structure. The tcp structure is guaranteed to be present at the time the
67  * call-back is called.
68  *
69  * NOTE: The call-back function 'func' is never called if tcp is in
70  * 	the TCPS_CLOSED state.
71  *
72  * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
73  * request. locks acquired by the call-back routine should not be held across
74  * the call to tcp_timeout_cancel() or a deadlock may result.
75  *
76  * tcp_timeout_cancel() returns -1 if the timeout request is invalid.
77  * Otherwise, it returns an integer value greater than or equal to 0.
78  *
79  * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
80  * 	within squeue context corresponding to the tcp instance. Since the
81  *	call-back is also called via the same squeue, there are no race
82  *	conditions described in untimeout(9F) manual page since all calls are
83  *	strictly serialized.
84  *
85  *      TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
86  *	stored in tcp_timer_tid and starts a new one using
87  *	MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
88  *	and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
89  *	field.
90  *
91  * IMPLEMENTATION:
92  *
93  * TCP timers are implemented using a three-stage process. The call to
94  * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
95  * when the timer expires. The tcp_timer_callback() arranges the call of the
96  * tcp_timer_handler() function via squeue corresponding to the tcp
97  * instance. The tcp_timer_handler() calls actual requested timeout call-back
98  * and passes tcp instance as an argument to it. Information is passed between
99  * stages using the tcp_timer_t structure which contains the connp pointer, the
100  * tcp call-back to call and the timeout id returned by the timeout(9F).
101  *
102  * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
103  * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
104  * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
105  * returns the pointer to this mblk.
106  *
107  * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
108  * looks like a normal mblk without actual dblk attached to it.
109  *
110  * To optimize performance each tcp instance holds a small cache of timer
111  * mblocks. In the current implementation it caches up to two timer mblocks per
112  * tcp instance. The cache is preserved over tcp frees and is only freed when
113  * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
114  * timer processing happens on a corresponding squeue, the cache manipulation
115  * does not require any locks. Experiments show that majority of timer mblocks
116  * allocations are satisfied from the tcp cache and do not involve kmem calls.
117  *
118  * The tcp_timeout() places a refhold on the connp instance which guarantees
119  * that it will be present at the time the call-back function fires. The
120  * tcp_timer_handler() drops the reference after calling the call-back, so the
121  * call-back function does not need to manipulate the references explicitly.
122  */
123 
/*
 * kmem cache backing the timer pseudo-mblks handed out by
 * tcp_timermp_alloc() and returned by tcp_timer_free()/tcp_timermp_free().
 */
kmem_cache_t *tcp_timercache;

/* Forward declarations of the file-local routines defined further down. */
static void	tcp_ip_notify(tcp_t *);
static void	tcp_timer_callback(void *);
static void	tcp_timer_free(tcp_t *, mblk_t *);
static void	tcp_timer_handler(void *, mblk_t *, void *, ip_recv_attr_t *);
130 
131 /*
132  * tim is in millisec.
133  */
134 timeout_id_t
135 tcp_timeout(conn_t *connp, void (*f)(void *), hrtime_t tim)
136 {
137 	mblk_t *mp;
138 	tcp_timer_t *tcpt;
139 	tcp_t *tcp = connp->conn_tcp;
140 
141 	ASSERT(connp->conn_sqp != NULL);
142 
143 	TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls);
144 
145 	if (tcp->tcp_timercache == NULL) {
146 		mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC);
147 	} else {
148 		TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc);
149 		mp = tcp->tcp_timercache;
150 		tcp->tcp_timercache = mp->b_next;
151 		mp->b_next = NULL;
152 		ASSERT(mp->b_wptr == NULL);
153 	}
154 
155 	CONN_INC_REF(connp);
156 	tcpt = (tcp_timer_t *)mp->b_rptr;
157 	tcpt->connp = connp;
158 	tcpt->tcpt_proc = f;
159 	/*
160 	 * TCP timers are normal timeouts. Plus, they do not require more than
161 	 * a 10 millisecond resolution. By choosing a coarser resolution and by
162 	 * rounding up the expiration to the next resolution boundary, we can
163 	 * batch timers in the callout subsystem to make TCP timers more
164 	 * efficient. The roundup also protects short timers from expiring too
165 	 * early before they have a chance to be cancelled.
166 	 */
167 	tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp,
168 	    tim * MICROSEC, CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
169 	VERIFY(!(tcpt->tcpt_tid & CALLOUT_ID_FREE));
170 
171 	return ((timeout_id_t)mp);
172 }
173 
174 static void
175 tcp_timer_callback(void *arg)
176 {
177 	mblk_t *mp = (mblk_t *)arg;
178 	tcp_timer_t *tcpt;
179 	conn_t	*connp;
180 
181 	tcpt = (tcp_timer_t *)mp->b_rptr;
182 	connp = tcpt->connp;
183 	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp,
184 	    NULL, SQ_FILL, SQTAG_TCP_TIMER);
185 }
186 
187 /* ARGSUSED */
188 static void
189 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
190 {
191 	tcp_timer_t *tcpt;
192 	conn_t *connp = (conn_t *)arg;
193 	tcp_t *tcp = connp->conn_tcp;
194 
195 	tcpt = (tcp_timer_t *)mp->b_rptr;
196 	ASSERT(connp == tcpt->connp);
197 	ASSERT((squeue_t *)arg2 == connp->conn_sqp);
198 
199 	if (tcpt->tcpt_tid & CALLOUT_ID_FREE) {
200 		/*
201 		 * This timeout was cancelled after it was enqueued to the
202 		 * squeue; free the timer and return.
203 		 */
204 		tcp_timer_free(connp->conn_tcp, mp);
205 		return;
206 	}
207 
208 	/*
209 	 * If the TCP has reached the closed state, don't proceed any
210 	 * further. This TCP logically does not exist on the system.
211 	 * tcpt_proc could for example access queues, that have already
212 	 * been qprocoff'ed off.
213 	 */
214 	if (tcp->tcp_state != TCPS_CLOSED) {
215 		(*tcpt->tcpt_proc)(connp);
216 	} else {
217 		tcp->tcp_timer_tid = 0;
218 	}
219 
220 	tcp_timer_free(connp->conn_tcp, mp);
221 }
222 
223 /*
224  * There is potential race with untimeout and the handler firing at the same
225  * time. The mblock may be freed by the handler while we are trying to use
226  * it. But since both should execute on the same squeue, this race should not
227  * occur.
228  */
229 clock_t
230 tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
231 {
232 	mblk_t	*mp = (mblk_t *)id;
233 	tcp_timer_t *tcpt;
234 	clock_t delta;
235 
236 	TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs);
237 
238 	if (mp == NULL)
239 		return (-1);
240 
241 	tcpt = (tcp_timer_t *)mp->b_rptr;
242 	ASSERT(tcpt->connp == connp);
243 
244 	delta = untimeout_default(tcpt->tcpt_tid, 0);
245 
246 	if (delta >= 0) {
247 		TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled);
248 		tcp_timer_free(connp->conn_tcp, mp);
249 		CONN_DEC_REF(connp);
250 	} else {
251 		/*
252 		 * If we were unable to untimeout successfully, it has already
253 		 * been enqueued on the squeue; mark the ID with the free
254 		 * bit.	 This bit can never be set in a valid identifier, and
255 		 * we'll use it to prevent the timeout from being executed.
256 		 * And note that we're within the squeue perimeter here, so
257 		 * we don't need to worry about racing with timer handling
258 		 * (which also executes within the perimeter).
259 		 */
260 		tcpt->tcpt_tid |= CALLOUT_ID_FREE;
261 		delta = 0;
262 	}
263 
264 	return (TICK_TO_MSEC(delta));
265 }
266 
267 /*
268  * Allocate space for the timer event. The allocation looks like mblk, but it is
269  * not a proper mblk. To avoid confusion we set b_wptr to NULL.
270  *
271  * Dealing with failures: If we can't allocate from the timer cache we try
272  * allocating from dblock caches using allocb_tryhard(). In this case b_wptr
273  * points to b_rptr.
274  * If we can't allocate anything using allocb_tryhard(), we perform a last
275  * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and
276  * save the actual allocation size in b_datap.
277  */
278 mblk_t *
279 tcp_timermp_alloc(int kmflags)
280 {
281 	mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache,
282 	    kmflags & ~KM_PANIC);
283 
284 	if (mp != NULL) {
285 		mp->b_next = mp->b_prev = NULL;
286 		mp->b_rptr = (uchar_t *)(&mp[1]);
287 		mp->b_wptr = NULL;
288 		mp->b_datap = NULL;
289 		mp->b_queue = NULL;
290 		mp->b_cont = NULL;
291 	} else if (kmflags & KM_PANIC) {
292 		/*
293 		 * Failed to allocate memory for the timer. Try allocating from
294 		 * dblock caches.
295 		 */
296 		/* ipclassifier calls this from a constructor - hence no tcps */
297 		TCP_G_STAT(tcp_timermp_allocfail);
298 		mp = allocb_tryhard(sizeof (tcp_timer_t));
299 		if (mp == NULL) {
300 			size_t size = 0;
301 			/*
302 			 * Memory is really low. Try tryhard allocation.
303 			 *
304 			 * ipclassifier calls this from a constructor -
305 			 * hence no tcps
306 			 */
307 			TCP_G_STAT(tcp_timermp_allocdblfail);
308 			mp = kmem_alloc_tryhard(sizeof (mblk_t) +
309 			    sizeof (tcp_timer_t), &size, kmflags);
310 			mp->b_rptr = (uchar_t *)(&mp[1]);
311 			mp->b_next = mp->b_prev = NULL;
312 			mp->b_wptr = (uchar_t *)-1;
313 			mp->b_datap = (dblk_t *)size;
314 			mp->b_queue = NULL;
315 			mp->b_cont = NULL;
316 		}
317 		ASSERT(mp->b_wptr != NULL);
318 	}
319 	/* ipclassifier calls this from a constructor - hence no tcps */
320 	TCP_G_DBGSTAT(tcp_timermp_alloced);
321 
322 	return (mp);
323 }
324 
325 /*
326  * Free per-tcp timer cache.
327  * It can only contain entries from tcp_timercache.
328  */
329 void
330 tcp_timermp_free(tcp_t *tcp)
331 {
332 	mblk_t *mp;
333 
334 	while ((mp = tcp->tcp_timercache) != NULL) {
335 		ASSERT(mp->b_wptr == NULL);
336 		tcp->tcp_timercache = tcp->tcp_timercache->b_next;
337 		kmem_cache_free(tcp_timercache, mp);
338 	}
339 }
340 
341 /*
342  * Free timer event. Put it on the per-tcp timer cache if there is not too many
343  * events there already (currently at most two events are cached).
344  * If the event is not allocated from the timer cache, free it right away.
345  */
346 static void
347 tcp_timer_free(tcp_t *tcp, mblk_t *mp)
348 {
349 	mblk_t *mp1 = tcp->tcp_timercache;
350 
351 	if (mp->b_wptr != NULL) {
352 		/*
353 		 * This allocation is not from a timer cache, free it right
354 		 * away.
355 		 */
356 		if (mp->b_wptr != (uchar_t *)-1)
357 			freeb(mp);
358 		else
359 			kmem_free(mp, (size_t)mp->b_datap);
360 	} else if (mp1 == NULL || mp1->b_next == NULL) {
361 		/* Cache this timer block for future allocations */
362 		mp->b_rptr = (uchar_t *)(&mp[1]);
363 		mp->b_next = mp1;
364 		tcp->tcp_timercache = mp;
365 	} else {
366 		kmem_cache_free(tcp_timercache, mp);
367 		TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed);
368 	}
369 }
370 
371 /*
372  * Stop all TCP timers.
373  */
374 void
375 tcp_timers_stop(tcp_t *tcp)
376 {
377 	if (tcp->tcp_timer_tid != 0) {
378 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
379 		tcp->tcp_timer_tid = 0;
380 	}
381 	if (tcp->tcp_ka_tid != 0) {
382 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
383 		tcp->tcp_ka_tid = 0;
384 	}
385 	if (tcp->tcp_ack_tid != 0) {
386 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
387 		tcp->tcp_ack_tid = 0;
388 	}
389 	if (tcp->tcp_push_tid != 0) {
390 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
391 		tcp->tcp_push_tid = 0;
392 	}
393 	if (tcp->tcp_reass_tid != 0) {
394 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid);
395 		tcp->tcp_reass_tid = 0;
396 	}
397 }
398 
399 /*
400  * Timer callback routine for keepalive probe.  We do a fake resend of
401  * last ACKed byte.  Then set a timer using RTO.  When the timer expires,
402  * check to see if we have heard anything from the other end for the last
403  * RTO period.  If we have, set the timer to expire for another
404  * tcp_keepalive_intrvl and check again.  If we have not, set a timer using
405  * RTO << 1 and check again when it expires.  Keep exponentially increasing
406  * the timeout if we have not heard from the other side.  If for more than
407  * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
408  * kill the connection unless the keepalive abort threshold is 0.  In
409  * that case, we will probe "forever."
410  * If tcp_ka_cnt and tcp_ka_rinterval are non-zero, then we do not follow
411  * the exponential backoff, but send probes tcp_ka_cnt times in regular
412  * intervals of tcp_ka_rinterval milliseconds until we hear back from peer.
413  * Kill the connection if we don't hear back from peer after tcp_ka_cnt
414  * probes are sent.
415  */
416 void
417 tcp_keepalive_timer(void *arg)
418 {
419 	mblk_t	*mp;
420 	conn_t	*connp = (conn_t *)arg;
421 	tcp_t  	*tcp = connp->conn_tcp;
422 	int32_t	firetime;
423 	int32_t	idletime;
424 	int32_t	ka_intrvl;
425 	tcp_stack_t	*tcps = tcp->tcp_tcps;
426 
427 	tcp->tcp_ka_tid = 0;
428 
429 	if (tcp->tcp_fused)
430 		return;
431 
432 	TCPS_BUMP_MIB(tcps, tcpTimKeepalive);
433 	ka_intrvl = tcp->tcp_ka_interval;
434 
435 	/*
436 	 * Keepalive probe should only be sent if the application has not
437 	 * done a close on the connection.
438 	 */
439 	if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
440 		return;
441 	}
442 	/* Timer fired too early, restart it. */
443 	if (tcp->tcp_state < TCPS_ESTABLISHED) {
444 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
445 		    ka_intrvl);
446 		return;
447 	}
448 
449 	idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time);
450 	/*
451 	 * If we have not heard from the other side for a long
452 	 * time, kill the connection unless the keepalive abort
453 	 * threshold is 0.  In that case, we will probe "forever."
454 	 */
455 	if (tcp->tcp_ka_abort_thres != 0 &&
456 	    idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
457 		TCPS_BUMP_MIB(tcps, tcpTimKeepaliveDrop);
458 		(void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
459 		    tcp->tcp_client_errno : ETIMEDOUT);
460 		return;
461 	}
462 
463 	if (tcp->tcp_snxt == tcp->tcp_suna &&
464 	    idletime >= ka_intrvl) {
465 		/* Fake resend of last ACKed byte. */
466 		mblk_t	*mp1 = allocb(1, BPRI_LO);
467 
468 		if (mp1 != NULL) {
469 			*mp1->b_wptr++ = '\0';
470 			mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
471 			    tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
472 			freeb(mp1);
473 			/*
474 			 * if allocation failed, fall through to start the
475 			 * timer back.
476 			 */
477 			if (mp != NULL) {
478 				tcp_send_data(tcp, mp);
479 				TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe);
480 				if (tcp->tcp_ka_rinterval) {
481 					firetime = tcp->tcp_ka_rinterval;
482 				} else if (tcp->tcp_ka_last_intrvl != 0) {
483 					int max;
484 					/*
485 					 * We should probe again at least
486 					 * in ka_intrvl, but not more than
487 					 * tcp_rto_max.
488 					 */
489 					max = tcp->tcp_rto_max;
490 					firetime = MIN(ka_intrvl - 1,
491 					    tcp->tcp_ka_last_intrvl << 1);
492 					if (firetime > max)
493 						firetime = max;
494 				} else {
495 					firetime = tcp->tcp_rto;
496 				}
497 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
498 				    tcp_keepalive_timer, firetime);
499 				tcp->tcp_ka_last_intrvl = firetime;
500 				return;
501 			}
502 		}
503 	} else {
504 		tcp->tcp_ka_last_intrvl = 0;
505 	}
506 
507 	/* firetime can be negative if (mp1 == NULL || mp == NULL) */
508 	if ((firetime = ka_intrvl - idletime) < 0) {
509 		firetime = ka_intrvl;
510 	}
511 	tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, firetime);
512 }
513 
514 void
515 tcp_reass_timer(void *arg)
516 {
517 	conn_t *connp = (conn_t *)arg;
518 	tcp_t *tcp = connp->conn_tcp;
519 
520 	tcp->tcp_reass_tid = 0;
521 	if (tcp->tcp_reass_head == NULL)
522 		return;
523 	ASSERT(tcp->tcp_reass_tail != NULL);
524 	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
525 		tcp_sack_remove(tcp->tcp_sack_list,
526 		    TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk);
527 	}
528 	tcp_close_mpp(&tcp->tcp_reass_head);
529 	tcp->tcp_reass_tail = NULL;
530 	TCP_STAT(tcp->tcp_tcps, tcp_reass_timeout);
531 }
532 
533 /* This function handles the push timeout. */
534 void
535 tcp_push_timer(void *arg)
536 {
537 	conn_t	*connp = (conn_t *)arg;
538 	tcp_t *tcp = connp->conn_tcp;
539 
540 	TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt);
541 
542 	ASSERT(tcp->tcp_listener == NULL);
543 
544 	ASSERT(!IPCL_IS_NONSTR(connp));
545 
546 	tcp->tcp_push_tid = 0;
547 
548 	if (tcp->tcp_rcv_list != NULL &&
549 	    tcp_rcv_drain(tcp) == TH_ACK_NEEDED)
550 		tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
551 }
552 
553 /*
554  * This function handles delayed ACK timeout.
555  */
556 void
557 tcp_ack_timer(void *arg)
558 {
559 	conn_t	*connp = (conn_t *)arg;
560 	tcp_t *tcp = connp->conn_tcp;
561 	mblk_t *mp;
562 	tcp_stack_t	*tcps = tcp->tcp_tcps;
563 
564 	TCP_DBGSTAT(tcps, tcp_ack_timer_cnt);
565 
566 	tcp->tcp_ack_tid = 0;
567 
568 	if (tcp->tcp_fused)
569 		return;
570 
571 	/*
572 	 * Do not send ACK if there is no outstanding unack'ed data.
573 	 */
574 	if (tcp->tcp_rnxt == tcp->tcp_rack) {
575 		return;
576 	}
577 
578 	if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) {
579 		/*
580 		 * Make sure we don't allow deferred ACKs to result in
581 		 * timer-based ACKing.  If we have held off an ACK
582 		 * when there was more than an mss here, and the timer
583 		 * goes off, we have to worry about the possibility
584 		 * that the sender isn't doing slow-start, or is out
585 		 * of step with us for some other reason.  We fall
586 		 * permanently back in the direction of
587 		 * ACK-every-other-packet as suggested in RFC 1122.
588 		 */
589 		if (tcp->tcp_rack_abs_max > 2)
590 			tcp->tcp_rack_abs_max--;
591 		tcp->tcp_rack_cur_max = 2;
592 	}
593 	mp = tcp_ack_mp(tcp);
594 
595 	if (mp != NULL) {
596 		BUMP_LOCAL(tcp->tcp_obsegs);
597 		TCPS_BUMP_MIB(tcps, tcpOutAck);
598 		TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
599 		tcp_send_data(tcp, mp);
600 	}
601 }
602 
603 /*
604  * Notify IP that we are having trouble with this connection.  IP should
605  * make note so it can potentially use a different IRE.
606  */
607 static void
608 tcp_ip_notify(tcp_t *tcp)
609 {
610 	conn_t		*connp = tcp->tcp_connp;
611 	ire_t		*ire;
612 
613 	/*
614 	 * Note: in the case of source routing we want to blow away the
615 	 * route to the first source route hop.
616 	 */
617 	ire = connp->conn_ixa->ixa_ire;
618 	if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
619 		if (ire->ire_ipversion == IPV4_VERSION) {
620 			/*
621 			 * As per RFC 1122, we send an RTM_LOSING to inform
622 			 * routing protocols.
623 			 */
624 			ip_rts_change(RTM_LOSING, ire->ire_addr,
625 			    ire->ire_gateway_addr, ire->ire_mask,
626 			    connp->conn_laddr_v4,  0, 0, 0,
627 			    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
628 			    ire->ire_ipst);
629 		}
630 		(void) ire_no_good(ire);
631 	}
632 }
633 
634 /*
635  * tcp_timer is the timer service routine.  It handles the retransmission,
636  * FIN_WAIT_2 flush, and zero window probe timeout events.  It figures out
637  * from the state of the tcp instance what kind of action needs to be done
638  * at the time it is called.
639  */
640 void
641 tcp_timer(void *arg)
642 {
643 	mblk_t		*mp;
644 	clock_t		first_threshold;
645 	clock_t		second_threshold;
646 	clock_t		ms;
647 	uint32_t	mss;
648 	conn_t		*connp = (conn_t *)arg;
649 	tcp_t		*tcp = connp->conn_tcp;
650 	tcp_stack_t	*tcps = tcp->tcp_tcps;
651 	boolean_t	dont_timeout = B_FALSE;
652 
653 	tcp->tcp_timer_tid = 0;
654 
655 	if (tcp->tcp_fused)
656 		return;
657 
658 	first_threshold =  tcp->tcp_first_timer_threshold;
659 	second_threshold = tcp->tcp_second_timer_threshold;
660 	switch (tcp->tcp_state) {
661 	case TCPS_IDLE:
662 	case TCPS_BOUND:
663 	case TCPS_LISTEN:
664 		return;
665 	case TCPS_SYN_RCVD: {
666 		tcp_t	*listener = tcp->tcp_listener;
667 
668 		if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
669 			/* it's our first timeout */
670 			tcp->tcp_syn_rcvd_timeout = 1;
671 			mutex_enter(&listener->tcp_eager_lock);
672 			listener->tcp_syn_rcvd_timeout++;
673 			if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) {
674 				/*
675 				 * Make this eager available for drop if we
676 				 * need to drop one to accomodate a new
677 				 * incoming SYN request.
678 				 */
679 				MAKE_DROPPABLE(listener, tcp);
680 			}
681 			if (!listener->tcp_syn_defense &&
682 			    (listener->tcp_syn_rcvd_timeout >
683 			    (tcps->tcps_conn_req_max_q0 >> 2)) &&
684 			    (tcps->tcps_conn_req_max_q0 > 200)) {
685 				/* We may be under attack. Put on a defense. */
686 				listener->tcp_syn_defense = B_TRUE;
687 				cmn_err(CE_WARN, "High TCP connect timeout "
688 				    "rate! System (port %d) may be under a "
689 				    "SYN flood attack!",
690 				    ntohs(listener->tcp_connp->conn_lport));
691 
692 				listener->tcp_ip_addr_cache = kmem_zalloc(
693 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
694 				    KM_NOSLEEP);
695 			}
696 			mutex_exit(&listener->tcp_eager_lock);
697 		} else if (listener != NULL) {
698 			mutex_enter(&listener->tcp_eager_lock);
699 			tcp->tcp_syn_rcvd_timeout++;
700 			if (tcp->tcp_syn_rcvd_timeout > 1 &&
701 			    !tcp->tcp_closemp_used) {
702 				/*
703 				 * This is our second timeout. Put the tcp in
704 				 * the list of droppable eagers to allow it to
705 				 * be dropped, if needed. We don't check
706 				 * whether tcp_dontdrop is set or not to
707 				 * protect ourselve from a SYN attack where a
708 				 * remote host can spoof itself as one of the
709 				 * good IP source and continue to hold
710 				 * resources too long.
711 				 */
712 				MAKE_DROPPABLE(listener, tcp);
713 			}
714 			mutex_exit(&listener->tcp_eager_lock);
715 		}
716 	}
717 		/* FALLTHRU */
718 	case TCPS_SYN_SENT:
719 		first_threshold =  tcp->tcp_first_ctimer_threshold;
720 		second_threshold = tcp->tcp_second_ctimer_threshold;
721 
722 		/*
723 		 * If an app has set the second_threshold to 0, it means that
724 		 * we need to retransmit forever, unless this is a passive
725 		 * open.  We need to set second_threshold back to a normal
726 		 * value such that later comparison with it still makes
727 		 * sense.  But we set dont_timeout to B_TRUE so that we will
728 		 * never time out.
729 		 */
730 		if (second_threshold == 0) {
731 			second_threshold = tcps->tcps_ip_abort_linterval;
732 			if (tcp->tcp_active_open)
733 				dont_timeout = B_TRUE;
734 		}
735 		break;
736 	case TCPS_ESTABLISHED:
737 	case TCPS_CLOSE_WAIT:
738 		/*
739 		 * If the end point has not been closed, TCP can retransmit
740 		 * forever.  But if the end point is closed, the normal
741 		 * timeout applies.
742 		 */
743 		if (second_threshold == 0) {
744 			second_threshold = tcps->tcps_ip_abort_linterval;
745 			dont_timeout = B_TRUE;
746 		}
747 		/* FALLTHRU */
748 	case TCPS_FIN_WAIT_1:
749 	case TCPS_CLOSING:
750 	case TCPS_LAST_ACK:
751 		/* If we have data to rexmit */
752 		if (tcp->tcp_suna != tcp->tcp_snxt) {
753 			clock_t	time_to_wait;
754 
755 			TCPS_BUMP_MIB(tcps, tcpTimRetrans);
756 			if (!tcp->tcp_xmit_head)
757 				break;
758 			time_to_wait = ddi_get_lbolt() -
759 			    (clock_t)tcp->tcp_xmit_head->b_prev;
760 			time_to_wait = tcp->tcp_rto -
761 			    TICK_TO_MSEC(time_to_wait);
762 			/*
763 			 * If the timer fires too early, 1 clock tick earlier,
764 			 * restart the timer.
765 			 */
766 			if (time_to_wait > msec_per_tick) {
767 				TCP_STAT(tcps, tcp_timer_fire_early);
768 				TCP_TIMER_RESTART(tcp, time_to_wait);
769 				return;
770 			}
771 			/*
772 			 * When we probe zero windows, we force the swnd open.
773 			 * If our peer acks with a closed window swnd will be
774 			 * set to zero by tcp_rput(). As long as we are
775 			 * receiving acks tcp_rput will
776 			 * reset 'tcp_ms_we_have_waited' so as not to trip the
777 			 * first and second interval actions.  NOTE: the timer
778 			 * interval is allowed to continue its exponential
779 			 * backoff.
780 			 */
781 			if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
782 				if (connp->conn_debug) {
783 					(void) strlog(TCP_MOD_ID, 0, 1,
784 					    SL_TRACE, "tcp_timer: zero win");
785 				}
786 			} else {
787 				/*
788 				 * After retransmission, we need to do
789 				 * slow start.  Set the ssthresh to one
790 				 * half of current effective window and
791 				 * cwnd to one MSS.  Also reset
792 				 * tcp_cwnd_cnt.
793 				 *
794 				 * Note that if tcp_ssthresh is reduced because
795 				 * of ECN, do not reduce it again unless it is
796 				 * already one window of data away (tcp_cwr
797 				 * should then be cleared) or this is a
798 				 * timeout for a retransmitted segment.
799 				 */
800 				uint32_t npkt;
801 
802 				if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
803 					npkt = ((tcp->tcp_timer_backoff ?
804 					    tcp->tcp_cwnd_ssthresh :
805 					    tcp->tcp_snxt -
806 					    tcp->tcp_suna) >> 1) / tcp->tcp_mss;
807 					tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
808 					    tcp->tcp_mss;
809 				}
810 				tcp->tcp_cwnd = tcp->tcp_mss;
811 				tcp->tcp_cwnd_cnt = 0;
812 				if (tcp->tcp_ecn_ok) {
813 					tcp->tcp_cwr = B_TRUE;
814 					tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
815 					tcp->tcp_ecn_cwr_sent = B_FALSE;
816 				}
817 			}
818 			break;
819 		}
820 		/*
821 		 * We have something to send yet we cannot send.  The
822 		 * reason can be:
823 		 *
824 		 * 1. Zero send window: we need to do zero window probe.
825 		 * 2. Zero cwnd: because of ECN, we need to "clock out
826 		 * segments.
827 		 * 3. SWS avoidance: receiver may have shrunk window,
828 		 * reset our knowledge.
829 		 *
830 		 * Note that condition 2 can happen with either 1 or
831 		 * 3.  But 1 and 3 are exclusive.
832 		 */
833 		if (tcp->tcp_unsent != 0) {
834 			/*
835 			 * Should not hold the zero-copy messages for too long.
836 			 */
837 			if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
838 				tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
839 				    tcp->tcp_xmit_head, B_TRUE);
840 
841 			if (tcp->tcp_cwnd == 0) {
842 				/*
843 				 * Set tcp_cwnd to 1 MSS so that a
844 				 * new segment can be sent out.  We
845 				 * are "clocking out" new data when
846 				 * the network is really congested.
847 				 */
848 				ASSERT(tcp->tcp_ecn_ok);
849 				tcp->tcp_cwnd = tcp->tcp_mss;
850 			}
851 			if (tcp->tcp_swnd == 0) {
852 				/* Extend window for zero window probe */
853 				tcp->tcp_swnd++;
854 				tcp->tcp_zero_win_probe = B_TRUE;
855 				TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
856 			} else {
857 				/*
858 				 * Handle timeout from sender SWS avoidance.
859 				 * Reset our knowledge of the max send window
860 				 * since the receiver might have reduced its
861 				 * receive buffer.  Avoid setting tcp_max_swnd
862 				 * to one since that will essentially disable
863 				 * the SWS checks.
864 				 *
865 				 * Note that since we don't have a SWS
866 				 * state variable, if the timeout is set
867 				 * for ECN but not for SWS, this
868 				 * code will also be executed.  This is
869 				 * fine as tcp_max_swnd is updated
870 				 * constantly and it will not affect
871 				 * anything.
872 				 */
873 				tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
874 			}
875 			tcp_wput_data(tcp, NULL, B_FALSE);
876 			return;
877 		}
878 		/* Is there a FIN that needs to be to re retransmitted? */
879 		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
880 		    !tcp->tcp_fin_acked)
881 			break;
882 		/* Nothing to do, return without restarting timer. */
883 		TCP_STAT(tcps, tcp_timer_fire_miss);
884 		return;
885 	case TCPS_FIN_WAIT_2:
886 		/*
887 		 * User closed the TCP endpoint and peer ACK'ed our FIN.
888 		 * We waited some time for for peer's FIN, but it hasn't
889 		 * arrived.  We flush the connection now to avoid
890 		 * case where the peer has rebooted.
891 		 */
892 		if (TCP_IS_DETACHED(tcp)) {
893 			(void) tcp_clean_death(tcp, 0);
894 		} else {
895 			TCP_TIMER_RESTART(tcp,
896 			    tcp->tcp_fin_wait_2_flush_interval);
897 		}
898 		return;
899 	case TCPS_TIME_WAIT:
900 		(void) tcp_clean_death(tcp, 0);
901 		return;
902 	default:
903 		if (connp->conn_debug) {
904 			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
905 			    "tcp_timer: strange state (%d) %s",
906 			    tcp->tcp_state, tcp_display(tcp, NULL,
907 			    DISP_PORT_ONLY));
908 		}
909 		return;
910 	}
911 
912 	/*
913 	 * If the system is under memory pressure or the max number of
914 	 * connections have been established for the listener, be more
915 	 * aggressive in aborting connections.
916 	 */
917 	if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL &&
918 	    tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) {
919 		second_threshold = tcp_early_abort * SECONDS;
920 
921 		/* We will ignore the never timeout promise in this case... */
922 		dont_timeout = B_FALSE;
923 	}
924 
925 	ASSERT(second_threshold != 0);
926 
927 	if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
928 		/*
929 		 * Should not hold the zero-copy messages for too long.
930 		 */
931 		if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
932 			tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
933 			    tcp->tcp_xmit_head, B_TRUE);
934 
935 		if (dont_timeout) {
936 			/*
937 			 * Reset tcp_ms_we_have_waited to avoid overflow since
938 			 * we are going to retransmit forever.
939 			 */
940 			tcp->tcp_ms_we_have_waited = second_threshold;
941 			goto timer_rexmit;
942 		}
943 
944 		/*
945 		 * For zero window probe, we need to send indefinitely,
946 		 * unless we have not heard from the other side for some
947 		 * time...
948 		 */
949 		if ((tcp->tcp_zero_win_probe == 0) ||
950 		    (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) >
951 		    second_threshold)) {
952 			TCPS_BUMP_MIB(tcps, tcpTimRetransDrop);
953 			/*
954 			 * If TCP is in SYN_RCVD state, send back a
955 			 * RST|ACK as BSD does.  Note that tcp_zero_win_probe
956 			 * should be zero in TCPS_SYN_RCVD state.
957 			 */
958 			if (tcp->tcp_state == TCPS_SYN_RCVD) {
959 				tcp_xmit_ctl("tcp_timer: RST sent on timeout "
960 				    "in SYN_RCVD",
961 				    tcp, tcp->tcp_snxt,
962 				    tcp->tcp_rnxt, TH_RST | TH_ACK);
963 			}
964 			(void) tcp_clean_death(tcp,
965 			    tcp->tcp_client_errno ?
966 			    tcp->tcp_client_errno : ETIMEDOUT);
967 			return;
968 		} else {
969 			/*
970 			 * If the system is under memory pressure, we also
971 			 * abort connection in zero window probing.
972 			 */
973 			if (tcps->tcps_reclaim) {
974 				(void) tcp_clean_death(tcp,
975 				    tcp->tcp_client_errno ?
976 				    tcp->tcp_client_errno : ETIMEDOUT);
977 				TCP_STAT(tcps, tcp_zwin_mem_drop);
978 				return;
979 			}
980 			/*
981 			 * Set tcp_ms_we_have_waited to second_threshold
982 			 * so that in next timeout, we will do the above
983 			 * check (ddi_get_lbolt() - tcp_last_recv_time).
984 			 * This is also to avoid overflow.
985 			 *
986 			 * We don't need to decrement tcp_timer_backoff
987 			 * to avoid overflow because it will be decremented
988 			 * later if new timeout value is greater than
989 			 * tcp_rto_max.  In the case when tcp_rto_max is
990 			 * greater than second_threshold, it means that we
991 			 * will wait longer than second_threshold to send
992 			 * the next
993 			 * window probe.
994 			 */
995 			tcp->tcp_ms_we_have_waited = second_threshold;
996 		}
997 	} else if (ms > first_threshold) {
998 		/*
999 		 * Should not hold the zero-copy messages for too long.
1000 		 */
1001 		if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
1002 			tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
1003 			    tcp->tcp_xmit_head, B_TRUE);
1004 
1005 		/*
1006 		 * We have been retransmitting for too long...  The RTT
1007 		 * we calculated is probably incorrect.  Reinitialize it.
1008 		 * Need to compensate for 0 tcp_rtt_sa.  Reset
1009 		 * tcp_rtt_update so that we won't accidentally cache a
1010 		 * bad value.  But only do this if this is not a zero
1011 		 * window probe.
1012 		 */
1013 		if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
1014 			tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
1015 			    (tcp->tcp_rtt_sa >> 5);
1016 			tcp->tcp_rtt_sa = 0;
1017 			tcp_ip_notify(tcp);
1018 			tcp->tcp_rtt_update = 0;
1019 		}
1020 	}
1021 
1022 timer_rexmit:
1023 	tcp->tcp_timer_backoff++;
1024 	if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
1025 	    tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
1026 	    tcp->tcp_rto_min) {
1027 		/*
1028 		 * The computed RTO is below tcp_rto_min, so we will
1029 		 * use tcp_rto_min as the base RTO value and do the
1030 		 * backoff on it.
1031 		 */
1032 		ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
1033 	} else {
1034 		ms <<= tcp->tcp_timer_backoff;
1035 	}
1036 	if (ms > tcp->tcp_rto_max) {
1037 		ms = tcp->tcp_rto_max;
1038 		/*
1039 		 * ms is at max, decrement tcp_timer_backoff to avoid
1040 		 * overflow.
1041 		 */
1042 		tcp->tcp_timer_backoff--;
1043 	}
1044 	tcp->tcp_ms_we_have_waited += ms;
1045 	if (tcp->tcp_zero_win_probe == 0) {
1046 		tcp->tcp_rto = ms;
1047 	}
1048 	TCP_TIMER_RESTART(tcp, ms);
1049 	/*
1050 	 * This is after a timeout and tcp_rto is backed off.  Set
1051 	 * tcp_set_timer to 1 so that next time RTO is updated, we will
1052 	 * restart the timer with a correct value.
1053 	 */
1054 	tcp->tcp_set_timer = 1;
1055 	mss = tcp->tcp_snxt - tcp->tcp_suna;
1056 	if (mss > tcp->tcp_mss)
1057 		mss = tcp->tcp_mss;
1058 	if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
1059 		mss = tcp->tcp_swnd;
1060 
1061 	if ((mp = tcp->tcp_xmit_head) != NULL)
1062 		mp->b_prev = (mblk_t *)ddi_get_lbolt();
1063 	mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
1064 	    B_TRUE);
1065 
1066 	/*
1067 	 * When slow start after retransmission begins, start with
1068 	 * this seq no.  tcp_rexmit_max marks the end of special slow
1069 	 * start phase.  tcp_snd_burst controls how many segments
1070 	 * can be sent because of an ack.
1071 	 */
1072 	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
1073 	tcp->tcp_snd_burst = TCP_CWND_SS;
1074 	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
1075 	    (tcp->tcp_unsent == 0)) {
1076 		tcp->tcp_rexmit_max = tcp->tcp_fss;
1077 	} else {
1078 		tcp->tcp_rexmit_max = tcp->tcp_snxt;
1079 	}
1080 	tcp->tcp_rexmit = B_TRUE;
1081 	tcp->tcp_dupack_cnt = 0;
1082 
1083 	/*
1084 	 * Remove all rexmit SACK blk to start from fresh.
1085 	 */
1086 	if (tcp->tcp_snd_sack_ok)
1087 		TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
1088 	if (mp == NULL) {
1089 		return;
1090 	}
1091 
1092 	tcp->tcp_csuna = tcp->tcp_snxt;
1093 	TCPS_BUMP_MIB(tcps, tcpRetransSegs);
1094 	TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
1095 	tcp_send_data(tcp, mp);
1096 
1097 }
1098 
1099 /*
1100  * Handle lingering timeouts. This function is called when the SO_LINGER timeout
1101  * expires.
1102  */
1103 void
1104 tcp_close_linger_timeout(void *arg)
1105 {
1106 	conn_t	*connp = (conn_t *)arg;
1107 	tcp_t 	*tcp = connp->conn_tcp;
1108 
1109 	tcp->tcp_client_errno = ETIMEDOUT;
1110 	tcp_stop_lingering(tcp);
1111 }
1112