xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_timers.c (revision 5a45682c3e7b01faa1761ab8d86f0bed4cc1d363)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/strlog.h>
29 #include <sys/strsun.h>
30 #include <sys/squeue_impl.h>
31 #include <sys/squeue.h>
32 #include <sys/callo.h>
33 #include <sys/strsubr.h>
34 
35 #include <inet/common.h>
36 #include <inet/ip.h>
37 #include <inet/ip_ire.h>
38 #include <inet/ip_rts.h>
39 #include <inet/tcp.h>
40 #include <inet/tcp_impl.h>
41 
42 /*
43  * Implementation of TCP Timers.
44  * =============================
45  *
46  * INTERFACE:
47  *
48  * There are two basic functions dealing with tcp timers:
49  *
50  *	timeout_id_t	tcp_timeout(connp, func, time)
51  * 	clock_t		tcp_timeout_cancel(connp, timeout_id)
52  *	TCP_TIMER_RESTART(tcp, intvl)
53  *
 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
 * after 'time' milliseconds have passed. The function called by timeout() must
 * adhere to the same restrictions as a driver soft interrupt handler - it must
 * not sleep or call other functions that might sleep. The value returned is
 * the opaque non-zero timeout identifier that can be passed to
 * tcp_timeout_cancel() to cancel the request. The call to tcp_timeout() may
 * fail in which case it returns zero. This is different from the timeout(9F)
 * function which never fails.
62  *
63  * The call-back function 'func' always receives 'connp' as its single
64  * argument. It is always executed in the squeue corresponding to the tcp
65  * structure. The tcp structure is guaranteed to be present at the time the
66  * call-back is called.
67  *
68  * NOTE: The call-back function 'func' is never called if tcp is in
69  * 	the TCPS_CLOSED state.
70  *
 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
 * request. Locks acquired by the call-back routine should not be held across
 * the call to tcp_timeout_cancel() or a deadlock may result.
74  *
75  * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
76  * Otherwise, it returns an integer value greater than or equal to 0. In
77  * particular, if the call-back function is already placed on the squeue, it can
78  * not be canceled.
79  *
80  * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
81  * 	within squeue context corresponding to the tcp instance. Since the
82  *	call-back is also called via the same squeue, there are no race
83  *	conditions described in untimeout(9F) manual page since all calls are
84  *	strictly serialized.
85  *
86  *      TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
87  *	stored in tcp_timer_tid and starts a new one using
88  *	MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
89  *	and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
90  *	field.
91  *
92  * NOTE: since the timeout cancellation is not guaranteed, the cancelled
93  *	call-back may still be called, so it is possible tcp_timer() will be
94  *	called several times. This should not be a problem since tcp_timer()
95  *	should always check the tcp instance state.
96  *
97  *
98  * IMPLEMENTATION:
99  *
 * TCP timers are implemented using a three-stage process. The call to
 * tcp_timeout() uses the timeout(9F) function to call the
 * tcp_timer_callback() function when the timer expires. The
 * tcp_timer_callback() arranges the call of the tcp_timer_handler() function
 * via the squeue corresponding to the tcp instance. The tcp_timer_handler()
 * calls the actual requested timeout call-back and passes the connp as an
 * argument to it. Information is passed between stages using the tcp_timer_t
 * structure which contains the connp pointer, the tcp call-back to call and
 * the timeout id returned by timeout(9F).
108  *
109  * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
110  * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
111  * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
112  * returns the pointer to this mblk.
113  *
114  * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
115  * looks like a normal mblk without actual dblk attached to it.
116  *
117  * To optimize performance each tcp instance holds a small cache of timer
118  * mblocks. In the current implementation it caches up to two timer mblocks per
119  * tcp instance. The cache is preserved over tcp frees and is only freed when
120  * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
121  * timer processing happens on a corresponding squeue, the cache manipulation
122  * does not require any locks. Experiments show that majority of timer mblocks
123  * allocations are satisfied from the tcp cache and do not involve kmem calls.
124  *
125  * The tcp_timeout() places a refhold on the connp instance which guarantees
126  * that it will be present at the time the call-back function fires. The
127  * tcp_timer_handler() drops the reference after calling the call-back, so the
128  * call-back function does not need to manipulate the references explicitly.
129  */
130 
131 kmem_cache_t *tcp_timercache;
132 
133 static void	tcp_ip_notify(tcp_t *);
134 static void	tcp_timer_callback(void *);
135 static void	tcp_timer_free(tcp_t *, mblk_t *);
136 static void	tcp_timer_handler(void *, mblk_t *, void *, ip_recv_attr_t *);
137 
138 /*
139  * tim is in millisec.
140  */
141 timeout_id_t
142 tcp_timeout(conn_t *connp, void (*f)(void *), hrtime_t tim)
143 {
144 	mblk_t *mp;
145 	tcp_timer_t *tcpt;
146 	tcp_t *tcp = connp->conn_tcp;
147 
148 	ASSERT(connp->conn_sqp != NULL);
149 
150 	TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls);
151 
152 	if (tcp->tcp_timercache == NULL) {
153 		mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC);
154 	} else {
155 		TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc);
156 		mp = tcp->tcp_timercache;
157 		tcp->tcp_timercache = mp->b_next;
158 		mp->b_next = NULL;
159 		ASSERT(mp->b_wptr == NULL);
160 	}
161 
162 	CONN_INC_REF(connp);
163 	tcpt = (tcp_timer_t *)mp->b_rptr;
164 	tcpt->connp = connp;
165 	tcpt->tcpt_proc = f;
166 	/*
167 	 * TCP timers are normal timeouts. Plus, they do not require more than
168 	 * a 10 millisecond resolution. By choosing a coarser resolution and by
169 	 * rounding up the expiration to the next resolution boundary, we can
170 	 * batch timers in the callout subsystem to make TCP timers more
171 	 * efficient. The roundup also protects short timers from expiring too
172 	 * early before they have a chance to be cancelled.
173 	 */
174 	tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp,
175 	    tim * MICROSEC, CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
176 
177 	return ((timeout_id_t)mp);
178 }
179 
180 static void
181 tcp_timer_callback(void *arg)
182 {
183 	mblk_t *mp = (mblk_t *)arg;
184 	tcp_timer_t *tcpt;
185 	conn_t	*connp;
186 
187 	tcpt = (tcp_timer_t *)mp->b_rptr;
188 	connp = tcpt->connp;
189 	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp,
190 	    NULL, SQ_FILL, SQTAG_TCP_TIMER);
191 }
192 
193 /* ARGSUSED */
194 static void
195 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
196 {
197 	tcp_timer_t *tcpt;
198 	conn_t *connp = (conn_t *)arg;
199 	tcp_t *tcp = connp->conn_tcp;
200 
201 	tcpt = (tcp_timer_t *)mp->b_rptr;
202 	ASSERT(connp == tcpt->connp);
203 	ASSERT((squeue_t *)arg2 == connp->conn_sqp);
204 
205 	/*
206 	 * If the TCP has reached the closed state, don't proceed any
207 	 * further. This TCP logically does not exist on the system.
208 	 * tcpt_proc could for example access queues, that have already
209 	 * been qprocoff'ed off.
210 	 */
211 	if (tcp->tcp_state != TCPS_CLOSED) {
212 		(*tcpt->tcpt_proc)(connp);
213 	} else {
214 		tcp->tcp_timer_tid = 0;
215 	}
216 	tcp_timer_free(connp->conn_tcp, mp);
217 }
218 
219 /*
220  * There is potential race with untimeout and the handler firing at the same
221  * time. The mblock may be freed by the handler while we are trying to use
222  * it. But since both should execute on the same squeue, this race should not
223  * occur.
224  */
225 clock_t
226 tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
227 {
228 	mblk_t	*mp = (mblk_t *)id;
229 	tcp_timer_t *tcpt;
230 	clock_t delta;
231 
232 	TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs);
233 
234 	if (mp == NULL)
235 		return (-1);
236 
237 	tcpt = (tcp_timer_t *)mp->b_rptr;
238 	ASSERT(tcpt->connp == connp);
239 
240 	delta = untimeout_default(tcpt->tcpt_tid, 0);
241 
242 	if (delta >= 0) {
243 		TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled);
244 		tcp_timer_free(connp->conn_tcp, mp);
245 		CONN_DEC_REF(connp);
246 	}
247 
248 	return (TICK_TO_MSEC(delta));
249 }
250 
251 /*
252  * Allocate space for the timer event. The allocation looks like mblk, but it is
253  * not a proper mblk. To avoid confusion we set b_wptr to NULL.
254  *
255  * Dealing with failures: If we can't allocate from the timer cache we try
256  * allocating from dblock caches using allocb_tryhard(). In this case b_wptr
257  * points to b_rptr.
258  * If we can't allocate anything using allocb_tryhard(), we perform a last
259  * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and
260  * save the actual allocation size in b_datap.
261  */
262 mblk_t *
263 tcp_timermp_alloc(int kmflags)
264 {
265 	mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache,
266 	    kmflags & ~KM_PANIC);
267 
268 	if (mp != NULL) {
269 		mp->b_next = mp->b_prev = NULL;
270 		mp->b_rptr = (uchar_t *)(&mp[1]);
271 		mp->b_wptr = NULL;
272 		mp->b_datap = NULL;
273 		mp->b_queue = NULL;
274 		mp->b_cont = NULL;
275 	} else if (kmflags & KM_PANIC) {
276 		/*
277 		 * Failed to allocate memory for the timer. Try allocating from
278 		 * dblock caches.
279 		 */
280 		/* ipclassifier calls this from a constructor - hence no tcps */
281 		TCP_G_STAT(tcp_timermp_allocfail);
282 		mp = allocb_tryhard(sizeof (tcp_timer_t));
283 		if (mp == NULL) {
284 			size_t size = 0;
285 			/*
286 			 * Memory is really low. Try tryhard allocation.
287 			 *
288 			 * ipclassifier calls this from a constructor -
289 			 * hence no tcps
290 			 */
291 			TCP_G_STAT(tcp_timermp_allocdblfail);
292 			mp = kmem_alloc_tryhard(sizeof (mblk_t) +
293 			    sizeof (tcp_timer_t), &size, kmflags);
294 			mp->b_rptr = (uchar_t *)(&mp[1]);
295 			mp->b_next = mp->b_prev = NULL;
296 			mp->b_wptr = (uchar_t *)-1;
297 			mp->b_datap = (dblk_t *)size;
298 			mp->b_queue = NULL;
299 			mp->b_cont = NULL;
300 		}
301 		ASSERT(mp->b_wptr != NULL);
302 	}
303 	/* ipclassifier calls this from a constructor - hence no tcps */
304 	TCP_G_DBGSTAT(tcp_timermp_alloced);
305 
306 	return (mp);
307 }
308 
309 /*
310  * Free per-tcp timer cache.
311  * It can only contain entries from tcp_timercache.
312  */
313 void
314 tcp_timermp_free(tcp_t *tcp)
315 {
316 	mblk_t *mp;
317 
318 	while ((mp = tcp->tcp_timercache) != NULL) {
319 		ASSERT(mp->b_wptr == NULL);
320 		tcp->tcp_timercache = tcp->tcp_timercache->b_next;
321 		kmem_cache_free(tcp_timercache, mp);
322 	}
323 }
324 
325 /*
326  * Free timer event. Put it on the per-tcp timer cache if there is not too many
327  * events there already (currently at most two events are cached).
328  * If the event is not allocated from the timer cache, free it right away.
329  */
330 static void
331 tcp_timer_free(tcp_t *tcp, mblk_t *mp)
332 {
333 	mblk_t *mp1 = tcp->tcp_timercache;
334 
335 	if (mp->b_wptr != NULL) {
336 		/*
337 		 * This allocation is not from a timer cache, free it right
338 		 * away.
339 		 */
340 		if (mp->b_wptr != (uchar_t *)-1)
341 			freeb(mp);
342 		else
343 			kmem_free(mp, (size_t)mp->b_datap);
344 	} else if (mp1 == NULL || mp1->b_next == NULL) {
345 		/* Cache this timer block for future allocations */
346 		mp->b_rptr = (uchar_t *)(&mp[1]);
347 		mp->b_next = mp1;
348 		tcp->tcp_timercache = mp;
349 	} else {
350 		kmem_cache_free(tcp_timercache, mp);
351 		TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed);
352 	}
353 }
354 
355 /*
356  * Stop all TCP timers.
357  */
358 void
359 tcp_timers_stop(tcp_t *tcp)
360 {
361 	if (tcp->tcp_timer_tid != 0) {
362 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
363 		tcp->tcp_timer_tid = 0;
364 	}
365 	if (tcp->tcp_ka_tid != 0) {
366 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
367 		tcp->tcp_ka_tid = 0;
368 	}
369 	if (tcp->tcp_ack_tid != 0) {
370 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
371 		tcp->tcp_ack_tid = 0;
372 	}
373 	if (tcp->tcp_push_tid != 0) {
374 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
375 		tcp->tcp_push_tid = 0;
376 	}
377 	if (tcp->tcp_reass_tid != 0) {
378 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid);
379 		tcp->tcp_reass_tid = 0;
380 	}
381 }
382 
383 /*
384  * Timer callback routine for keepalive probe.  We do a fake resend of
385  * last ACKed byte.  Then set a timer using RTO.  When the timer expires,
386  * check to see if we have heard anything from the other end for the last
387  * RTO period.  If we have, set the timer to expire for another
388  * tcp_keepalive_intrvl and check again.  If we have not, set a timer using
389  * RTO << 1 and check again when it expires.  Keep exponentially increasing
390  * the timeout if we have not heard from the other side.  If for more than
391  * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
392  * kill the connection unless the keepalive abort threshold is 0.  In
393  * that case, we will probe "forever."
394  * If tcp_ka_cnt and tcp_ka_rinterval are non-zero, then we do not follow
395  * the exponential backoff, but send probes tcp_ka_cnt times in regular
396  * intervals of tcp_ka_rinterval milliseconds until we hear back from peer.
397  * Kill the connection if we don't hear back from peer after tcp_ka_cnt
398  * probes are sent.
399  */
400 void
401 tcp_keepalive_timer(void *arg)
402 {
403 	mblk_t	*mp;
404 	conn_t	*connp = (conn_t *)arg;
405 	tcp_t  	*tcp = connp->conn_tcp;
406 	int32_t	firetime;
407 	int32_t	idletime;
408 	int32_t	ka_intrvl;
409 	tcp_stack_t	*tcps = tcp->tcp_tcps;
410 
411 	tcp->tcp_ka_tid = 0;
412 
413 	if (tcp->tcp_fused)
414 		return;
415 
416 	TCPS_BUMP_MIB(tcps, tcpTimKeepalive);
417 	ka_intrvl = tcp->tcp_ka_interval;
418 
419 	/*
420 	 * Keepalive probe should only be sent if the application has not
421 	 * done a close on the connection.
422 	 */
423 	if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
424 		return;
425 	}
426 	/* Timer fired too early, restart it. */
427 	if (tcp->tcp_state < TCPS_ESTABLISHED) {
428 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
429 		    ka_intrvl);
430 		return;
431 	}
432 
433 	idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time);
434 	/*
435 	 * If we have not heard from the other side for a long
436 	 * time, kill the connection unless the keepalive abort
437 	 * threshold is 0.  In that case, we will probe "forever."
438 	 */
439 	if (tcp->tcp_ka_abort_thres != 0 &&
440 	    idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
441 		TCPS_BUMP_MIB(tcps, tcpTimKeepaliveDrop);
442 		(void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
443 		    tcp->tcp_client_errno : ETIMEDOUT);
444 		return;
445 	}
446 
447 	if (tcp->tcp_snxt == tcp->tcp_suna &&
448 	    idletime >= ka_intrvl) {
449 		/* Fake resend of last ACKed byte. */
450 		mblk_t	*mp1 = allocb(1, BPRI_LO);
451 
452 		if (mp1 != NULL) {
453 			*mp1->b_wptr++ = '\0';
454 			mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
455 			    tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
456 			freeb(mp1);
457 			/*
458 			 * if allocation failed, fall through to start the
459 			 * timer back.
460 			 */
461 			if (mp != NULL) {
462 				tcp_send_data(tcp, mp);
463 				TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe);
464 				if (tcp->tcp_ka_rinterval) {
465 					firetime = tcp->tcp_ka_rinterval;
466 				} else if (tcp->tcp_ka_last_intrvl != 0) {
467 					int max;
468 					/*
469 					 * We should probe again at least
470 					 * in ka_intrvl, but not more than
471 					 * tcp_rto_max.
472 					 */
473 					max = tcp->tcp_rto_max;
474 					firetime = MIN(ka_intrvl - 1,
475 					    tcp->tcp_ka_last_intrvl << 1);
476 					if (firetime > max)
477 						firetime = max;
478 				} else {
479 					firetime = tcp->tcp_rto;
480 				}
481 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
482 				    tcp_keepalive_timer, firetime);
483 				tcp->tcp_ka_last_intrvl = firetime;
484 				return;
485 			}
486 		}
487 	} else {
488 		tcp->tcp_ka_last_intrvl = 0;
489 	}
490 
491 	/* firetime can be negative if (mp1 == NULL || mp == NULL) */
492 	if ((firetime = ka_intrvl - idletime) < 0) {
493 		firetime = ka_intrvl;
494 	}
495 	tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, firetime);
496 }
497 
498 void
499 tcp_reass_timer(void *arg)
500 {
501 	conn_t *connp = (conn_t *)arg;
502 	tcp_t *tcp = connp->conn_tcp;
503 
504 	tcp->tcp_reass_tid = 0;
505 	if (tcp->tcp_reass_head == NULL)
506 		return;
507 	ASSERT(tcp->tcp_reass_tail != NULL);
508 	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
509 		tcp_sack_remove(tcp->tcp_sack_list,
510 		    TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk);
511 	}
512 	tcp_close_mpp(&tcp->tcp_reass_head);
513 	tcp->tcp_reass_tail = NULL;
514 	TCP_STAT(tcp->tcp_tcps, tcp_reass_timeout);
515 }
516 
517 /* This function handles the push timeout. */
518 void
519 tcp_push_timer(void *arg)
520 {
521 	conn_t	*connp = (conn_t *)arg;
522 	tcp_t *tcp = connp->conn_tcp;
523 
524 	TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt);
525 
526 	ASSERT(tcp->tcp_listener == NULL);
527 
528 	ASSERT(!IPCL_IS_NONSTR(connp));
529 
530 	tcp->tcp_push_tid = 0;
531 
532 	if (tcp->tcp_rcv_list != NULL &&
533 	    tcp_rcv_drain(tcp) == TH_ACK_NEEDED)
534 		tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
535 }
536 
537 /*
538  * This function handles delayed ACK timeout.
539  */
540 void
541 tcp_ack_timer(void *arg)
542 {
543 	conn_t	*connp = (conn_t *)arg;
544 	tcp_t *tcp = connp->conn_tcp;
545 	mblk_t *mp;
546 	tcp_stack_t	*tcps = tcp->tcp_tcps;
547 
548 	TCP_DBGSTAT(tcps, tcp_ack_timer_cnt);
549 
550 	tcp->tcp_ack_tid = 0;
551 
552 	if (tcp->tcp_fused)
553 		return;
554 
555 	/*
556 	 * Do not send ACK if there is no outstanding unack'ed data.
557 	 */
558 	if (tcp->tcp_rnxt == tcp->tcp_rack) {
559 		return;
560 	}
561 
562 	if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) {
563 		/*
564 		 * Make sure we don't allow deferred ACKs to result in
565 		 * timer-based ACKing.  If we have held off an ACK
566 		 * when there was more than an mss here, and the timer
567 		 * goes off, we have to worry about the possibility
568 		 * that the sender isn't doing slow-start, or is out
569 		 * of step with us for some other reason.  We fall
570 		 * permanently back in the direction of
571 		 * ACK-every-other-packet as suggested in RFC 1122.
572 		 */
573 		if (tcp->tcp_rack_abs_max > 2)
574 			tcp->tcp_rack_abs_max--;
575 		tcp->tcp_rack_cur_max = 2;
576 	}
577 	mp = tcp_ack_mp(tcp);
578 
579 	if (mp != NULL) {
580 		BUMP_LOCAL(tcp->tcp_obsegs);
581 		TCPS_BUMP_MIB(tcps, tcpOutAck);
582 		TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
583 		tcp_send_data(tcp, mp);
584 	}
585 }
586 
587 /*
588  * Notify IP that we are having trouble with this connection.  IP should
589  * make note so it can potentially use a different IRE.
590  */
591 static void
592 tcp_ip_notify(tcp_t *tcp)
593 {
594 	conn_t		*connp = tcp->tcp_connp;
595 	ire_t		*ire;
596 
597 	/*
598 	 * Note: in the case of source routing we want to blow away the
599 	 * route to the first source route hop.
600 	 */
601 	ire = connp->conn_ixa->ixa_ire;
602 	if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
603 		if (ire->ire_ipversion == IPV4_VERSION) {
604 			/*
605 			 * As per RFC 1122, we send an RTM_LOSING to inform
606 			 * routing protocols.
607 			 */
608 			ip_rts_change(RTM_LOSING, ire->ire_addr,
609 			    ire->ire_gateway_addr, ire->ire_mask,
610 			    connp->conn_laddr_v4,  0, 0, 0,
611 			    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
612 			    ire->ire_ipst);
613 		}
614 		(void) ire_no_good(ire);
615 	}
616 }
617 
618 /*
619  * tcp_timer is the timer service routine.  It handles the retransmission,
620  * FIN_WAIT_2 flush, and zero window probe timeout events.  It figures out
621  * from the state of the tcp instance what kind of action needs to be done
622  * at the time it is called.
623  */
624 void
625 tcp_timer(void *arg)
626 {
627 	mblk_t		*mp;
628 	clock_t		first_threshold;
629 	clock_t		second_threshold;
630 	clock_t		ms;
631 	uint32_t	mss;
632 	conn_t		*connp = (conn_t *)arg;
633 	tcp_t		*tcp = connp->conn_tcp;
634 	tcp_stack_t	*tcps = tcp->tcp_tcps;
635 	boolean_t	dont_timeout = B_FALSE;
636 
637 	tcp->tcp_timer_tid = 0;
638 
639 	if (tcp->tcp_fused)
640 		return;
641 
642 	first_threshold =  tcp->tcp_first_timer_threshold;
643 	second_threshold = tcp->tcp_second_timer_threshold;
644 	switch (tcp->tcp_state) {
645 	case TCPS_IDLE:
646 	case TCPS_BOUND:
647 	case TCPS_LISTEN:
648 		return;
649 	case TCPS_SYN_RCVD: {
650 		tcp_t	*listener = tcp->tcp_listener;
651 
652 		if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
653 			/* it's our first timeout */
654 			tcp->tcp_syn_rcvd_timeout = 1;
655 			mutex_enter(&listener->tcp_eager_lock);
656 			listener->tcp_syn_rcvd_timeout++;
657 			if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) {
658 				/*
659 				 * Make this eager available for drop if we
660 				 * need to drop one to accomodate a new
661 				 * incoming SYN request.
662 				 */
663 				MAKE_DROPPABLE(listener, tcp);
664 			}
665 			if (!listener->tcp_syn_defense &&
666 			    (listener->tcp_syn_rcvd_timeout >
667 			    (tcps->tcps_conn_req_max_q0 >> 2)) &&
668 			    (tcps->tcps_conn_req_max_q0 > 200)) {
669 				/* We may be under attack. Put on a defense. */
670 				listener->tcp_syn_defense = B_TRUE;
671 				cmn_err(CE_WARN, "High TCP connect timeout "
672 				    "rate! System (port %d) may be under a "
673 				    "SYN flood attack!",
674 				    ntohs(listener->tcp_connp->conn_lport));
675 
676 				listener->tcp_ip_addr_cache = kmem_zalloc(
677 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
678 				    KM_NOSLEEP);
679 			}
680 			mutex_exit(&listener->tcp_eager_lock);
681 		} else if (listener != NULL) {
682 			mutex_enter(&listener->tcp_eager_lock);
683 			tcp->tcp_syn_rcvd_timeout++;
684 			if (tcp->tcp_syn_rcvd_timeout > 1 &&
685 			    !tcp->tcp_closemp_used) {
686 				/*
687 				 * This is our second timeout. Put the tcp in
688 				 * the list of droppable eagers to allow it to
689 				 * be dropped, if needed. We don't check
690 				 * whether tcp_dontdrop is set or not to
691 				 * protect ourselve from a SYN attack where a
692 				 * remote host can spoof itself as one of the
693 				 * good IP source and continue to hold
694 				 * resources too long.
695 				 */
696 				MAKE_DROPPABLE(listener, tcp);
697 			}
698 			mutex_exit(&listener->tcp_eager_lock);
699 		}
700 	}
701 		/* FALLTHRU */
702 	case TCPS_SYN_SENT:
703 		first_threshold =  tcp->tcp_first_ctimer_threshold;
704 		second_threshold = tcp->tcp_second_ctimer_threshold;
705 
706 		/*
707 		 * If an app has set the second_threshold to 0, it means that
708 		 * we need to retransmit forever, unless this is a passive
709 		 * open.  We need to set second_threshold back to a normal
710 		 * value such that later comparison with it still makes
711 		 * sense.  But we set dont_timeout to B_TRUE so that we will
712 		 * never time out.
713 		 */
714 		if (second_threshold == 0) {
715 			second_threshold = tcps->tcps_ip_abort_linterval;
716 			if (tcp->tcp_active_open)
717 				dont_timeout = B_TRUE;
718 		}
719 		break;
720 	case TCPS_ESTABLISHED:
721 	case TCPS_CLOSE_WAIT:
722 		/*
723 		 * If the end point has not been closed, TCP can retransmit
724 		 * forever.  But if the end point is closed, the normal
725 		 * timeout applies.
726 		 */
727 		if (second_threshold == 0) {
728 			second_threshold = tcps->tcps_ip_abort_linterval;
729 			dont_timeout = B_TRUE;
730 		}
731 		/* FALLTHRU */
732 	case TCPS_FIN_WAIT_1:
733 	case TCPS_CLOSING:
734 	case TCPS_LAST_ACK:
735 		/* If we have data to rexmit */
736 		if (tcp->tcp_suna != tcp->tcp_snxt) {
737 			clock_t	time_to_wait;
738 
739 			TCPS_BUMP_MIB(tcps, tcpTimRetrans);
740 			if (!tcp->tcp_xmit_head)
741 				break;
742 			time_to_wait = ddi_get_lbolt() -
743 			    (clock_t)tcp->tcp_xmit_head->b_prev;
744 			time_to_wait = tcp->tcp_rto -
745 			    TICK_TO_MSEC(time_to_wait);
746 			/*
747 			 * If the timer fires too early, 1 clock tick earlier,
748 			 * restart the timer.
749 			 */
750 			if (time_to_wait > msec_per_tick) {
751 				TCP_STAT(tcps, tcp_timer_fire_early);
752 				TCP_TIMER_RESTART(tcp, time_to_wait);
753 				return;
754 			}
755 			/*
756 			 * When we probe zero windows, we force the swnd open.
757 			 * If our peer acks with a closed window swnd will be
758 			 * set to zero by tcp_rput(). As long as we are
759 			 * receiving acks tcp_rput will
760 			 * reset 'tcp_ms_we_have_waited' so as not to trip the
761 			 * first and second interval actions.  NOTE: the timer
762 			 * interval is allowed to continue its exponential
763 			 * backoff.
764 			 */
765 			if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
766 				if (connp->conn_debug) {
767 					(void) strlog(TCP_MOD_ID, 0, 1,
768 					    SL_TRACE, "tcp_timer: zero win");
769 				}
770 			} else {
771 				/*
772 				 * After retransmission, we need to do
773 				 * slow start.  Set the ssthresh to one
774 				 * half of current effective window and
775 				 * cwnd to one MSS.  Also reset
776 				 * tcp_cwnd_cnt.
777 				 *
778 				 * Note that if tcp_ssthresh is reduced because
779 				 * of ECN, do not reduce it again unless it is
780 				 * already one window of data away (tcp_cwr
781 				 * should then be cleared) or this is a
782 				 * timeout for a retransmitted segment.
783 				 */
784 				uint32_t npkt;
785 
786 				if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
787 					npkt = ((tcp->tcp_timer_backoff ?
788 					    tcp->tcp_cwnd_ssthresh :
789 					    tcp->tcp_snxt -
790 					    tcp->tcp_suna) >> 1) / tcp->tcp_mss;
791 					tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
792 					    tcp->tcp_mss;
793 				}
794 				tcp->tcp_cwnd = tcp->tcp_mss;
795 				tcp->tcp_cwnd_cnt = 0;
796 				if (tcp->tcp_ecn_ok) {
797 					tcp->tcp_cwr = B_TRUE;
798 					tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
799 					tcp->tcp_ecn_cwr_sent = B_FALSE;
800 				}
801 			}
802 			break;
803 		}
804 		/*
805 		 * We have something to send yet we cannot send.  The
806 		 * reason can be:
807 		 *
808 		 * 1. Zero send window: we need to do zero window probe.
809 		 * 2. Zero cwnd: because of ECN, we need to "clock out
810 		 * segments.
811 		 * 3. SWS avoidance: receiver may have shrunk window,
812 		 * reset our knowledge.
813 		 *
814 		 * Note that condition 2 can happen with either 1 or
815 		 * 3.  But 1 and 3 are exclusive.
816 		 */
817 		if (tcp->tcp_unsent != 0) {
818 			/*
819 			 * Should not hold the zero-copy messages for too long.
820 			 */
821 			if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
822 				tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
823 				    tcp->tcp_xmit_head, B_TRUE);
824 
825 			if (tcp->tcp_cwnd == 0) {
826 				/*
827 				 * Set tcp_cwnd to 1 MSS so that a
828 				 * new segment can be sent out.  We
829 				 * are "clocking out" new data when
830 				 * the network is really congested.
831 				 */
832 				ASSERT(tcp->tcp_ecn_ok);
833 				tcp->tcp_cwnd = tcp->tcp_mss;
834 			}
835 			if (tcp->tcp_swnd == 0) {
836 				/* Extend window for zero window probe */
837 				tcp->tcp_swnd++;
838 				tcp->tcp_zero_win_probe = B_TRUE;
839 				TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
840 			} else {
841 				/*
842 				 * Handle timeout from sender SWS avoidance.
843 				 * Reset our knowledge of the max send window
844 				 * since the receiver might have reduced its
845 				 * receive buffer.  Avoid setting tcp_max_swnd
846 				 * to one since that will essentially disable
847 				 * the SWS checks.
848 				 *
849 				 * Note that since we don't have a SWS
850 				 * state variable, if the timeout is set
851 				 * for ECN but not for SWS, this
852 				 * code will also be executed.  This is
853 				 * fine as tcp_max_swnd is updated
854 				 * constantly and it will not affect
855 				 * anything.
856 				 */
857 				tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
858 			}
859 			tcp_wput_data(tcp, NULL, B_FALSE);
860 			return;
861 		}
862 		/* Is there a FIN that needs to be to re retransmitted? */
863 		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
864 		    !tcp->tcp_fin_acked)
865 			break;
866 		/* Nothing to do, return without restarting timer. */
867 		TCP_STAT(tcps, tcp_timer_fire_miss);
868 		return;
869 	case TCPS_FIN_WAIT_2:
870 		/*
871 		 * User closed the TCP endpoint and peer ACK'ed our FIN.
872 		 * We waited some time for for peer's FIN, but it hasn't
873 		 * arrived.  We flush the connection now to avoid
874 		 * case where the peer has rebooted.
875 		 */
876 		if (TCP_IS_DETACHED(tcp)) {
877 			(void) tcp_clean_death(tcp, 0);
878 		} else {
879 			TCP_TIMER_RESTART(tcp,
880 			    tcp->tcp_fin_wait_2_flush_interval);
881 		}
882 		return;
883 	case TCPS_TIME_WAIT:
884 		(void) tcp_clean_death(tcp, 0);
885 		return;
886 	default:
887 		if (connp->conn_debug) {
888 			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
889 			    "tcp_timer: strange state (%d) %s",
890 			    tcp->tcp_state, tcp_display(tcp, NULL,
891 			    DISP_PORT_ONLY));
892 		}
893 		return;
894 	}
895 
896 	/*
897 	 * If the system is under memory pressure or the max number of
898 	 * connections have been established for the listener, be more
899 	 * aggressive in aborting connections.
900 	 */
901 	if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL &&
902 	    tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) {
903 		second_threshold = tcp_early_abort * SECONDS;
904 
905 		/* We will ignore the never timeout promise in this case... */
906 		dont_timeout = B_FALSE;
907 	}
908 
909 	ASSERT(second_threshold != 0);
910 
911 	if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
912 		/*
913 		 * Should not hold the zero-copy messages for too long.
914 		 */
915 		if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
916 			tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
917 			    tcp->tcp_xmit_head, B_TRUE);
918 
919 		if (dont_timeout) {
920 			/*
921 			 * Reset tcp_ms_we_have_waited to avoid overflow since
922 			 * we are going to retransmit forever.
923 			 */
924 			tcp->tcp_ms_we_have_waited = second_threshold;
925 			goto timer_rexmit;
926 		}
927 
928 		/*
929 		 * For zero window probe, we need to send indefinitely,
930 		 * unless we have not heard from the other side for some
931 		 * time...
932 		 */
933 		if ((tcp->tcp_zero_win_probe == 0) ||
934 		    (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) >
935 		    second_threshold)) {
936 			TCPS_BUMP_MIB(tcps, tcpTimRetransDrop);
937 			/*
938 			 * If TCP is in SYN_RCVD state, send back a
939 			 * RST|ACK as BSD does.  Note that tcp_zero_win_probe
940 			 * should be zero in TCPS_SYN_RCVD state.
941 			 */
942 			if (tcp->tcp_state == TCPS_SYN_RCVD) {
943 				tcp_xmit_ctl("tcp_timer: RST sent on timeout "
944 				    "in SYN_RCVD",
945 				    tcp, tcp->tcp_snxt,
946 				    tcp->tcp_rnxt, TH_RST | TH_ACK);
947 			}
948 			(void) tcp_clean_death(tcp,
949 			    tcp->tcp_client_errno ?
950 			    tcp->tcp_client_errno : ETIMEDOUT);
951 			return;
952 		} else {
953 			/*
954 			 * If the system is under memory pressure, we also
955 			 * abort connection in zero window probing.
956 			 */
957 			if (tcps->tcps_reclaim) {
958 				(void) tcp_clean_death(tcp,
959 				    tcp->tcp_client_errno ?
960 				    tcp->tcp_client_errno : ETIMEDOUT);
961 				TCP_STAT(tcps, tcp_zwin_mem_drop);
962 				return;
963 			}
964 			/*
965 			 * Set tcp_ms_we_have_waited to second_threshold
966 			 * so that in next timeout, we will do the above
967 			 * check (ddi_get_lbolt() - tcp_last_recv_time).
968 			 * This is also to avoid overflow.
969 			 *
970 			 * We don't need to decrement tcp_timer_backoff
971 			 * to avoid overflow because it will be decremented
972 			 * later if new timeout value is greater than
973 			 * tcp_rto_max.  In the case when tcp_rto_max is
974 			 * greater than second_threshold, it means that we
975 			 * will wait longer than second_threshold to send
976 			 * the next
977 			 * window probe.
978 			 */
979 			tcp->tcp_ms_we_have_waited = second_threshold;
980 		}
981 	} else if (ms > first_threshold) {
982 		/*
983 		 * Should not hold the zero-copy messages for too long.
984 		 */
985 		if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
986 			tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
987 			    tcp->tcp_xmit_head, B_TRUE);
988 
989 		/*
990 		 * We have been retransmitting for too long...  The RTT
991 		 * we calculated is probably incorrect.  Reinitialize it.
992 		 * Need to compensate for 0 tcp_rtt_sa.  Reset
993 		 * tcp_rtt_update so that we won't accidentally cache a
994 		 * bad value.  But only do this if this is not a zero
995 		 * window probe.
996 		 */
997 		if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
998 			tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
999 			    (tcp->tcp_rtt_sa >> 5);
1000 			tcp->tcp_rtt_sa = 0;
1001 			tcp_ip_notify(tcp);
1002 			tcp->tcp_rtt_update = 0;
1003 		}
1004 	}
1005 
1006 timer_rexmit:
1007 	tcp->tcp_timer_backoff++;
1008 	if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
1009 	    tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
1010 	    tcp->tcp_rto_min) {
1011 		/*
1012 		 * This means the computed RTO is below tcp_rto_min.
1013 		 * So we will use tcp_rto_min as the RTO value
1014 		 * and do the backoff.
1015 		 */
1016 		ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
1017 	} else {
1018 		ms <<= tcp->tcp_timer_backoff;
1019 	}
1020 	if (ms > tcp->tcp_rto_max) {
1021 		ms = tcp->tcp_rto_max;
1022 		/*
1023 		 * ms is at max, decrement tcp_timer_backoff to avoid
1024 		 * overflow.
1025 		 */
1026 		tcp->tcp_timer_backoff--;
1027 	}
1028 	tcp->tcp_ms_we_have_waited += ms;
1029 	if (tcp->tcp_zero_win_probe == 0) {
1030 		tcp->tcp_rto = ms;
1031 	}
1032 	TCP_TIMER_RESTART(tcp, ms);
1033 	/*
1034 	 * This is after a timeout and tcp_rto is backed off.  Set
1035 	 * tcp_set_timer to 1 so that next time RTO is updated, we will
1036 	 * restart the timer with a correct value.
1037 	 */
1038 	tcp->tcp_set_timer = 1;
1039 	mss = tcp->tcp_snxt - tcp->tcp_suna;
1040 	if (mss > tcp->tcp_mss)
1041 		mss = tcp->tcp_mss;
1042 	if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
1043 		mss = tcp->tcp_swnd;
1044 
1045 	if ((mp = tcp->tcp_xmit_head) != NULL)
1046 		mp->b_prev = (mblk_t *)ddi_get_lbolt();
1047 	mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
1048 	    B_TRUE);
1049 
1050 	/*
1051 	 * When slow start after retransmission begins, start with
1052 	 * this seq no.  tcp_rexmit_max marks the end of special slow
1053 	 * start phase.  tcp_snd_burst controls how many segments
1054 	 * can be sent because of an ack.
1055 	 */
1056 	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
1057 	tcp->tcp_snd_burst = TCP_CWND_SS;
1058 	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
1059 	    (tcp->tcp_unsent == 0)) {
1060 		tcp->tcp_rexmit_max = tcp->tcp_fss;
1061 	} else {
1062 		tcp->tcp_rexmit_max = tcp->tcp_snxt;
1063 	}
1064 	tcp->tcp_rexmit = B_TRUE;
1065 	tcp->tcp_dupack_cnt = 0;
1066 
1067 	/*
1068 	 * Remove all rexmit SACK blocks to start afresh.
1069 	 */
1070 	if (tcp->tcp_snd_sack_ok)
1071 		TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
1072 	if (mp == NULL) {
1073 		return;
1074 	}
1075 
1076 	tcp->tcp_csuna = tcp->tcp_snxt;
1077 	TCPS_BUMP_MIB(tcps, tcpRetransSegs);
1078 	TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
1079 	tcp_send_data(tcp, mp);
1080 
1081 }
1082 
1083 /*
1084  * Handle lingering timeouts. This function is called when the SO_LINGER timeout
1085  * expires.
1086  */
1087 void
1088 tcp_close_linger_timeout(void *arg)
1089 {
1090 	conn_t	*connp = (conn_t *)arg;
1091 	tcp_t 	*tcp = connp->conn_tcp;
1092 
1093 	tcp->tcp_client_errno = ETIMEDOUT;
1094 	tcp_stop_lingering(tcp);
1095 }
1096