xref: /titanic_51/usr/src/uts/common/inet/tcp/tcp_timers.c (revision 188eaed9d5f14c73dfba1cd0dabaa430bdfd4a9a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/strlog.h>
28 #include <sys/strsun.h>
29 #include <sys/squeue_impl.h>
30 #include <sys/squeue.h>
31 #include <sys/callo.h>
32 #include <sys/strsubr.h>
33 
34 #include <inet/common.h>
35 #include <inet/ip.h>
36 #include <inet/ip_ire.h>
37 #include <inet/ip_rts.h>
38 #include <inet/tcp.h>
39 #include <inet/tcp_impl.h>
40 
41 /*
42  * Implementation of TCP Timers.
43  * =============================
44  *
45  * INTERFACE:
46  *
47  * There are two basic functions dealing with tcp timers:
48  *
49  *	timeout_id_t	tcp_timeout(connp, func, time)
50  * 	clock_t		tcp_timeout_cancel(connp, timeout_id)
51  *	TCP_TIMER_RESTART(tcp, intvl)
52  *
53  * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
54  * after 'time' milliseconds have passed. The function called by timeout()
55  * must adhere to the same restrictions as a driver soft interrupt handler -
56  * it must not sleep or call other functions that might sleep. The value
57  * returned is the opaque non-zero timeout identifier that can be passed to
58  * tcp_timeout_cancel() to cancel the request. The call to tcp_timeout() may
59  * fail in which case it returns zero. This is different from the timeout(9F)
60  * function which never fails.
61  *
62  * The call-back function 'func' always receives 'connp' as its single
63  * argument. It is always executed in the squeue corresponding to the tcp
64  * structure. The tcp structure is guaranteed to be present at the time the
65  * call-back is called.
66  *
67  * NOTE: The call-back function 'func' is never called if tcp is in
68  * 	the TCPS_CLOSED state.
69  *
70  * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
71  * request. Locks acquired by the call-back routine should not be held across
72  * the call to tcp_timeout_cancel() or a deadlock may result.
73  *
74  * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
75  * Otherwise, it returns an integer value greater than or equal to 0. In
76  * particular, if the call-back function is already placed on the squeue, it can
77  * not be canceled.
78  *
79  * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
80  * 	within squeue context corresponding to the tcp instance. Since the
81  *	call-back is also called via the same squeue, there are no race
82  *	conditions described in untimeout(9F) manual page since all calls are
83  *	strictly serialized.
84  *
85  *      TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
86  *	stored in tcp_timer_tid and starts a new one using
87  *	MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
88  *	and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
89  *	field.
90  *
91  * NOTE: since the timeout cancellation is not guaranteed, the cancelled
92  *	call-back may still be called, so it is possible tcp_timer() will be
93  *	called several times. This should not be a problem since tcp_timer()
94  *	should always check the tcp instance state.
95  *
96  *
97  * IMPLEMENTATION:
98  *
99  * TCP timers are implemented using a three-stage process. The call to
100  * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
101  * when the timer expires. The tcp_timer_callback() arranges the call of the
102  * tcp_timer_handler() function via squeue corresponding to the tcp
103  * instance. The tcp_timer_handler() calls actual requested timeout call-back
104  * and passes tcp instance as an argument to it. Information is passed between
105  * stages using the tcp_timer_t structure which contains the connp pointer, the
106  * tcp call-back to call and the timeout id returned by the timeout(9F).
107  *
108  * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
109  * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
110  * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
111  * returns the pointer to this mblk.
112  *
113  * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
114  * looks like a normal mblk without actual dblk attached to it.
115  *
116  * To optimize performance each tcp instance holds a small cache of timer
117  * mblocks. In the current implementation it caches up to two timer mblocks per
118  * tcp instance. The cache is preserved over tcp frees and is only freed when
119  * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
120  * timer processing happens on a corresponding squeue, the cache manipulation
121  * does not require any locks. Experiments show that majority of timer mblocks
122  * allocations are satisfied from the tcp cache and do not involve kmem calls.
123  *
124  * The tcp_timeout() places a refhold on the connp instance which guarantees
125  * that it will be present at the time the call-back function fires. The
126  * tcp_timer_handler() drops the reference after calling the call-back, so the
127  * call-back function does not need to manipulate the references explicitly.
128  */
129 
130 kmem_cache_t *tcp_timercache;
131 
132 static void	tcp_ip_notify(tcp_t *);
133 static void	tcp_timer_callback(void *);
134 static void	tcp_timer_free(tcp_t *, mblk_t *);
135 static void	tcp_timer_handler(void *, mblk_t *, void *, ip_recv_attr_t *);
136 
137 /*
138  * tim is in millisec.
139  */
140 timeout_id_t
141 tcp_timeout(conn_t *connp, void (*f)(void *), hrtime_t tim)
142 {
143 	mblk_t *mp;
144 	tcp_timer_t *tcpt;
145 	tcp_t *tcp = connp->conn_tcp;
146 
147 	ASSERT(connp->conn_sqp != NULL);
148 
149 	TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls);
150 
151 	if (tcp->tcp_timercache == NULL) {
152 		mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC);
153 	} else {
154 		TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc);
155 		mp = tcp->tcp_timercache;
156 		tcp->tcp_timercache = mp->b_next;
157 		mp->b_next = NULL;
158 		ASSERT(mp->b_wptr == NULL);
159 	}
160 
161 	CONN_INC_REF(connp);
162 	tcpt = (tcp_timer_t *)mp->b_rptr;
163 	tcpt->connp = connp;
164 	tcpt->tcpt_proc = f;
165 	/*
166 	 * TCP timers are normal timeouts. Plus, they do not require more than
167 	 * a 10 millisecond resolution. By choosing a coarser resolution and by
168 	 * rounding up the expiration to the next resolution boundary, we can
169 	 * batch timers in the callout subsystem to make TCP timers more
170 	 * efficient. The roundup also protects short timers from expiring too
171 	 * early before they have a chance to be cancelled.
172 	 */
173 	tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp,
174 	    tim * MICROSEC, CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
175 
176 	return ((timeout_id_t)mp);
177 }
178 
179 static void
180 tcp_timer_callback(void *arg)
181 {
182 	mblk_t *mp = (mblk_t *)arg;
183 	tcp_timer_t *tcpt;
184 	conn_t	*connp;
185 
186 	tcpt = (tcp_timer_t *)mp->b_rptr;
187 	connp = tcpt->connp;
188 	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp,
189 	    NULL, SQ_FILL, SQTAG_TCP_TIMER);
190 }
191 
192 /* ARGSUSED */
193 static void
194 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
195 {
196 	tcp_timer_t *tcpt;
197 	conn_t *connp = (conn_t *)arg;
198 	tcp_t *tcp = connp->conn_tcp;
199 
200 	tcpt = (tcp_timer_t *)mp->b_rptr;
201 	ASSERT(connp == tcpt->connp);
202 	ASSERT((squeue_t *)arg2 == connp->conn_sqp);
203 
204 	/*
205 	 * If the TCP has reached the closed state, don't proceed any
206 	 * further. This TCP logically does not exist on the system.
207 	 * tcpt_proc could for example access queues, that have already
208 	 * been qprocoff'ed off.
209 	 */
210 	if (tcp->tcp_state != TCPS_CLOSED) {
211 		(*tcpt->tcpt_proc)(connp);
212 	} else {
213 		tcp->tcp_timer_tid = 0;
214 	}
215 	tcp_timer_free(connp->conn_tcp, mp);
216 }
217 
218 /*
219  * There is potential race with untimeout and the handler firing at the same
220  * time. The mblock may be freed by the handler while we are trying to use
221  * it. But since both should execute on the same squeue, this race should not
222  * occur.
223  */
224 clock_t
225 tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
226 {
227 	mblk_t	*mp = (mblk_t *)id;
228 	tcp_timer_t *tcpt;
229 	clock_t delta;
230 
231 	TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs);
232 
233 	if (mp == NULL)
234 		return (-1);
235 
236 	tcpt = (tcp_timer_t *)mp->b_rptr;
237 	ASSERT(tcpt->connp == connp);
238 
239 	delta = untimeout_default(tcpt->tcpt_tid, 0);
240 
241 	if (delta >= 0) {
242 		TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled);
243 		tcp_timer_free(connp->conn_tcp, mp);
244 		CONN_DEC_REF(connp);
245 	}
246 
247 	return (TICK_TO_MSEC(delta));
248 }
249 
250 /*
251  * Allocate space for the timer event. The allocation looks like mblk, but it is
252  * not a proper mblk. To avoid confusion we set b_wptr to NULL.
253  *
254  * Dealing with failures: If we can't allocate from the timer cache we try
255  * allocating from dblock caches using allocb_tryhard(). In this case b_wptr
256  * points to b_rptr.
257  * If we can't allocate anything using allocb_tryhard(), we perform a last
258  * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and
259  * save the actual allocation size in b_datap.
260  */
261 mblk_t *
262 tcp_timermp_alloc(int kmflags)
263 {
264 	mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache,
265 	    kmflags & ~KM_PANIC);
266 
267 	if (mp != NULL) {
268 		mp->b_next = mp->b_prev = NULL;
269 		mp->b_rptr = (uchar_t *)(&mp[1]);
270 		mp->b_wptr = NULL;
271 		mp->b_datap = NULL;
272 		mp->b_queue = NULL;
273 		mp->b_cont = NULL;
274 	} else if (kmflags & KM_PANIC) {
275 		/*
276 		 * Failed to allocate memory for the timer. Try allocating from
277 		 * dblock caches.
278 		 */
279 		/* ipclassifier calls this from a constructor - hence no tcps */
280 		TCP_G_STAT(tcp_timermp_allocfail);
281 		mp = allocb_tryhard(sizeof (tcp_timer_t));
282 		if (mp == NULL) {
283 			size_t size = 0;
284 			/*
285 			 * Memory is really low. Try tryhard allocation.
286 			 *
287 			 * ipclassifier calls this from a constructor -
288 			 * hence no tcps
289 			 */
290 			TCP_G_STAT(tcp_timermp_allocdblfail);
291 			mp = kmem_alloc_tryhard(sizeof (mblk_t) +
292 			    sizeof (tcp_timer_t), &size, kmflags);
293 			mp->b_rptr = (uchar_t *)(&mp[1]);
294 			mp->b_next = mp->b_prev = NULL;
295 			mp->b_wptr = (uchar_t *)-1;
296 			mp->b_datap = (dblk_t *)size;
297 			mp->b_queue = NULL;
298 			mp->b_cont = NULL;
299 		}
300 		ASSERT(mp->b_wptr != NULL);
301 	}
302 	/* ipclassifier calls this from a constructor - hence no tcps */
303 	TCP_G_DBGSTAT(tcp_timermp_alloced);
304 
305 	return (mp);
306 }
307 
308 /*
309  * Free per-tcp timer cache.
310  * It can only contain entries from tcp_timercache.
311  */
312 void
313 tcp_timermp_free(tcp_t *tcp)
314 {
315 	mblk_t *mp;
316 
317 	while ((mp = tcp->tcp_timercache) != NULL) {
318 		ASSERT(mp->b_wptr == NULL);
319 		tcp->tcp_timercache = tcp->tcp_timercache->b_next;
320 		kmem_cache_free(tcp_timercache, mp);
321 	}
322 }
323 
324 /*
325  * Free timer event. Put it on the per-tcp timer cache if there is not too many
326  * events there already (currently at most two events are cached).
327  * If the event is not allocated from the timer cache, free it right away.
328  */
329 static void
330 tcp_timer_free(tcp_t *tcp, mblk_t *mp)
331 {
332 	mblk_t *mp1 = tcp->tcp_timercache;
333 
334 	if (mp->b_wptr != NULL) {
335 		/*
336 		 * This allocation is not from a timer cache, free it right
337 		 * away.
338 		 */
339 		if (mp->b_wptr != (uchar_t *)-1)
340 			freeb(mp);
341 		else
342 			kmem_free(mp, (size_t)mp->b_datap);
343 	} else if (mp1 == NULL || mp1->b_next == NULL) {
344 		/* Cache this timer block for future allocations */
345 		mp->b_rptr = (uchar_t *)(&mp[1]);
346 		mp->b_next = mp1;
347 		tcp->tcp_timercache = mp;
348 	} else {
349 		kmem_cache_free(tcp_timercache, mp);
350 		TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed);
351 	}
352 }
353 
354 /*
355  * Stop all TCP timers.
356  */
357 void
358 tcp_timers_stop(tcp_t *tcp)
359 {
360 	if (tcp->tcp_timer_tid != 0) {
361 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
362 		tcp->tcp_timer_tid = 0;
363 	}
364 	if (tcp->tcp_ka_tid != 0) {
365 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
366 		tcp->tcp_ka_tid = 0;
367 	}
368 	if (tcp->tcp_ack_tid != 0) {
369 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
370 		tcp->tcp_ack_tid = 0;
371 	}
372 	if (tcp->tcp_push_tid != 0) {
373 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
374 		tcp->tcp_push_tid = 0;
375 	}
376 	if (tcp->tcp_reass_tid != 0) {
377 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid);
378 		tcp->tcp_reass_tid = 0;
379 	}
380 }
381 
382 /*
383  * Timer callback routine for keepalive probe.  We do a fake resend of
384  * last ACKed byte.  Then set a timer using RTO.  When the timer expires,
385  * check to see if we have heard anything from the other end for the last
386  * RTO period.  If we have, set the timer to expire for another
387  * tcp_keepalive_intrvl and check again.  If we have not, set a timer using
388  * RTO << 1 and check again when it expires.  Keep exponentially increasing
389  * the timeout if we have not heard from the other side.  If for more than
390  * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
391  * kill the connection unless the keepalive abort threshold is 0.  In
392  * that case, we will probe "forever."
393  */
394 void
395 tcp_keepalive_timer(void *arg)
396 {
397 	mblk_t	*mp;
398 	conn_t	*connp = (conn_t *)arg;
399 	tcp_t  	*tcp = connp->conn_tcp;
400 	int32_t	firetime;
401 	int32_t	idletime;
402 	int32_t	ka_intrvl;
403 	tcp_stack_t	*tcps = tcp->tcp_tcps;
404 
405 	tcp->tcp_ka_tid = 0;
406 
407 	if (tcp->tcp_fused)
408 		return;
409 
410 	TCPS_BUMP_MIB(tcps, tcpTimKeepalive);
411 	ka_intrvl = tcp->tcp_ka_interval;
412 
413 	/*
414 	 * Keepalive probe should only be sent if the application has not
415 	 * done a close on the connection.
416 	 */
417 	if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
418 		return;
419 	}
420 	/* Timer fired too early, restart it. */
421 	if (tcp->tcp_state < TCPS_ESTABLISHED) {
422 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
423 		    ka_intrvl);
424 		return;
425 	}
426 
427 	idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time);
428 	/*
429 	 * If we have not heard from the other side for a long
430 	 * time, kill the connection unless the keepalive abort
431 	 * threshold is 0.  In that case, we will probe "forever."
432 	 */
433 	if (tcp->tcp_ka_abort_thres != 0 &&
434 	    idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
435 		TCPS_BUMP_MIB(tcps, tcpTimKeepaliveDrop);
436 		(void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
437 		    tcp->tcp_client_errno : ETIMEDOUT);
438 		return;
439 	}
440 
441 	if (tcp->tcp_snxt == tcp->tcp_suna &&
442 	    idletime >= ka_intrvl) {
443 		/* Fake resend of last ACKed byte. */
444 		mblk_t	*mp1 = allocb(1, BPRI_LO);
445 
446 		if (mp1 != NULL) {
447 			*mp1->b_wptr++ = '\0';
448 			mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
449 			    tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
450 			freeb(mp1);
451 			/*
452 			 * if allocation failed, fall through to start the
453 			 * timer back.
454 			 */
455 			if (mp != NULL) {
456 				tcp_send_data(tcp, mp);
457 				TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe);
458 				if (tcp->tcp_ka_last_intrvl != 0) {
459 					int max;
460 					/*
461 					 * We should probe again at least
462 					 * in ka_intrvl, but not more than
463 					 * tcp_rto_max.
464 					 */
465 					max = tcp->tcp_rto_max;
466 					firetime = MIN(ka_intrvl - 1,
467 					    tcp->tcp_ka_last_intrvl << 1);
468 					if (firetime > max)
469 						firetime = max;
470 				} else {
471 					firetime = tcp->tcp_rto;
472 				}
473 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
474 				    tcp_keepalive_timer, firetime);
475 				tcp->tcp_ka_last_intrvl = firetime;
476 				return;
477 			}
478 		}
479 	} else {
480 		tcp->tcp_ka_last_intrvl = 0;
481 	}
482 
483 	/* firetime can be negative if (mp1 == NULL || mp == NULL) */
484 	if ((firetime = ka_intrvl - idletime) < 0) {
485 		firetime = ka_intrvl;
486 	}
487 	tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, firetime);
488 }
489 
490 void
491 tcp_reass_timer(void *arg)
492 {
493 	conn_t *connp = (conn_t *)arg;
494 	tcp_t *tcp = connp->conn_tcp;
495 
496 	tcp->tcp_reass_tid = 0;
497 	if (tcp->tcp_reass_head == NULL)
498 		return;
499 	ASSERT(tcp->tcp_reass_tail != NULL);
500 	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
501 		tcp_sack_remove(tcp->tcp_sack_list,
502 		    TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk);
503 	}
504 	tcp_close_mpp(&tcp->tcp_reass_head);
505 	tcp->tcp_reass_tail = NULL;
506 	TCP_STAT(tcp->tcp_tcps, tcp_reass_timeout);
507 }
508 
509 /* This function handles the push timeout. */
510 void
511 tcp_push_timer(void *arg)
512 {
513 	conn_t	*connp = (conn_t *)arg;
514 	tcp_t *tcp = connp->conn_tcp;
515 
516 	TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt);
517 
518 	ASSERT(tcp->tcp_listener == NULL);
519 
520 	ASSERT(!IPCL_IS_NONSTR(connp));
521 
522 	tcp->tcp_push_tid = 0;
523 
524 	if (tcp->tcp_rcv_list != NULL &&
525 	    tcp_rcv_drain(tcp) == TH_ACK_NEEDED)
526 		tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
527 }
528 
529 /*
530  * This function handles delayed ACK timeout.
531  */
532 void
533 tcp_ack_timer(void *arg)
534 {
535 	conn_t	*connp = (conn_t *)arg;
536 	tcp_t *tcp = connp->conn_tcp;
537 	mblk_t *mp;
538 	tcp_stack_t	*tcps = tcp->tcp_tcps;
539 
540 	TCP_DBGSTAT(tcps, tcp_ack_timer_cnt);
541 
542 	tcp->tcp_ack_tid = 0;
543 
544 	if (tcp->tcp_fused)
545 		return;
546 
547 	/*
548 	 * Do not send ACK if there is no outstanding unack'ed data.
549 	 */
550 	if (tcp->tcp_rnxt == tcp->tcp_rack) {
551 		return;
552 	}
553 
554 	if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) {
555 		/*
556 		 * Make sure we don't allow deferred ACKs to result in
557 		 * timer-based ACKing.  If we have held off an ACK
558 		 * when there was more than an mss here, and the timer
559 		 * goes off, we have to worry about the possibility
560 		 * that the sender isn't doing slow-start, or is out
561 		 * of step with us for some other reason.  We fall
562 		 * permanently back in the direction of
563 		 * ACK-every-other-packet as suggested in RFC 1122.
564 		 */
565 		if (tcp->tcp_rack_abs_max > 2)
566 			tcp->tcp_rack_abs_max--;
567 		tcp->tcp_rack_cur_max = 2;
568 	}
569 	mp = tcp_ack_mp(tcp);
570 
571 	if (mp != NULL) {
572 		BUMP_LOCAL(tcp->tcp_obsegs);
573 		TCPS_BUMP_MIB(tcps, tcpOutAck);
574 		TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
575 		tcp_send_data(tcp, mp);
576 	}
577 }
578 
579 /*
580  * Notify IP that we are having trouble with this connection.  IP should
581  * make note so it can potentially use a different IRE.
582  */
583 static void
584 tcp_ip_notify(tcp_t *tcp)
585 {
586 	conn_t		*connp = tcp->tcp_connp;
587 	ire_t		*ire;
588 
589 	/*
590 	 * Note: in the case of source routing we want to blow away the
591 	 * route to the first source route hop.
592 	 */
593 	ire = connp->conn_ixa->ixa_ire;
594 	if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
595 		if (ire->ire_ipversion == IPV4_VERSION) {
596 			/*
597 			 * As per RFC 1122, we send an RTM_LOSING to inform
598 			 * routing protocols.
599 			 */
600 			ip_rts_change(RTM_LOSING, ire->ire_addr,
601 			    ire->ire_gateway_addr, ire->ire_mask,
602 			    connp->conn_laddr_v4,  0, 0, 0,
603 			    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
604 			    ire->ire_ipst);
605 		}
606 		(void) ire_no_good(ire);
607 	}
608 }
609 
610 /*
611  * tcp_timer is the timer service routine.  It handles the retransmission,
612  * FIN_WAIT_2 flush, and zero window probe timeout events.  It figures out
613  * from the state of the tcp instance what kind of action needs to be done
614  * at the time it is called.
615  */
616 void
617 tcp_timer(void *arg)
618 {
619 	mblk_t		*mp;
620 	clock_t		first_threshold;
621 	clock_t		second_threshold;
622 	clock_t		ms;
623 	uint32_t	mss;
624 	conn_t		*connp = (conn_t *)arg;
625 	tcp_t		*tcp = connp->conn_tcp;
626 	tcp_stack_t	*tcps = tcp->tcp_tcps;
627 	boolean_t	dont_timeout = B_FALSE;
628 
629 	tcp->tcp_timer_tid = 0;
630 
631 	if (tcp->tcp_fused)
632 		return;
633 
634 	first_threshold =  tcp->tcp_first_timer_threshold;
635 	second_threshold = tcp->tcp_second_timer_threshold;
636 	switch (tcp->tcp_state) {
637 	case TCPS_IDLE:
638 	case TCPS_BOUND:
639 	case TCPS_LISTEN:
640 		return;
641 	case TCPS_SYN_RCVD: {
642 		tcp_t	*listener = tcp->tcp_listener;
643 
644 		if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
645 			/* it's our first timeout */
646 			tcp->tcp_syn_rcvd_timeout = 1;
647 			mutex_enter(&listener->tcp_eager_lock);
648 			listener->tcp_syn_rcvd_timeout++;
649 			if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) {
650 				/*
651 				 * Make this eager available for drop if we
652 				 * need to drop one to accomodate a new
653 				 * incoming SYN request.
654 				 */
655 				MAKE_DROPPABLE(listener, tcp);
656 			}
657 			if (!listener->tcp_syn_defense &&
658 			    (listener->tcp_syn_rcvd_timeout >
659 			    (tcps->tcps_conn_req_max_q0 >> 2)) &&
660 			    (tcps->tcps_conn_req_max_q0 > 200)) {
661 				/* We may be under attack. Put on a defense. */
662 				listener->tcp_syn_defense = B_TRUE;
663 				cmn_err(CE_WARN, "High TCP connect timeout "
664 				    "rate! System (port %d) may be under a "
665 				    "SYN flood attack!",
666 				    ntohs(listener->tcp_connp->conn_lport));
667 
668 				listener->tcp_ip_addr_cache = kmem_zalloc(
669 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
670 				    KM_NOSLEEP);
671 			}
672 			mutex_exit(&listener->tcp_eager_lock);
673 		} else if (listener != NULL) {
674 			mutex_enter(&listener->tcp_eager_lock);
675 			tcp->tcp_syn_rcvd_timeout++;
676 			if (tcp->tcp_syn_rcvd_timeout > 1 &&
677 			    !tcp->tcp_closemp_used) {
678 				/*
679 				 * This is our second timeout. Put the tcp in
680 				 * the list of droppable eagers to allow it to
681 				 * be dropped, if needed. We don't check
682 				 * whether tcp_dontdrop is set or not to
683 				 * protect ourselve from a SYN attack where a
684 				 * remote host can spoof itself as one of the
685 				 * good IP source and continue to hold
686 				 * resources too long.
687 				 */
688 				MAKE_DROPPABLE(listener, tcp);
689 			}
690 			mutex_exit(&listener->tcp_eager_lock);
691 		}
692 	}
693 		/* FALLTHRU */
694 	case TCPS_SYN_SENT:
695 		first_threshold =  tcp->tcp_first_ctimer_threshold;
696 		second_threshold = tcp->tcp_second_ctimer_threshold;
697 
698 		/* Retransmit forever unless this is a passive open... */
699 		if (second_threshold == 0) {
700 			if (!tcp->tcp_active_open) {
701 				second_threshold =
702 				    tcps->tcps_ip_abort_linterval;
703 			} else {
704 				dont_timeout = B_TRUE;
705 			}
706 		}
707 		break;
708 	case TCPS_ESTABLISHED:
709 	case TCPS_CLOSE_WAIT:
710 		/*
711 		 * If the end point has not been closed, TCP can retransmit
712 		 * forever.  But if the end point is closed, the normal
713 		 * timeout applies.
714 		 */
715 		if (second_threshold == 0)
716 			dont_timeout = B_TRUE;
717 		/* FALLTHRU */
718 	case TCPS_FIN_WAIT_1:
719 	case TCPS_CLOSING:
720 	case TCPS_LAST_ACK:
721 		/* If we have data to rexmit */
722 		if (tcp->tcp_suna != tcp->tcp_snxt) {
723 			clock_t	time_to_wait;
724 
725 			TCPS_BUMP_MIB(tcps, tcpTimRetrans);
726 			if (!tcp->tcp_xmit_head)
727 				break;
728 			time_to_wait = ddi_get_lbolt() -
729 			    (clock_t)tcp->tcp_xmit_head->b_prev;
730 			time_to_wait = tcp->tcp_rto -
731 			    TICK_TO_MSEC(time_to_wait);
732 			/*
733 			 * If the timer fires too early, 1 clock tick earlier,
734 			 * restart the timer.
735 			 */
736 			if (time_to_wait > msec_per_tick) {
737 				TCP_STAT(tcps, tcp_timer_fire_early);
738 				TCP_TIMER_RESTART(tcp, time_to_wait);
739 				return;
740 			}
741 			/*
742 			 * When we probe zero windows, we force the swnd open.
743 			 * If our peer acks with a closed window swnd will be
744 			 * set to zero by tcp_rput(). As long as we are
745 			 * receiving acks tcp_rput will
746 			 * reset 'tcp_ms_we_have_waited' so as not to trip the
747 			 * first and second interval actions.  NOTE: the timer
748 			 * interval is allowed to continue its exponential
749 			 * backoff.
750 			 */
751 			if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
752 				if (connp->conn_debug) {
753 					(void) strlog(TCP_MOD_ID, 0, 1,
754 					    SL_TRACE, "tcp_timer: zero win");
755 				}
756 			} else {
757 				/*
758 				 * After retransmission, we need to do
759 				 * slow start.  Set the ssthresh to one
760 				 * half of current effective window and
761 				 * cwnd to one MSS.  Also reset
762 				 * tcp_cwnd_cnt.
763 				 *
764 				 * Note that if tcp_ssthresh is reduced because
765 				 * of ECN, do not reduce it again unless it is
766 				 * already one window of data away (tcp_cwr
767 				 * should then be cleared) or this is a
768 				 * timeout for a retransmitted segment.
769 				 */
770 				uint32_t npkt;
771 
772 				if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
773 					npkt = ((tcp->tcp_timer_backoff ?
774 					    tcp->tcp_cwnd_ssthresh :
775 					    tcp->tcp_snxt -
776 					    tcp->tcp_suna) >> 1) / tcp->tcp_mss;
777 					tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
778 					    tcp->tcp_mss;
779 				}
780 				tcp->tcp_cwnd = tcp->tcp_mss;
781 				tcp->tcp_cwnd_cnt = 0;
782 				if (tcp->tcp_ecn_ok) {
783 					tcp->tcp_cwr = B_TRUE;
784 					tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
785 					tcp->tcp_ecn_cwr_sent = B_FALSE;
786 				}
787 			}
788 			break;
789 		}
790 		/*
791 		 * We have something to send yet we cannot send.  The
792 		 * reason can be:
793 		 *
794 		 * 1. Zero send window: we need to do zero window probe.
795 		 * 2. Zero cwnd: because of ECN, we need to "clock out
796 		 * segments.
797 		 * 3. SWS avoidance: receiver may have shrunk window,
798 		 * reset our knowledge.
799 		 *
800 		 * Note that condition 2 can happen with either 1 or
801 		 * 3.  But 1 and 3 are exclusive.
802 		 */
803 		if (tcp->tcp_unsent != 0) {
804 			/*
805 			 * Should not hold the zero-copy messages for too long.
806 			 */
807 			if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
808 				tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
809 				    tcp->tcp_xmit_head, B_TRUE);
810 
811 			if (tcp->tcp_cwnd == 0) {
812 				/*
813 				 * Set tcp_cwnd to 1 MSS so that a
814 				 * new segment can be sent out.  We
815 				 * are "clocking out" new data when
816 				 * the network is really congested.
817 				 */
818 				ASSERT(tcp->tcp_ecn_ok);
819 				tcp->tcp_cwnd = tcp->tcp_mss;
820 			}
821 			if (tcp->tcp_swnd == 0) {
822 				/* Extend window for zero window probe */
823 				tcp->tcp_swnd++;
824 				tcp->tcp_zero_win_probe = B_TRUE;
825 				TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
826 			} else {
827 				/*
828 				 * Handle timeout from sender SWS avoidance.
829 				 * Reset our knowledge of the max send window
830 				 * since the receiver might have reduced its
831 				 * receive buffer.  Avoid setting tcp_max_swnd
832 				 * to one since that will essentially disable
833 				 * the SWS checks.
834 				 *
835 				 * Note that since we don't have a SWS
836 				 * state variable, if the timeout is set
837 				 * for ECN but not for SWS, this
838 				 * code will also be executed.  This is
839 				 * fine as tcp_max_swnd is updated
840 				 * constantly and it will not affect
841 				 * anything.
842 				 */
843 				tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
844 			}
845 			tcp_wput_data(tcp, NULL, B_FALSE);
846 			return;
847 		}
848 		/* Is there a FIN that needs to be to re retransmitted? */
849 		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
850 		    !tcp->tcp_fin_acked)
851 			break;
852 		/* Nothing to do, return without restarting timer. */
853 		TCP_STAT(tcps, tcp_timer_fire_miss);
854 		return;
855 	case TCPS_FIN_WAIT_2:
856 		/*
857 		 * User closed the TCP endpoint and peer ACK'ed our FIN.
858 		 * We waited some time for for peer's FIN, but it hasn't
859 		 * arrived.  We flush the connection now to avoid
860 		 * case where the peer has rebooted.
861 		 */
862 		if (TCP_IS_DETACHED(tcp)) {
863 			(void) tcp_clean_death(tcp, 0);
864 		} else {
865 			TCP_TIMER_RESTART(tcp,
866 			    tcp->tcp_fin_wait_2_flush_interval);
867 		}
868 		return;
869 	case TCPS_TIME_WAIT:
870 		(void) tcp_clean_death(tcp, 0);
871 		return;
872 	default:
873 		if (connp->conn_debug) {
874 			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
875 			    "tcp_timer: strange state (%d) %s",
876 			    tcp->tcp_state, tcp_display(tcp, NULL,
877 			    DISP_PORT_ONLY));
878 		}
879 		return;
880 	}
881 
882 	/*
883 	 * If the system is under memory pressure or the max number of
884 	 * connections have been established for the listener, be more
885 	 * aggressive in aborting connections.
886 	 */
887 	if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL &&
888 	    tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) {
889 		second_threshold = tcp_early_abort * SECONDS;
890 
891 		/* We will ignore the never timeout promise in this case... */
892 		dont_timeout = B_FALSE;
893 	}
894 
895 	if (!dont_timeout && second_threshold == 0)
896 		second_threshold = tcps->tcps_ip_abort_interval;
897 
898 	if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
899 		/*
900 		 * Should not hold the zero-copy messages for too long.
901 		 */
902 		if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
903 			tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
904 			    tcp->tcp_xmit_head, B_TRUE);
905 
906 		if (dont_timeout)
907 			goto timer_rexmit;
908 
909 		/*
910 		 * For zero window probe, we need to send indefinitely,
911 		 * unless we have not heard from the other side for some
912 		 * time...
913 		 */
914 		if ((tcp->tcp_zero_win_probe == 0) ||
915 		    (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) >
916 		    second_threshold)) {
917 			TCPS_BUMP_MIB(tcps, tcpTimRetransDrop);
918 			/*
919 			 * If TCP is in SYN_RCVD state, send back a
920 			 * RST|ACK as BSD does.  Note that tcp_zero_win_probe
921 			 * should be zero in TCPS_SYN_RCVD state.
922 			 */
923 			if (tcp->tcp_state == TCPS_SYN_RCVD) {
924 				tcp_xmit_ctl("tcp_timer: RST sent on timeout "
925 				    "in SYN_RCVD",
926 				    tcp, tcp->tcp_snxt,
927 				    tcp->tcp_rnxt, TH_RST | TH_ACK);
928 			}
929 			(void) tcp_clean_death(tcp,
930 			    tcp->tcp_client_errno ?
931 			    tcp->tcp_client_errno : ETIMEDOUT);
932 			return;
933 		} else {
934 			/*
935 			 * If the system is under memory pressure, we also
936 			 * abort connection in zero window probing.
937 			 */
938 			if (tcps->tcps_reclaim) {
939 				(void) tcp_clean_death(tcp,
940 				    tcp->tcp_client_errno ?
941 				    tcp->tcp_client_errno : ETIMEDOUT);
942 				TCP_STAT(tcps, tcp_zwin_mem_drop);
943 				return;
944 			}
945 			/*
946 			 * Set tcp_ms_we_have_waited to second_threshold
947 			 * so that in next timeout, we will do the above
948 			 * check (ddi_get_lbolt() - tcp_last_recv_time).
949 			 * This is also to avoid overflow.
950 			 *
951 			 * We don't need to decrement tcp_timer_backoff
952 			 * to avoid overflow because it will be decremented
953 			 * later if new timeout value is greater than
954 			 * tcp_rto_max.  In the case when tcp_rto_max is
955 			 * greater than second_threshold, it means that we
956 			 * will wait longer than second_threshold to send
957 			 * the next
958 			 * window probe.
959 			 */
960 			tcp->tcp_ms_we_have_waited = second_threshold;
961 		}
962 	} else if (ms > first_threshold) {
963 		/*
964 		 * Should not hold the zero-copy messages for too long.
965 		 */
966 		if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
967 			tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
968 			    tcp->tcp_xmit_head, B_TRUE);
969 
970 		/*
971 		 * We have been retransmitting for too long...  The RTT
972 		 * we calculated is probably incorrect.  Reinitialize it.
973 		 * Need to compensate for 0 tcp_rtt_sa.  Reset
974 		 * tcp_rtt_update so that we won't accidentally cache a
975 		 * bad value.  But only do this if this is not a zero
976 		 * window probe.
977 		 */
978 		if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
979 			tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
980 			    (tcp->tcp_rtt_sa >> 5);
981 			tcp->tcp_rtt_sa = 0;
982 			tcp_ip_notify(tcp);
983 			tcp->tcp_rtt_update = 0;
984 		}
985 	}
986 
987 timer_rexmit:
988 	tcp->tcp_timer_backoff++;
989 	if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
990 	    tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
991 	    tcp->tcp_rto_min) {
992 		/*
993 		 * This means the original RTO is tcp_rexmit_interval_min.
994 		 * So we will use tcp_rexmit_interval_min as the RTO value
995 		 * and do the backoff.
996 		 */
997 		ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
998 	} else {
999 		ms <<= tcp->tcp_timer_backoff;
1000 	}
1001 	if (ms > tcp->tcp_rto_max) {
1002 		ms = tcp->tcp_rto_max;
1003 		/*
1004 		 * ms is at max, decrement tcp_timer_backoff to avoid
1005 		 * overflow.
1006 		 */
1007 		tcp->tcp_timer_backoff--;
1008 	}
1009 	tcp->tcp_ms_we_have_waited += ms;
1010 	if (tcp->tcp_zero_win_probe == 0) {
1011 		tcp->tcp_rto = ms;
1012 	}
1013 	TCP_TIMER_RESTART(tcp, ms);
1014 	/*
1015 	 * This is after a timeout and tcp_rto is backed off.  Set
1016 	 * tcp_set_timer to 1 so that next time RTO is updated, we will
1017 	 * restart the timer with a correct value.
1018 	 */
1019 	tcp->tcp_set_timer = 1;
1020 	mss = tcp->tcp_snxt - tcp->tcp_suna;
1021 	if (mss > tcp->tcp_mss)
1022 		mss = tcp->tcp_mss;
1023 	if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
1024 		mss = tcp->tcp_swnd;
1025 
1026 	if ((mp = tcp->tcp_xmit_head) != NULL)
1027 		mp->b_prev = (mblk_t *)ddi_get_lbolt();
1028 	mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
1029 	    B_TRUE);
1030 
1031 	/*
1032 	 * When slow start after retransmission begins, start with
1033 	 * this seq no.  tcp_rexmit_max marks the end of special slow
1034 	 * start phase.  tcp_snd_burst controls how many segments
1035 	 * can be sent because of an ack.
1036 	 */
1037 	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
1038 	tcp->tcp_snd_burst = TCP_CWND_SS;
1039 	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
1040 	    (tcp->tcp_unsent == 0)) {
1041 		tcp->tcp_rexmit_max = tcp->tcp_fss;
1042 	} else {
1043 		tcp->tcp_rexmit_max = tcp->tcp_snxt;
1044 	}
1045 	tcp->tcp_rexmit = B_TRUE;
1046 	tcp->tcp_dupack_cnt = 0;
1047 
1048 	/*
1049 	 * Remove all rexmit SACK blk to start from fresh.
1050 	 */
1051 	if (tcp->tcp_snd_sack_ok)
1052 		TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
1053 	if (mp == NULL) {
1054 		return;
1055 	}
1056 
1057 	tcp->tcp_csuna = tcp->tcp_snxt;
1058 	TCPS_BUMP_MIB(tcps, tcpRetransSegs);
1059 	TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
1060 	tcp_send_data(tcp, mp);
1061 
1062 }
1063 
1064 /*
1065  * Handle lingering timeouts. This function is called when the SO_LINGER timeout
1066  * expires.
1067  */
1068 void
1069 tcp_close_linger_timeout(void *arg)
1070 {
1071 	conn_t	*connp = (conn_t *)arg;
1072 	tcp_t 	*tcp = connp->conn_tcp;
1073 
1074 	tcp->tcp_client_errno = ETIMEDOUT;
1075 	tcp_stop_lingering(tcp);
1076 }
1077