1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/strlog.h> 29 #include <sys/strsun.h> 30 #include <sys/squeue_impl.h> 31 #include <sys/squeue.h> 32 #include <sys/callo.h> 33 #include <sys/strsubr.h> 34 35 #include <inet/common.h> 36 #include <inet/ip.h> 37 #include <inet/ip_ire.h> 38 #include <inet/ip_rts.h> 39 #include <inet/tcp.h> 40 #include <inet/tcp_impl.h> 41 42 /* 43 * Implementation of TCP Timers. 44 * ============================= 45 * 46 * INTERFACE: 47 * 48 * There are two basic functions dealing with tcp timers: 49 * 50 * timeout_id_t tcp_timeout(connp, func, time) 51 * clock_t tcp_timeout_cancel(connp, timeout_id) 52 * TCP_TIMER_RESTART(tcp, intvl) 53 * 54 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func' 55 * after 'time' ticks passed. The function called by timeout() must adhere to 56 * the same restrictions as a driver soft interrupt handler - it must not sleep 57 * or call other functions that might sleep. 
The value returned is the opaque 58 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to 59 * cancel the request. The call to tcp_timeout() may fail in which case it 60 * returns zero. This is different from the timeout(9F) function which never 61 * fails. 62 * 63 * The call-back function 'func' always receives 'connp' as its single 64 * argument. It is always executed in the squeue corresponding to the tcp 65 * structure. The tcp structure is guaranteed to be present at the time the 66 * call-back is called. 67 * 68 * NOTE: The call-back function 'func' is never called if tcp is in 69 * the TCPS_CLOSED state. 70 * 71 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout() 72 * request. locks acquired by the call-back routine should not be held across 73 * the call to tcp_timeout_cancel() or a deadlock may result. 74 * 75 * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request. 76 * Otherwise, it returns an integer value greater than or equal to 0. In 77 * particular, if the call-back function is already placed on the squeue, it can 78 * not be canceled. 79 * 80 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called 81 * within squeue context corresponding to the tcp instance. Since the 82 * call-back is also called via the same squeue, there are no race 83 * conditions described in untimeout(9F) manual page since all calls are 84 * strictly serialized. 85 * 86 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout 87 * stored in tcp_timer_tid and starts a new one using 88 * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back 89 * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid 90 * field. 91 * 92 * NOTE: since the timeout cancellation is not guaranteed, the cancelled 93 * call-back may still be called, so it is possible tcp_timer() will be 94 * called several times. 
This should not be a problem since tcp_timer()
 *	 should always check the tcp instance state.
 *
 *
 * IMPLEMENTATION:
 *
 * TCP timers are implemented using a three-stage process. The call to
 * tcp_timeout() uses the timeout(9F) function to call the tcp_timer_callback()
 * function when the timer expires. The tcp_timer_callback() arranges the call
 * of the tcp_timer_handler() function via the squeue corresponding to the tcp
 * instance. The tcp_timer_handler() calls the actual requested timeout
 * call-back and passes the connp as an argument to it. Information is passed
 * between stages using the tcp_timer_t structure which contains the connp
 * pointer, the tcp call-back to call and the timeout id returned by
 * timeout(9F).
 *
 * The tcp_timer_t structure is not used directly; it is embedded in an
 * mblk_t-like structure that is used to enter an squeue. The mp->b_rptr of
 * this pseudo mblk points to the beginning of the tcp_timer_t structure. The
 * tcp_timeout() returns the pointer to this mblk.
 *
 * The pseudo mblk is allocated from a special tcp_timercache kmem cache. It
 * looks like a normal mblk without an actual dblk attached to it.
 *
 * To optimize performance each tcp instance holds a small cache of timer
 * mblocks. In the current implementation it caches up to two timer mblocks per
 * tcp instance. The cache is preserved over tcp frees and is only freed when
 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
 * timer processing happens on a corresponding squeue, the cache manipulation
 * does not require any locks. Experiments show that the majority of timer
 * mblock allocations are satisfied from the tcp cache and do not involve kmem
 * calls.
 *
 * The tcp_timeout() places a refhold on the connp instance which guarantees
 * that it will be present at the time the call-back function fires.
The 127 * tcp_timer_handler() drops the reference after calling the call-back, so the 128 * call-back function does not need to manipulate the references explicitly. 129 */ 130 131 kmem_cache_t *tcp_timercache; 132 133 static void tcp_ip_notify(tcp_t *); 134 static void tcp_timer_callback(void *); 135 static void tcp_timer_free(tcp_t *, mblk_t *); 136 static void tcp_timer_handler(void *, mblk_t *, void *, ip_recv_attr_t *); 137 138 /* 139 * tim is in millisec. 140 */ 141 timeout_id_t 142 tcp_timeout(conn_t *connp, void (*f)(void *), hrtime_t tim) 143 { 144 mblk_t *mp; 145 tcp_timer_t *tcpt; 146 tcp_t *tcp = connp->conn_tcp; 147 148 ASSERT(connp->conn_sqp != NULL); 149 150 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls); 151 152 if (tcp->tcp_timercache == NULL) { 153 mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC); 154 } else { 155 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc); 156 mp = tcp->tcp_timercache; 157 tcp->tcp_timercache = mp->b_next; 158 mp->b_next = NULL; 159 ASSERT(mp->b_wptr == NULL); 160 } 161 162 CONN_INC_REF(connp); 163 tcpt = (tcp_timer_t *)mp->b_rptr; 164 tcpt->connp = connp; 165 tcpt->tcpt_proc = f; 166 /* 167 * TCP timers are normal timeouts. Plus, they do not require more than 168 * a 10 millisecond resolution. By choosing a coarser resolution and by 169 * rounding up the expiration to the next resolution boundary, we can 170 * batch timers in the callout subsystem to make TCP timers more 171 * efficient. The roundup also protects short timers from expiring too 172 * early before they have a chance to be cancelled. 
173 */ 174 tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp, 175 tim * MICROSEC, CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); 176 177 return ((timeout_id_t)mp); 178 } 179 180 static void 181 tcp_timer_callback(void *arg) 182 { 183 mblk_t *mp = (mblk_t *)arg; 184 tcp_timer_t *tcpt; 185 conn_t *connp; 186 187 tcpt = (tcp_timer_t *)mp->b_rptr; 188 connp = tcpt->connp; 189 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp, 190 NULL, SQ_FILL, SQTAG_TCP_TIMER); 191 } 192 193 /* ARGSUSED */ 194 static void 195 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 196 { 197 tcp_timer_t *tcpt; 198 conn_t *connp = (conn_t *)arg; 199 tcp_t *tcp = connp->conn_tcp; 200 201 tcpt = (tcp_timer_t *)mp->b_rptr; 202 ASSERT(connp == tcpt->connp); 203 ASSERT((squeue_t *)arg2 == connp->conn_sqp); 204 205 /* 206 * If the TCP has reached the closed state, don't proceed any 207 * further. This TCP logically does not exist on the system. 208 * tcpt_proc could for example access queues, that have already 209 * been qprocoff'ed off. 210 */ 211 if (tcp->tcp_state != TCPS_CLOSED) { 212 (*tcpt->tcpt_proc)(connp); 213 } else { 214 tcp->tcp_timer_tid = 0; 215 } 216 tcp_timer_free(connp->conn_tcp, mp); 217 } 218 219 /* 220 * There is potential race with untimeout and the handler firing at the same 221 * time. The mblock may be freed by the handler while we are trying to use 222 * it. But since both should execute on the same squeue, this race should not 223 * occur. 
224 */ 225 clock_t 226 tcp_timeout_cancel(conn_t *connp, timeout_id_t id) 227 { 228 mblk_t *mp = (mblk_t *)id; 229 tcp_timer_t *tcpt; 230 clock_t delta; 231 232 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs); 233 234 if (mp == NULL) 235 return (-1); 236 237 tcpt = (tcp_timer_t *)mp->b_rptr; 238 ASSERT(tcpt->connp == connp); 239 240 delta = untimeout_default(tcpt->tcpt_tid, 0); 241 242 if (delta >= 0) { 243 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled); 244 tcp_timer_free(connp->conn_tcp, mp); 245 CONN_DEC_REF(connp); 246 } 247 248 return (TICK_TO_MSEC(delta)); 249 } 250 251 /* 252 * Allocate space for the timer event. The allocation looks like mblk, but it is 253 * not a proper mblk. To avoid confusion we set b_wptr to NULL. 254 * 255 * Dealing with failures: If we can't allocate from the timer cache we try 256 * allocating from dblock caches using allocb_tryhard(). In this case b_wptr 257 * points to b_rptr. 258 * If we can't allocate anything using allocb_tryhard(), we perform a last 259 * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and 260 * save the actual allocation size in b_datap. 261 */ 262 mblk_t * 263 tcp_timermp_alloc(int kmflags) 264 { 265 mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache, 266 kmflags & ~KM_PANIC); 267 268 if (mp != NULL) { 269 mp->b_next = mp->b_prev = NULL; 270 mp->b_rptr = (uchar_t *)(&mp[1]); 271 mp->b_wptr = NULL; 272 mp->b_datap = NULL; 273 mp->b_queue = NULL; 274 mp->b_cont = NULL; 275 } else if (kmflags & KM_PANIC) { 276 /* 277 * Failed to allocate memory for the timer. Try allocating from 278 * dblock caches. 279 */ 280 /* ipclassifier calls this from a constructor - hence no tcps */ 281 TCP_G_STAT(tcp_timermp_allocfail); 282 mp = allocb_tryhard(sizeof (tcp_timer_t)); 283 if (mp == NULL) { 284 size_t size = 0; 285 /* 286 * Memory is really low. Try tryhard allocation. 
287 * 288 * ipclassifier calls this from a constructor - 289 * hence no tcps 290 */ 291 TCP_G_STAT(tcp_timermp_allocdblfail); 292 mp = kmem_alloc_tryhard(sizeof (mblk_t) + 293 sizeof (tcp_timer_t), &size, kmflags); 294 mp->b_rptr = (uchar_t *)(&mp[1]); 295 mp->b_next = mp->b_prev = NULL; 296 mp->b_wptr = (uchar_t *)-1; 297 mp->b_datap = (dblk_t *)size; 298 mp->b_queue = NULL; 299 mp->b_cont = NULL; 300 } 301 ASSERT(mp->b_wptr != NULL); 302 } 303 /* ipclassifier calls this from a constructor - hence no tcps */ 304 TCP_G_DBGSTAT(tcp_timermp_alloced); 305 306 return (mp); 307 } 308 309 /* 310 * Free per-tcp timer cache. 311 * It can only contain entries from tcp_timercache. 312 */ 313 void 314 tcp_timermp_free(tcp_t *tcp) 315 { 316 mblk_t *mp; 317 318 while ((mp = tcp->tcp_timercache) != NULL) { 319 ASSERT(mp->b_wptr == NULL); 320 tcp->tcp_timercache = tcp->tcp_timercache->b_next; 321 kmem_cache_free(tcp_timercache, mp); 322 } 323 } 324 325 /* 326 * Free timer event. Put it on the per-tcp timer cache if there is not too many 327 * events there already (currently at most two events are cached). 328 * If the event is not allocated from the timer cache, free it right away. 329 */ 330 static void 331 tcp_timer_free(tcp_t *tcp, mblk_t *mp) 332 { 333 mblk_t *mp1 = tcp->tcp_timercache; 334 335 if (mp->b_wptr != NULL) { 336 /* 337 * This allocation is not from a timer cache, free it right 338 * away. 339 */ 340 if (mp->b_wptr != (uchar_t *)-1) 341 freeb(mp); 342 else 343 kmem_free(mp, (size_t)mp->b_datap); 344 } else if (mp1 == NULL || mp1->b_next == NULL) { 345 /* Cache this timer block for future allocations */ 346 mp->b_rptr = (uchar_t *)(&mp[1]); 347 mp->b_next = mp1; 348 tcp->tcp_timercache = mp; 349 } else { 350 kmem_cache_free(tcp_timercache, mp); 351 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed); 352 } 353 } 354 355 /* 356 * Stop all TCP timers. 
*/
void
tcp_timers_stop(tcp_t *tcp)
{
	/*
	 * For each timer id kept in the tcp: if one is recorded, request
	 * cancellation and clear the field. The result of the cancel is
	 * deliberately ignored; the id is forgotten either way.
	 */

	/* tcp_timer() timer */
	if (tcp->tcp_timer_tid != 0) {
		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
		tcp->tcp_timer_tid = 0;
	}
	/* keepalive timer */
	if (tcp->tcp_ka_tid != 0) {
		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
		tcp->tcp_ka_tid = 0;
	}
	/* delayed ACK timer */
	if (tcp->tcp_ack_tid != 0) {
		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
		tcp->tcp_ack_tid = 0;
	}
	/* push timer */
	if (tcp->tcp_push_tid != 0) {
		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
		tcp->tcp_push_tid = 0;
	}
	/* reassembly timer */
	if (tcp->tcp_reass_tid != 0) {
		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid);
		tcp->tcp_reass_tid = 0;
	}
}

/*
 * Timer callback routine for keepalive probe. We do a fake resend of
 * last ACKed byte. Then set a timer using RTO. When the timer expires,
 * check to see if we have heard anything from the other end for the last
 * RTO period. If we have, set the timer to expire for another
 * tcp_keepalive_intrvl and check again. If we have not, set a timer using
 * RTO << 1 and check again when it expires. Keep exponentially increasing
 * the timeout if we have not heard from the other side. If for more than
 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
 * kill the connection unless the keepalive abort threshold is 0. In
 * that case, we will probe "forever."
 * If tcp_ka_cnt and tcp_ka_rinterval are non-zero, then we do not follow
 * the exponential backoff, but send probes tcp_ka_cnt times in regular
 * intervals of tcp_ka_rinterval milliseconds until we hear back from peer.
 * Kill the connection if we don't hear back from peer after tcp_ka_cnt
 * probes are sent.
399 */ 400 void 401 tcp_keepalive_timer(void *arg) 402 { 403 mblk_t *mp; 404 conn_t *connp = (conn_t *)arg; 405 tcp_t *tcp = connp->conn_tcp; 406 int32_t firetime; 407 int32_t idletime; 408 int32_t ka_intrvl; 409 tcp_stack_t *tcps = tcp->tcp_tcps; 410 411 tcp->tcp_ka_tid = 0; 412 413 if (tcp->tcp_fused) 414 return; 415 416 TCPS_BUMP_MIB(tcps, tcpTimKeepalive); 417 ka_intrvl = tcp->tcp_ka_interval; 418 419 /* 420 * Keepalive probe should only be sent if the application has not 421 * done a close on the connection. 422 */ 423 if (tcp->tcp_state > TCPS_CLOSE_WAIT) { 424 return; 425 } 426 /* Timer fired too early, restart it. */ 427 if (tcp->tcp_state < TCPS_ESTABLISHED) { 428 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 429 ka_intrvl); 430 return; 431 } 432 433 idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time); 434 /* 435 * If we have not heard from the other side for a long 436 * time, kill the connection unless the keepalive abort 437 * threshold is 0. In that case, we will probe "forever." 438 */ 439 if (tcp->tcp_ka_abort_thres != 0 && 440 idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) { 441 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveDrop); 442 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ? 443 tcp->tcp_client_errno : ETIMEDOUT); 444 return; 445 } 446 447 if (tcp->tcp_snxt == tcp->tcp_suna && 448 idletime >= ka_intrvl) { 449 /* Fake resend of last ACKed byte. */ 450 mblk_t *mp1 = allocb(1, BPRI_LO); 451 452 if (mp1 != NULL) { 453 *mp1->b_wptr++ = '\0'; 454 mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL, 455 tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE); 456 freeb(mp1); 457 /* 458 * if allocation failed, fall through to start the 459 * timer back. 
460 */ 461 if (mp != NULL) { 462 tcp_send_data(tcp, mp); 463 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe); 464 if (tcp->tcp_ka_rinterval) { 465 firetime = tcp->tcp_ka_rinterval; 466 } else if (tcp->tcp_ka_last_intrvl != 0) { 467 int max; 468 /* 469 * We should probe again at least 470 * in ka_intrvl, but not more than 471 * tcp_rto_max. 472 */ 473 max = tcp->tcp_rto_max; 474 firetime = MIN(ka_intrvl - 1, 475 tcp->tcp_ka_last_intrvl << 1); 476 if (firetime > max) 477 firetime = max; 478 } else { 479 firetime = tcp->tcp_rto; 480 } 481 tcp->tcp_ka_tid = TCP_TIMER(tcp, 482 tcp_keepalive_timer, firetime); 483 tcp->tcp_ka_last_intrvl = firetime; 484 return; 485 } 486 } 487 } else { 488 tcp->tcp_ka_last_intrvl = 0; 489 } 490 491 /* firetime can be negative if (mp1 == NULL || mp == NULL) */ 492 if ((firetime = ka_intrvl - idletime) < 0) { 493 firetime = ka_intrvl; 494 } 495 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, firetime); 496 } 497 498 void 499 tcp_reass_timer(void *arg) 500 { 501 conn_t *connp = (conn_t *)arg; 502 tcp_t *tcp = connp->conn_tcp; 503 504 tcp->tcp_reass_tid = 0; 505 if (tcp->tcp_reass_head == NULL) 506 return; 507 ASSERT(tcp->tcp_reass_tail != NULL); 508 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 509 tcp_sack_remove(tcp->tcp_sack_list, 510 TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk); 511 } 512 tcp_close_mpp(&tcp->tcp_reass_head); 513 tcp->tcp_reass_tail = NULL; 514 TCP_STAT(tcp->tcp_tcps, tcp_reass_timeout); 515 } 516 517 /* This function handles the push timeout. 
*/ 518 void 519 tcp_push_timer(void *arg) 520 { 521 conn_t *connp = (conn_t *)arg; 522 tcp_t *tcp = connp->conn_tcp; 523 524 TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt); 525 526 ASSERT(tcp->tcp_listener == NULL); 527 528 ASSERT(!IPCL_IS_NONSTR(connp)); 529 530 tcp->tcp_push_tid = 0; 531 532 if (tcp->tcp_rcv_list != NULL && 533 tcp_rcv_drain(tcp) == TH_ACK_NEEDED) 534 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 535 } 536 537 /* 538 * This function handles delayed ACK timeout. 539 */ 540 void 541 tcp_ack_timer(void *arg) 542 { 543 conn_t *connp = (conn_t *)arg; 544 tcp_t *tcp = connp->conn_tcp; 545 mblk_t *mp; 546 tcp_stack_t *tcps = tcp->tcp_tcps; 547 548 TCP_DBGSTAT(tcps, tcp_ack_timer_cnt); 549 550 tcp->tcp_ack_tid = 0; 551 552 if (tcp->tcp_fused) 553 return; 554 555 /* 556 * Do not send ACK if there is no outstanding unack'ed data. 557 */ 558 if (tcp->tcp_rnxt == tcp->tcp_rack) { 559 return; 560 } 561 562 if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) { 563 /* 564 * Make sure we don't allow deferred ACKs to result in 565 * timer-based ACKing. If we have held off an ACK 566 * when there was more than an mss here, and the timer 567 * goes off, we have to worry about the possibility 568 * that the sender isn't doing slow-start, or is out 569 * of step with us for some other reason. We fall 570 * permanently back in the direction of 571 * ACK-every-other-packet as suggested in RFC 1122. 572 */ 573 if (tcp->tcp_rack_abs_max > 2) 574 tcp->tcp_rack_abs_max--; 575 tcp->tcp_rack_cur_max = 2; 576 } 577 mp = tcp_ack_mp(tcp); 578 579 if (mp != NULL) { 580 BUMP_LOCAL(tcp->tcp_obsegs); 581 TCPS_BUMP_MIB(tcps, tcpOutAck); 582 TCPS_BUMP_MIB(tcps, tcpOutAckDelayed); 583 tcp_send_data(tcp, mp); 584 } 585 } 586 587 /* 588 * Notify IP that we are having trouble with this connection. IP should 589 * make note so it can potentially use a different IRE. 
590 */ 591 static void 592 tcp_ip_notify(tcp_t *tcp) 593 { 594 conn_t *connp = tcp->tcp_connp; 595 ire_t *ire; 596 597 /* 598 * Note: in the case of source routing we want to blow away the 599 * route to the first source route hop. 600 */ 601 ire = connp->conn_ixa->ixa_ire; 602 if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 603 if (ire->ire_ipversion == IPV4_VERSION) { 604 /* 605 * As per RFC 1122, we send an RTM_LOSING to inform 606 * routing protocols. 607 */ 608 ip_rts_change(RTM_LOSING, ire->ire_addr, 609 ire->ire_gateway_addr, ire->ire_mask, 610 connp->conn_laddr_v4, 0, 0, 0, 611 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), 612 ire->ire_ipst); 613 } 614 (void) ire_no_good(ire); 615 } 616 } 617 618 /* 619 * tcp_timer is the timer service routine. It handles the retransmission, 620 * FIN_WAIT_2 flush, and zero window probe timeout events. It figures out 621 * from the state of the tcp instance what kind of action needs to be done 622 * at the time it is called. 
*/
void
tcp_timer(void *arg)
{
	mblk_t		*mp;
	clock_t		first_threshold;
	clock_t		second_threshold;
	clock_t		ms;
	uint32_t	mss;
	conn_t		*connp = (conn_t *)arg;
	tcp_t		*tcp = connp->conn_tcp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	boolean_t	dont_timeout = B_FALSE;

	/* This timer has fired, so its id is no longer valid. */
	tcp->tcp_timer_tid = 0;

	if (tcp->tcp_fused)
		return;

	/*
	 * Start from the data-transfer thresholds; the connection-setup
	 * states below replace them with the ctimer thresholds.
	 */
	first_threshold = tcp->tcp_first_timer_threshold;
	second_threshold = tcp->tcp_second_timer_threshold;
	switch (tcp->tcp_state) {
	case TCPS_IDLE:
	case TCPS_BOUND:
	case TCPS_LISTEN:
		return;
	case TCPS_SYN_RCVD: {
		tcp_t	*listener = tcp->tcp_listener;

		if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
			/* it's our first timeout */
			tcp->tcp_syn_rcvd_timeout = 1;
			mutex_enter(&listener->tcp_eager_lock);
			listener->tcp_syn_rcvd_timeout++;
			if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) {
				/*
				 * Make this eager available for drop if we
				 * need to drop one to accommodate a new
				 * incoming SYN request.
				 */
				MAKE_DROPPABLE(listener, tcp);
			}
			if (!listener->tcp_syn_defense &&
			    (listener->tcp_syn_rcvd_timeout >
			    (tcps->tcps_conn_req_max_q0 >> 2)) &&
			    (tcps->tcps_conn_req_max_q0 > 200)) {
				/* We may be under attack. Put on a defense. */
				listener->tcp_syn_defense = B_TRUE;
				cmn_err(CE_WARN, "High TCP connect timeout "
				    "rate! System (port %d) may be under a "
				    "SYN flood attack!",
				    ntohs(listener->tcp_connp->conn_lport));

				/*
				 * KM_NOSLEEP: the result may be NULL and is
				 * not checked here.
				 */
				listener->tcp_ip_addr_cache = kmem_zalloc(
				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
				    KM_NOSLEEP);
			}
			mutex_exit(&listener->tcp_eager_lock);
		} else if (listener != NULL) {
			mutex_enter(&listener->tcp_eager_lock);
			tcp->tcp_syn_rcvd_timeout++;
			if (tcp->tcp_syn_rcvd_timeout > 1 &&
			    !tcp->tcp_closemp_used) {
				/*
				 * This is our second timeout. Put the tcp in
				 * the list of droppable eagers to allow it to
				 * be dropped, if needed. We don't check
				 * whether tcp_dontdrop is set or not to
				 * protect ourselves from a SYN attack where a
				 * remote host can spoof itself as one of the
				 * good IP source and continue to hold
				 * resources too long.
				 */
				MAKE_DROPPABLE(listener, tcp);
			}
			mutex_exit(&listener->tcp_eager_lock);
		}
	}
		/* FALLTHRU */
	case TCPS_SYN_SENT:
		first_threshold = tcp->tcp_first_ctimer_threshold;
		second_threshold = tcp->tcp_second_ctimer_threshold;

		/*
		 * If an app has set the second_threshold to 0, it means that
		 * we need to retransmit forever, unless this is a passive
		 * open.  We need to set second_threshold back to a normal
		 * value such that later comparison with it still makes
		 * sense.  But we set dont_timeout to B_TRUE so that we will
		 * never time out.
		 */
		if (second_threshold == 0) {
			second_threshold = tcps->tcps_ip_abort_linterval;
			if (tcp->tcp_active_open)
				dont_timeout = B_TRUE;
		}
		break;
	case TCPS_ESTABLISHED:
	case TCPS_CLOSE_WAIT:
		/*
		 * If the end point has not been closed, TCP can retransmit
		 * forever.  But if the end point is closed, the normal
		 * timeout applies.
		 */
		if (second_threshold == 0) {
			second_threshold = tcps->tcps_ip_abort_linterval;
			dont_timeout = B_TRUE;
		}
		/* FALLTHRU */
	case TCPS_FIN_WAIT_1:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
		/* If we have data to rexmit */
		if (tcp->tcp_suna != tcp->tcp_snxt) {
			clock_t	time_to_wait;

			TCPS_BUMP_MIB(tcps, tcpTimRetrans);
			if (!tcp->tcp_xmit_head)
				break;
			/*
			 * b_prev of the head of the transmit list holds an
			 * lbolt timestamp (see where it is stored further
			 * down); work out how much of the RTO is still left.
			 */
			time_to_wait = ddi_get_lbolt() -
			    (clock_t)tcp->tcp_xmit_head->b_prev;
			time_to_wait = tcp->tcp_rto -
			    TICK_TO_MSEC(time_to_wait);
			/*
			 * If the timer fired more than one clock tick too
			 * early, restart the timer for the remainder.
			 */
			if (time_to_wait > msec_per_tick) {
				TCP_STAT(tcps, tcp_timer_fire_early);
				TCP_TIMER_RESTART(tcp, time_to_wait);
				return;
			}
			/*
			 * When we probe zero windows, we force the swnd open.
			 * If our peer acks with a closed window swnd will be
			 * set to zero by tcp_rput(). As long as we are
			 * receiving acks tcp_rput will
			 * reset 'tcp_ms_we_have_waited' so as not to trip the
			 * first and second interval actions.  NOTE: the timer
			 * interval is allowed to continue its exponential
			 * backoff.
			 */
			if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_TRACE, "tcp_timer: zero win");
				}
			} else {
				/*
				 * After retransmission, we need to do
				 * slow start.  Set the ssthresh to one
				 * half of current effective window and
				 * cwnd to one MSS.  Also reset
				 * tcp_cwnd_cnt.
				 *
				 * Note that if tcp_ssthresh is reduced because
				 * of ECN, do not reduce it again unless it is
				 * already one window of data away (tcp_cwr
				 * should then be cleared) or this is a
				 * timeout for a retransmitted segment.
				 */
				uint32_t npkt;

				if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
					npkt = ((tcp->tcp_timer_backoff ?
					    tcp->tcp_cwnd_ssthresh :
					    tcp->tcp_snxt -
					    tcp->tcp_suna) >> 1) / tcp->tcp_mss;
					tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
					    tcp->tcp_mss;
				}
				tcp->tcp_cwnd = tcp->tcp_mss;
				tcp->tcp_cwnd_cnt = 0;
				if (tcp->tcp_ecn_ok) {
					tcp->tcp_cwr = B_TRUE;
					tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
					tcp->tcp_ecn_cwr_sent = B_FALSE;
				}
			}
			break;
		}
		/*
		 * We have something to send yet we cannot send.  The
		 * reason can be:
		 *
		 * 1. Zero send window: we need to do zero window probe.
		 * 2. Zero cwnd: because of ECN, we need to "clock out"
		 * segments.
		 * 3. SWS avoidance: receiver may have shrunk window,
		 * reset our knowledge.
		 *
		 * Note that condition 2 can happen with either 1 or
		 * 3.  But 1 and 3 are exclusive.
		 */
		if (tcp->tcp_unsent != 0) {
			/*
			 * Should not hold the zero-copy messages for too long.
			 */
			if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
				tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
				    tcp->tcp_xmit_head, B_TRUE);

			if (tcp->tcp_cwnd == 0) {
				/*
				 * Set tcp_cwnd to 1 MSS so that a
				 * new segment can be sent out.  We
				 * are "clocking out" new data when
				 * the network is really congested.
				 */
				ASSERT(tcp->tcp_ecn_ok);
				tcp->tcp_cwnd = tcp->tcp_mss;
			}
			if (tcp->tcp_swnd == 0) {
				/* Extend window for zero window probe */
				tcp->tcp_swnd++;
				tcp->tcp_zero_win_probe = B_TRUE;
				TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
			} else {
				/*
				 * Handle timeout from sender SWS avoidance.
				 * Reset our knowledge of the max send window
				 * since the receiver might have reduced its
				 * receive buffer.  Avoid setting tcp_max_swnd
				 * to one since that will essentially disable
				 * the SWS checks.
				 *
				 * Note that since we don't have a SWS
				 * state variable, if the timeout is set
				 * for ECN but not for SWS, this
				 * code will also be executed.  This is
				 * fine as tcp_max_swnd is updated
				 * constantly and it will not affect
				 * anything.
				 */
				tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
			}
			tcp_wput_data(tcp, NULL, B_FALSE);
			return;
		}
		/* Is there a FIN that needs to be retransmitted? */
		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
		    !tcp->tcp_fin_acked)
			break;
		/* Nothing to do, return without restarting timer. */
		TCP_STAT(tcps, tcp_timer_fire_miss);
		return;
	case TCPS_FIN_WAIT_2:
		/*
		 * User closed the TCP endpoint and peer ACK'ed our FIN.
		 * We waited some time for peer's FIN, but it hasn't
		 * arrived.  We flush the connection now to avoid
		 * case where the peer has rebooted.
		 */
		if (TCP_IS_DETACHED(tcp)) {
			(void) tcp_clean_death(tcp, 0);
		} else {
			TCP_TIMER_RESTART(tcp,
			    tcp->tcp_fin_wait_2_flush_interval);
		}
		return;
	case TCPS_TIME_WAIT:
		(void) tcp_clean_death(tcp, 0);
		return;
	default:
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
			    "tcp_timer: strange state (%d) %s",
			    tcp->tcp_state, tcp_display(tcp, NULL,
			    DISP_PORT_ONLY));
		}
		return;
	}

	/*
	 * Only the cases that 'break' out of the switch get here: there is
	 * something to retransmit and the thresholds have been chosen.
	 */

	/*
	 * If the system is under memory pressure or the max number of
	 * connections have been established for the listener, be more
	 * aggressive in aborting connections.
	 */
	if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL &&
	    tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) {
		second_threshold = tcp_early_abort * SECONDS;

		/* We will ignore the never timeout promise in this case... */
		dont_timeout = B_FALSE;
	}

	ASSERT(second_threshold != 0);

	if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
		/*
		 * Should not hold the zero-copy messages for too long.
		 */
		if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
			tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
			    tcp->tcp_xmit_head, B_TRUE);

		if (dont_timeout) {
			/*
			 * Reset tcp_ms_we_have_waited to avoid overflow since
			 * we are going to retransmit forever.
			 */
			tcp->tcp_ms_we_have_waited = second_threshold;
			goto timer_rexmit;
		}

		/*
		 * For zero window probe, we need to send indefinitely,
		 * unless we have not heard from the other side for some
		 * time...
		 */
		if ((tcp->tcp_zero_win_probe == 0) ||
		    (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) >
		    second_threshold)) {
			TCPS_BUMP_MIB(tcps, tcpTimRetransDrop);
			/*
			 * If TCP is in SYN_RCVD state, send back a
			 * RST|ACK as BSD does.  Note that tcp_zero_win_probe
			 * should be zero in TCPS_SYN_RCVD state.
			 */
			if (tcp->tcp_state == TCPS_SYN_RCVD) {
				tcp_xmit_ctl("tcp_timer: RST sent on timeout "
				    "in SYN_RCVD",
				    tcp, tcp->tcp_snxt,
				    tcp->tcp_rnxt, TH_RST | TH_ACK);
			}
			(void) tcp_clean_death(tcp,
			    tcp->tcp_client_errno ?
			    tcp->tcp_client_errno : ETIMEDOUT);
			return;
		} else {
			/*
			 * If the system is under memory pressure, we also
			 * abort connection in zero window probing.
			 */
			if (tcps->tcps_reclaim) {
				(void) tcp_clean_death(tcp,
				    tcp->tcp_client_errno ?
				    tcp->tcp_client_errno : ETIMEDOUT);
				TCP_STAT(tcps, tcp_zwin_mem_drop);
				return;
			}
			/*
			 * Set tcp_ms_we_have_waited to second_threshold
			 * so that in next timeout, we will do the above
			 * check (ddi_get_lbolt() - tcp_last_recv_time).
			 * This is also to avoid overflow.
			 *
			 * We don't need to decrement tcp_timer_backoff
			 * to avoid overflow because it will be decremented
			 * later if new timeout value is greater than
			 * tcp_rto_max.  In the case when tcp_rto_max is
			 * greater than second_threshold, it means that we
			 * will wait longer than second_threshold to send
			 * the next
			 * window probe.
			 */
			tcp->tcp_ms_we_have_waited = second_threshold;
		}
	} else if (ms > first_threshold) {
		/*
		 * Should not hold the zero-copy messages for too long.
		 */
		if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
			tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
			    tcp->tcp_xmit_head, B_TRUE);

		/*
		 * We have been retransmitting for too long...  The RTT
		 * we calculated is probably incorrect.  Reinitialize it.
		 * Need to compensate for 0 tcp_rtt_sa.  Reset
		 * tcp_rtt_update so that we won't accidentally cache a
		 * bad value.  But only do this if this is not a zero
		 * window probe.
		 */
		if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
			tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
			    (tcp->tcp_rtt_sa >> 5);
			tcp->tcp_rtt_sa = 0;
			tcp_ip_notify(tcp);
			tcp->tcp_rtt_update = 0;
		}
	}

timer_rexmit:
	/*
	 * Back the timer off exponentially: recompute the base RTO from the
	 * RTT estimate (or use tcp_rto_min if that is larger), shift it by
	 * the new backoff count and clamp the result to tcp_rto_max.
	 */
	tcp->tcp_timer_backoff++;
	if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
	    tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
	    tcp->tcp_rto_min) {
		/*
		 * This means the original RTO is tcp_rexmit_interval_min.
		 * So we will use tcp_rexmit_interval_min as the RTO value
		 * and do the backoff.
		 */
		ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
	} else {
		ms <<= tcp->tcp_timer_backoff;
	}
	if (ms > tcp->tcp_rto_max) {
		ms = tcp->tcp_rto_max;
		/*
		 * ms is at max, decrement tcp_timer_backoff to avoid
		 * overflow.
		 */
		tcp->tcp_timer_backoff--;
	}
	tcp->tcp_ms_we_have_waited += ms;
	/* tcp_rto itself is left untouched while probing a zero window. */
	if (tcp->tcp_zero_win_probe == 0) {
		tcp->tcp_rto = ms;
	}
	TCP_TIMER_RESTART(tcp, ms);
	/*
	 * This is after a timeout and tcp_rto is backed off.  Set
	 * tcp_set_timer to 1 so that next time RTO is updated, we will
	 * restart the timer with a correct value.
	 */
	tcp->tcp_set_timer = 1;
	/*
	 * Retransmit at most one segment: the outstanding data, limited to
	 * one MSS and to the send window when that is non-zero.
	 */
	mss = tcp->tcp_snxt - tcp->tcp_suna;
	if (mss > tcp->tcp_mss)
		mss = tcp->tcp_mss;
	if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
		mss = tcp->tcp_swnd;

	/* Stamp the head of the transmit list with the current lbolt. */
	if ((mp = tcp->tcp_xmit_head) != NULL)
		mp->b_prev = (mblk_t *)ddi_get_lbolt();
	mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
	    B_TRUE);

	/*
	 * When slow start after retransmission begins, start with
	 * this seq no.  tcp_rexmit_max marks the end of special slow
	 * start phase.  tcp_snd_burst controls how many segments
	 * can be sent because of an ack.
	 */
	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
	tcp->tcp_snd_burst = TCP_CWND_SS;
	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
	    (tcp->tcp_unsent == 0)) {
		tcp->tcp_rexmit_max = tcp->tcp_fss;
	} else {
		tcp->tcp_rexmit_max = tcp->tcp_snxt;
	}
	tcp->tcp_rexmit = B_TRUE;
	tcp->tcp_dupack_cnt = 0;

	/*
	 * Remove all rexmit SACK blk to start from fresh.
	 */
	if (tcp->tcp_snd_sack_ok)
		TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
	/* No segment could be built; the restarted timer will retry. */
	if (mp == NULL) {
		return;
	}

	tcp->tcp_csuna = tcp->tcp_snxt;
	TCPS_BUMP_MIB(tcps, tcpRetransSegs);
	TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
	tcp_send_data(tcp, mp);

}

/*
 * Handle lingering timeouts. This function is called when the SO_LINGER timeout
 * expires.
 */
void
tcp_close_linger_timeout(void *arg)
{
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;

	/* Record the timeout as the client error, then stop lingering. */
	tcp->tcp_client_errno = ETIMEDOUT;
	tcp_stop_lingering(tcp);
}