1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/strlog.h> 28 #include <sys/strsun.h> 29 #include <sys/squeue_impl.h> 30 #include <sys/squeue.h> 31 #include <sys/callo.h> 32 #include <sys/strsubr.h> 33 34 #include <inet/common.h> 35 #include <inet/ip.h> 36 #include <inet/ip_ire.h> 37 #include <inet/ip_rts.h> 38 #include <inet/tcp.h> 39 #include <inet/tcp_impl.h> 40 41 /* 42 * Implementation of TCP Timers. 43 * ============================= 44 * 45 * INTERFACE: 46 * 47 * There are two basic functions dealing with tcp timers: 48 * 49 * timeout_id_t tcp_timeout(connp, func, time) 50 * clock_t tcp_timeout_cancel(connp, timeout_id) 51 * TCP_TIMER_RESTART(tcp, intvl) 52 * 53 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func' 54 * after 'time' ticks passed. The function called by timeout() must adhere to 55 * the same restrictions as a driver soft interrupt handler - it must not sleep 56 * or call other functions that might sleep. 
 * The value returned is the opaque
 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
 * cancel the request. The call to tcp_timeout() may fail in which case it
 * returns zero. This is different from the timeout(9F) function which never
 * fails.
 *
 * The call-back function 'func' always receives 'connp' as its single
 * argument. It is always executed in the squeue corresponding to the tcp
 * structure. The tcp structure is guaranteed to be present at the time the
 * call-back is called.
 *
 * NOTE: The call-back function 'func' is never called if tcp is in
 *	the TCPS_CLOSED state.
 *
 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
 * request. Locks acquired by the call-back routine should not be held across
 * the call to tcp_timeout_cancel() or a deadlock may result.
 *
 * tcp_timeout_cancel() returns -1 if it cannot cancel the timeout request.
 * Otherwise, it returns an integer value greater than or equal to 0. In
 * particular, if the call-back function is already placed on the squeue, it
 * cannot be canceled.
 *
 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
 *	within squeue context corresponding to the tcp instance. Since the
 *	call-back is also called via the same squeue, the race conditions
 *	described in the untimeout(9F) manual page cannot occur: all calls
 *	are strictly serialized.
 *
 *	TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
 *	stored in tcp_timer_tid and starts a new one using
 *	MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
 *	and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
 *	field.
 *
 * NOTE: since the timeout cancellation is not guaranteed, the cancelled
 *	call-back may still be called, so it is possible tcp_timer() will be
 *	called several times.
 *	This should not be a problem since tcp_timer()
 *	should always check the tcp instance state.
 *
 *
 * IMPLEMENTATION:
 *
 * TCP timers are implemented using a three-stage process. The call to
 * tcp_timeout() uses the timeout(9F) mechanism (via timeout_generic()) to call
 * the tcp_timer_callback() function when the timer expires. The
 * tcp_timer_callback() arranges the call of the tcp_timer_handler() function
 * via the squeue corresponding to the tcp instance. The tcp_timer_handler()
 * calls the actual requested timeout call-back and passes the connp as an
 * argument to it. Information is passed between stages using the tcp_timer_t
 * structure which contains the connp pointer, the tcp call-back to call and
 * the timeout id returned by the timeout(9F) mechanism.
 *
 * The tcp_timer_t structure is not used directly; it is embedded in an
 * mblk_t-like structure that is used to enter an squeue. The mp->b_rptr of
 * this pseudo mblk points to the beginning of the tcp_timer_t structure. The
 * tcp_timeout() returns the pointer to this mblk.
 *
 * The pseudo mblk is allocated from a special tcp_timercache kmem cache. It
 * looks like a normal mblk without an actual dblk attached to it.
 *
 * To optimize performance each tcp instance holds a small cache of timer
 * mblocks. In the current implementation it caches up to two timer mblocks per
 * tcp instance. The cache is preserved over tcp frees and is only freed when
 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
 * timer processing happens on a corresponding squeue, the cache manipulation
 * does not require any locks. Experiments show that the majority of timer
 * mblock allocations are satisfied from the tcp cache and do not involve kmem
 * calls.
 *
 * The tcp_timeout() places a refhold on the connp instance which guarantees
 * that it will be present at the time the call-back function fires.
The 126 * tcp_timer_handler() drops the reference after calling the call-back, so the 127 * call-back function does not need to manipulate the references explicitly. 128 */ 129 130 kmem_cache_t *tcp_timercache; 131 132 static void tcp_ip_notify(tcp_t *); 133 static void tcp_timer_callback(void *); 134 static void tcp_timer_free(tcp_t *, mblk_t *); 135 static void tcp_timer_handler(void *, mblk_t *, void *, ip_recv_attr_t *); 136 137 /* 138 * tim is in millisec. 139 */ 140 timeout_id_t 141 tcp_timeout(conn_t *connp, void (*f)(void *), hrtime_t tim) 142 { 143 mblk_t *mp; 144 tcp_timer_t *tcpt; 145 tcp_t *tcp = connp->conn_tcp; 146 147 ASSERT(connp->conn_sqp != NULL); 148 149 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls); 150 151 if (tcp->tcp_timercache == NULL) { 152 mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC); 153 } else { 154 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc); 155 mp = tcp->tcp_timercache; 156 tcp->tcp_timercache = mp->b_next; 157 mp->b_next = NULL; 158 ASSERT(mp->b_wptr == NULL); 159 } 160 161 CONN_INC_REF(connp); 162 tcpt = (tcp_timer_t *)mp->b_rptr; 163 tcpt->connp = connp; 164 tcpt->tcpt_proc = f; 165 /* 166 * TCP timers are normal timeouts. Plus, they do not require more than 167 * a 10 millisecond resolution. By choosing a coarser resolution and by 168 * rounding up the expiration to the next resolution boundary, we can 169 * batch timers in the callout subsystem to make TCP timers more 170 * efficient. The roundup also protects short timers from expiring too 171 * early before they have a chance to be cancelled. 
172 */ 173 tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp, 174 tim * MICROSEC, CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); 175 176 return ((timeout_id_t)mp); 177 } 178 179 static void 180 tcp_timer_callback(void *arg) 181 { 182 mblk_t *mp = (mblk_t *)arg; 183 tcp_timer_t *tcpt; 184 conn_t *connp; 185 186 tcpt = (tcp_timer_t *)mp->b_rptr; 187 connp = tcpt->connp; 188 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp, 189 NULL, SQ_FILL, SQTAG_TCP_TIMER); 190 } 191 192 /* ARGSUSED */ 193 static void 194 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 195 { 196 tcp_timer_t *tcpt; 197 conn_t *connp = (conn_t *)arg; 198 tcp_t *tcp = connp->conn_tcp; 199 200 tcpt = (tcp_timer_t *)mp->b_rptr; 201 ASSERT(connp == tcpt->connp); 202 ASSERT((squeue_t *)arg2 == connp->conn_sqp); 203 204 /* 205 * If the TCP has reached the closed state, don't proceed any 206 * further. This TCP logically does not exist on the system. 207 * tcpt_proc could for example access queues, that have already 208 * been qprocoff'ed off. 209 */ 210 if (tcp->tcp_state != TCPS_CLOSED) { 211 (*tcpt->tcpt_proc)(connp); 212 } else { 213 tcp->tcp_timer_tid = 0; 214 } 215 tcp_timer_free(connp->conn_tcp, mp); 216 } 217 218 /* 219 * There is potential race with untimeout and the handler firing at the same 220 * time. The mblock may be freed by the handler while we are trying to use 221 * it. But since both should execute on the same squeue, this race should not 222 * occur. 
223 */ 224 clock_t 225 tcp_timeout_cancel(conn_t *connp, timeout_id_t id) 226 { 227 mblk_t *mp = (mblk_t *)id; 228 tcp_timer_t *tcpt; 229 clock_t delta; 230 231 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs); 232 233 if (mp == NULL) 234 return (-1); 235 236 tcpt = (tcp_timer_t *)mp->b_rptr; 237 ASSERT(tcpt->connp == connp); 238 239 delta = untimeout_default(tcpt->tcpt_tid, 0); 240 241 if (delta >= 0) { 242 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled); 243 tcp_timer_free(connp->conn_tcp, mp); 244 CONN_DEC_REF(connp); 245 } 246 247 return (TICK_TO_MSEC(delta)); 248 } 249 250 /* 251 * Allocate space for the timer event. The allocation looks like mblk, but it is 252 * not a proper mblk. To avoid confusion we set b_wptr to NULL. 253 * 254 * Dealing with failures: If we can't allocate from the timer cache we try 255 * allocating from dblock caches using allocb_tryhard(). In this case b_wptr 256 * points to b_rptr. 257 * If we can't allocate anything using allocb_tryhard(), we perform a last 258 * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and 259 * save the actual allocation size in b_datap. 260 */ 261 mblk_t * 262 tcp_timermp_alloc(int kmflags) 263 { 264 mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache, 265 kmflags & ~KM_PANIC); 266 267 if (mp != NULL) { 268 mp->b_next = mp->b_prev = NULL; 269 mp->b_rptr = (uchar_t *)(&mp[1]); 270 mp->b_wptr = NULL; 271 mp->b_datap = NULL; 272 mp->b_queue = NULL; 273 mp->b_cont = NULL; 274 } else if (kmflags & KM_PANIC) { 275 /* 276 * Failed to allocate memory for the timer. Try allocating from 277 * dblock caches. 278 */ 279 /* ipclassifier calls this from a constructor - hence no tcps */ 280 TCP_G_STAT(tcp_timermp_allocfail); 281 mp = allocb_tryhard(sizeof (tcp_timer_t)); 282 if (mp == NULL) { 283 size_t size = 0; 284 /* 285 * Memory is really low. Try tryhard allocation. 
286 * 287 * ipclassifier calls this from a constructor - 288 * hence no tcps 289 */ 290 TCP_G_STAT(tcp_timermp_allocdblfail); 291 mp = kmem_alloc_tryhard(sizeof (mblk_t) + 292 sizeof (tcp_timer_t), &size, kmflags); 293 mp->b_rptr = (uchar_t *)(&mp[1]); 294 mp->b_next = mp->b_prev = NULL; 295 mp->b_wptr = (uchar_t *)-1; 296 mp->b_datap = (dblk_t *)size; 297 mp->b_queue = NULL; 298 mp->b_cont = NULL; 299 } 300 ASSERT(mp->b_wptr != NULL); 301 } 302 /* ipclassifier calls this from a constructor - hence no tcps */ 303 TCP_G_DBGSTAT(tcp_timermp_alloced); 304 305 return (mp); 306 } 307 308 /* 309 * Free per-tcp timer cache. 310 * It can only contain entries from tcp_timercache. 311 */ 312 void 313 tcp_timermp_free(tcp_t *tcp) 314 { 315 mblk_t *mp; 316 317 while ((mp = tcp->tcp_timercache) != NULL) { 318 ASSERT(mp->b_wptr == NULL); 319 tcp->tcp_timercache = tcp->tcp_timercache->b_next; 320 kmem_cache_free(tcp_timercache, mp); 321 } 322 } 323 324 /* 325 * Free timer event. Put it on the per-tcp timer cache if there is not too many 326 * events there already (currently at most two events are cached). 327 * If the event is not allocated from the timer cache, free it right away. 328 */ 329 static void 330 tcp_timer_free(tcp_t *tcp, mblk_t *mp) 331 { 332 mblk_t *mp1 = tcp->tcp_timercache; 333 334 if (mp->b_wptr != NULL) { 335 /* 336 * This allocation is not from a timer cache, free it right 337 * away. 338 */ 339 if (mp->b_wptr != (uchar_t *)-1) 340 freeb(mp); 341 else 342 kmem_free(mp, (size_t)mp->b_datap); 343 } else if (mp1 == NULL || mp1->b_next == NULL) { 344 /* Cache this timer block for future allocations */ 345 mp->b_rptr = (uchar_t *)(&mp[1]); 346 mp->b_next = mp1; 347 tcp->tcp_timercache = mp; 348 } else { 349 kmem_cache_free(tcp_timercache, mp); 350 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed); 351 } 352 } 353 354 /* 355 * Stop all TCP timers. 
356 */ 357 void 358 tcp_timers_stop(tcp_t *tcp) 359 { 360 if (tcp->tcp_timer_tid != 0) { 361 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 362 tcp->tcp_timer_tid = 0; 363 } 364 if (tcp->tcp_ka_tid != 0) { 365 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); 366 tcp->tcp_ka_tid = 0; 367 } 368 if (tcp->tcp_ack_tid != 0) { 369 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 370 tcp->tcp_ack_tid = 0; 371 } 372 if (tcp->tcp_push_tid != 0) { 373 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 374 tcp->tcp_push_tid = 0; 375 } 376 if (tcp->tcp_reass_tid != 0) { 377 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid); 378 tcp->tcp_reass_tid = 0; 379 } 380 } 381 382 /* 383 * Timer callback routine for keepalive probe. We do a fake resend of 384 * last ACKed byte. Then set a timer using RTO. When the timer expires, 385 * check to see if we have heard anything from the other end for the last 386 * RTO period. If we have, set the timer to expire for another 387 * tcp_keepalive_intrvl and check again. If we have not, set a timer using 388 * RTO << 1 and check again when it expires. Keep exponentially increasing 389 * the timeout if we have not heard from the other side. If for more than 390 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything, 391 * kill the connection unless the keepalive abort threshold is 0. In 392 * that case, we will probe "forever." 393 */ 394 void 395 tcp_keepalive_timer(void *arg) 396 { 397 mblk_t *mp; 398 conn_t *connp = (conn_t *)arg; 399 tcp_t *tcp = connp->conn_tcp; 400 int32_t firetime; 401 int32_t idletime; 402 int32_t ka_intrvl; 403 tcp_stack_t *tcps = tcp->tcp_tcps; 404 405 tcp->tcp_ka_tid = 0; 406 407 if (tcp->tcp_fused) 408 return; 409 410 TCPS_BUMP_MIB(tcps, tcpTimKeepalive); 411 ka_intrvl = tcp->tcp_ka_interval; 412 413 /* 414 * Keepalive probe should only be sent if the application has not 415 * done a close on the connection. 
416 */ 417 if (tcp->tcp_state > TCPS_CLOSE_WAIT) { 418 return; 419 } 420 /* Timer fired too early, restart it. */ 421 if (tcp->tcp_state < TCPS_ESTABLISHED) { 422 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 423 ka_intrvl); 424 return; 425 } 426 427 idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time); 428 /* 429 * If we have not heard from the other side for a long 430 * time, kill the connection unless the keepalive abort 431 * threshold is 0. In that case, we will probe "forever." 432 */ 433 if (tcp->tcp_ka_abort_thres != 0 && 434 idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) { 435 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveDrop); 436 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ? 437 tcp->tcp_client_errno : ETIMEDOUT); 438 return; 439 } 440 441 if (tcp->tcp_snxt == tcp->tcp_suna && 442 idletime >= ka_intrvl) { 443 /* Fake resend of last ACKed byte. */ 444 mblk_t *mp1 = allocb(1, BPRI_LO); 445 446 if (mp1 != NULL) { 447 *mp1->b_wptr++ = '\0'; 448 mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL, 449 tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE); 450 freeb(mp1); 451 /* 452 * if allocation failed, fall through to start the 453 * timer back. 454 */ 455 if (mp != NULL) { 456 tcp_send_data(tcp, mp); 457 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe); 458 if (tcp->tcp_ka_last_intrvl != 0) { 459 int max; 460 /* 461 * We should probe again at least 462 * in ka_intrvl, but not more than 463 * tcp_rto_max. 
464 */ 465 max = tcp->tcp_rto_max; 466 firetime = MIN(ka_intrvl - 1, 467 tcp->tcp_ka_last_intrvl << 1); 468 if (firetime > max) 469 firetime = max; 470 } else { 471 firetime = tcp->tcp_rto; 472 } 473 tcp->tcp_ka_tid = TCP_TIMER(tcp, 474 tcp_keepalive_timer, firetime); 475 tcp->tcp_ka_last_intrvl = firetime; 476 return; 477 } 478 } 479 } else { 480 tcp->tcp_ka_last_intrvl = 0; 481 } 482 483 /* firetime can be negative if (mp1 == NULL || mp == NULL) */ 484 if ((firetime = ka_intrvl - idletime) < 0) { 485 firetime = ka_intrvl; 486 } 487 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, firetime); 488 } 489 490 void 491 tcp_reass_timer(void *arg) 492 { 493 conn_t *connp = (conn_t *)arg; 494 tcp_t *tcp = connp->conn_tcp; 495 496 tcp->tcp_reass_tid = 0; 497 if (tcp->tcp_reass_head == NULL) 498 return; 499 ASSERT(tcp->tcp_reass_tail != NULL); 500 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 501 tcp_sack_remove(tcp->tcp_sack_list, 502 TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk); 503 } 504 tcp_close_mpp(&tcp->tcp_reass_head); 505 tcp->tcp_reass_tail = NULL; 506 TCP_STAT(tcp->tcp_tcps, tcp_reass_timeout); 507 } 508 509 /* This function handles the push timeout. */ 510 void 511 tcp_push_timer(void *arg) 512 { 513 conn_t *connp = (conn_t *)arg; 514 tcp_t *tcp = connp->conn_tcp; 515 516 TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt); 517 518 ASSERT(tcp->tcp_listener == NULL); 519 520 ASSERT(!IPCL_IS_NONSTR(connp)); 521 522 tcp->tcp_push_tid = 0; 523 524 if (tcp->tcp_rcv_list != NULL && 525 tcp_rcv_drain(tcp) == TH_ACK_NEEDED) 526 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 527 } 528 529 /* 530 * This function handles delayed ACK timeout. 
*/
void
tcp_ack_timer(void *arg)
{
	conn_t	*connp = (conn_t *)arg;
	tcp_t *tcp = connp->conn_tcp;
	mblk_t *mp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	TCP_DBGSTAT(tcps, tcp_ack_timer_cnt);

	/* Clear the stored delayed-ACK timer id. */
	tcp->tcp_ack_tid = 0;

	if (tcp->tcp_fused)
		return;

	/*
	 * Do not send ACK if there is no outstanding unack'ed data.
	 */
	if (tcp->tcp_rnxt == tcp->tcp_rack) {
		return;
	}

	if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) {
		/*
		 * Make sure we don't allow deferred ACKs to result in
		 * timer-based ACKing.  If we have held off an ACK
		 * when there was more than an mss here, and the timer
		 * goes off, we have to worry about the possibility
		 * that the sender isn't doing slow-start, or is out
		 * of step with us for some other reason.  We fall
		 * permanently back in the direction of
		 * ACK-every-other-packet as suggested in RFC 1122.
		 */
		if (tcp->tcp_rack_abs_max > 2)
			tcp->tcp_rack_abs_max--;
		tcp->tcp_rack_cur_max = 2;
	}
	mp = tcp_ack_mp(tcp);

	/* If no ACK mblk could be built, nothing is sent this time. */
	if (mp != NULL) {
		BUMP_LOCAL(tcp->tcp_obsegs);
		TCPS_BUMP_MIB(tcps, tcpOutAck);
		TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
		tcp_send_data(tcp, mp);
	}
}

/*
 * Notify IP that we are having trouble with this connection.  IP should
 * make note so it can potentially use a different IRE.
 */
static void
tcp_ip_notify(tcp_t *tcp)
{
	conn_t		*connp = tcp->tcp_connp;
	ire_t		*ire;

	/*
	 * Note: in the case of source routing we want to blow away the
	 * route to the first source route hop.
	 */
	ire = connp->conn_ixa->ixa_ire;
	/* Nothing is reported for a missing, reject or blackhole route. */
	if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
		if (ire->ire_ipversion == IPV4_VERSION) {
			/*
			 * As per RFC 1122, we send an RTM_LOSING to inform
			 * routing protocols.
			 */
			ip_rts_change(RTM_LOSING, ire->ire_addr,
			    ire->ire_gateway_addr, ire->ire_mask,
			    connp->conn_laddr_v4,  0, 0, 0,
			    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
			    ire->ire_ipst);
		}
		(void) ire_no_good(ire);
	}
}

/*
 * tcp_timer is the timer service routine.  It handles the retransmission,
 * FIN_WAIT_2 flush, and zero window probe timeout events.  It figures out
 * from the state of the tcp instance what kind of action needs to be done
 * at the time it is called.
 *
 * Outline: the switch on tcp_state either returns (nothing to do, timer
 * restarted, or connection cleaned up) or breaks out with first_threshold,
 * second_threshold and dont_timeout set.  The code after the switch then
 * compares tcp_ms_we_have_waited against the thresholds, possibly aborts
 * the connection, and otherwise backs off the RTO, restarts the timer and
 * retransmits starting at tcp_suna.
 */
void
tcp_timer(void *arg)
{
	mblk_t		*mp;
	clock_t		first_threshold;
	clock_t		second_threshold;
	clock_t		ms;
	uint32_t	mss;
	conn_t		*connp = (conn_t *)arg;
	tcp_t		*tcp = connp->conn_tcp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	boolean_t	dont_timeout = B_FALSE;

	tcp->tcp_timer_tid = 0;

	if (tcp->tcp_fused)
		return;

	first_threshold =  tcp->tcp_first_timer_threshold;
	second_threshold = tcp->tcp_second_timer_threshold;
	switch (tcp->tcp_state) {
	case TCPS_IDLE:
	case TCPS_BOUND:
	case TCPS_LISTEN:
		return;
	case TCPS_SYN_RCVD: {
		tcp_t	*listener = tcp->tcp_listener;

		if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
			/* it's our first timeout */
			tcp->tcp_syn_rcvd_timeout = 1;
			mutex_enter(&listener->tcp_eager_lock);
			listener->tcp_syn_rcvd_timeout++;
			if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) {
				/*
				 * Make this eager available for drop if we
				 * need to drop one to accommodate a new
				 * incoming SYN request.
				 */
				MAKE_DROPPABLE(listener, tcp);
			}
			if (!listener->tcp_syn_defense &&
			    (listener->tcp_syn_rcvd_timeout >
			    (tcps->tcps_conn_req_max_q0 >> 2)) &&
			    (tcps->tcps_conn_req_max_q0 > 200)) {
				/* We may be under attack. Put on a defense. */
				listener->tcp_syn_defense = B_TRUE;
				cmn_err(CE_WARN, "High TCP connect timeout "
				    "rate! System (port %d) may be under a "
				    "SYN flood attack!",
				    ntohs(listener->tcp_connp->conn_lport));

				/*
				 * KM_NOSLEEP: the cache pointer may be left
				 * NULL if memory is not available.
				 */
				listener->tcp_ip_addr_cache = kmem_zalloc(
				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
				    KM_NOSLEEP);
			}
			mutex_exit(&listener->tcp_eager_lock);
		} else if (listener != NULL) {
			mutex_enter(&listener->tcp_eager_lock);
			tcp->tcp_syn_rcvd_timeout++;
			if (tcp->tcp_syn_rcvd_timeout > 1 &&
			    !tcp->tcp_closemp_used) {
				/*
				 * This is our second timeout. Put the tcp in
				 * the list of droppable eagers to allow it to
				 * be dropped, if needed. We don't check
				 * whether tcp_dontdrop is set or not to
				 * protect ourselves from a SYN attack where a
				 * remote host can spoof itself as one of the
				 * good IP source and continue to hold
				 * resources too long.
				 */
				MAKE_DROPPABLE(listener, tcp);
			}
			mutex_exit(&listener->tcp_eager_lock);
		}
	}
		/* FALLTHRU */
	case TCPS_SYN_SENT:
		first_threshold =  tcp->tcp_first_ctimer_threshold;
		second_threshold = tcp->tcp_second_ctimer_threshold;

		/*
		 * If an app has set the second_threshold to 0, it means that
		 * we need to retransmit forever, unless this is a passive
		 * open.  We need to set second_threshold back to a normal
		 * value such that later comparison with it still makes
		 * sense.  But we set dont_timeout to B_TRUE so that we will
		 * never time out.
		 */
		if (second_threshold == 0) {
			second_threshold = tcps->tcps_ip_abort_linterval;
			if (tcp->tcp_active_open)
				dont_timeout = B_TRUE;
		}
		break;
	case TCPS_ESTABLISHED:
	case TCPS_CLOSE_WAIT:
		/*
		 * If the end point has not been closed, TCP can retransmit
		 * forever.  But if the end point is closed, the normal
		 * timeout applies.
		 */
		if (second_threshold == 0) {
			second_threshold = tcps->tcps_ip_abort_linterval;
			dont_timeout = B_TRUE;
		}
		/* FALLTHRU */
	case TCPS_FIN_WAIT_1:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
		/* If we have data to rexmit */
		if (tcp->tcp_suna != tcp->tcp_snxt) {
			clock_t	time_to_wait;

			TCPS_BUMP_MIB(tcps, tcpTimRetrans);
			if (!tcp->tcp_xmit_head)
				break;
			/*
			 * b_prev of the head of the transmit list holds the
			 * lbolt value stored when it was last (re)sent (see
			 * the assignment near the end of this function).
			 */
			time_to_wait = ddi_get_lbolt() -
			    (clock_t)tcp->tcp_xmit_head->b_prev;
			time_to_wait = tcp->tcp_rto -
			    TICK_TO_MSEC(time_to_wait);
			/*
			 * If the timer fires too early, 1 clock tick earlier,
			 * restart the timer.
			 */
			if (time_to_wait > msec_per_tick) {
				TCP_STAT(tcps, tcp_timer_fire_early);
				TCP_TIMER_RESTART(tcp, time_to_wait);
				return;
			}
			/*
			 * When we probe zero windows, we force the swnd open.
			 * If our peer acks with a closed window swnd will be
			 * set to zero by tcp_rput(). As long as we are
			 * receiving acks tcp_rput will
			 * reset 'tcp_ms_we_have_waited' so as not to trip the
			 * first and second interval actions.  NOTE: the timer
			 * interval is allowed to continue its exponential
			 * backoff.
			 */
			if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_TRACE, "tcp_timer: zero win");
				}
			} else {
				/*
				 * After retransmission, we need to do
				 * slow start.  Set the ssthresh to one
				 * half of current effective window and
				 * cwnd to one MSS.  Also reset
				 * tcp_cwnd_cnt.
				 *
				 * Note that if tcp_ssthresh is reduced because
				 * of ECN, do not reduce it again unless it is
				 * already one window of data away (tcp_cwr
				 * should then be cleared) or this is a
				 * timeout for a retransmitted segment.
				 */
				uint32_t npkt;

				if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
					npkt = ((tcp->tcp_timer_backoff ?
					    tcp->tcp_cwnd_ssthresh :
					    tcp->tcp_snxt -
					    tcp->tcp_suna) >> 1) / tcp->tcp_mss;
					tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
					    tcp->tcp_mss;
				}
				tcp->tcp_cwnd = tcp->tcp_mss;
				tcp->tcp_cwnd_cnt = 0;
				if (tcp->tcp_ecn_ok) {
					tcp->tcp_cwr = B_TRUE;
					tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
					tcp->tcp_ecn_cwr_sent = B_FALSE;
				}
			}
			break;
		}
		/*
		 * We have something to send yet we cannot send.  The
		 * reason can be:
		 *
		 * 1. Zero send window: we need to do zero window probe.
		 * 2. Zero cwnd: because of ECN, we need to "clock out"
		 * segments.
		 * 3. SWS avoidance: receiver may have shrunk window,
		 * reset our knowledge.
		 *
		 * Note that condition 2 can happen with either 1 or
		 * 3.  But 1 and 3 are exclusive.
		 */
		if (tcp->tcp_unsent != 0) {
			/*
			 * Should not hold the zero-copy messages for too long.
			 */
			if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
				tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
				    tcp->tcp_xmit_head, B_TRUE);

			if (tcp->tcp_cwnd == 0) {
				/*
				 * Set tcp_cwnd to 1 MSS so that a
				 * new segment can be sent out.  We
				 * are "clocking out" new data when
				 * the network is really congested.
				 */
				ASSERT(tcp->tcp_ecn_ok);
				tcp->tcp_cwnd = tcp->tcp_mss;
			}
			if (tcp->tcp_swnd == 0) {
				/* Extend window for zero window probe */
				tcp->tcp_swnd++;
				tcp->tcp_zero_win_probe = B_TRUE;
				TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
			} else {
				/*
				 * Handle timeout from sender SWS avoidance.
				 * Reset our knowledge of the max send window
				 * since the receiver might have reduced its
				 * receive buffer.  Avoid setting tcp_max_swnd
				 * to one since that will essentially disable
				 * the SWS checks.
				 *
				 * Note that since we don't have a SWS
				 * state variable, if the timeout is set
				 * for ECN but not for SWS, this
				 * code will also be executed.  This is
				 * fine as tcp_max_swnd is updated
				 * constantly and it will not affect
				 * anything.
				 */
				tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
			}
			tcp_wput_data(tcp, NULL, B_FALSE);
			return;
		}
		/* Is there a FIN that needs to be retransmitted? */
		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
		    !tcp->tcp_fin_acked)
			break;
		/* Nothing to do, return without restarting timer. */
		TCP_STAT(tcps, tcp_timer_fire_miss);
		return;
	case TCPS_FIN_WAIT_2:
		/*
		 * User closed the TCP endpoint and peer ACK'ed our FIN.
		 * We waited some time for peer's FIN, but it hasn't
		 * arrived.  We flush the connection now to avoid
		 * case where the peer has rebooted.
		 */
		if (TCP_IS_DETACHED(tcp)) {
			(void) tcp_clean_death(tcp, 0);
		} else {
			TCP_TIMER_RESTART(tcp,
			    tcp->tcp_fin_wait_2_flush_interval);
		}
		return;
	case TCPS_TIME_WAIT:
		(void) tcp_clean_death(tcp, 0);
		return;
	default:
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
			    "tcp_timer: strange state (%d) %s",
			    tcp->tcp_state, tcp_display(tcp, NULL,
			    DISP_PORT_ONLY));
		}
		return;
	}

	/*
	 * If the system is under memory pressure or the max number of
	 * connections have been established for the listener, be more
	 * aggressive in aborting connections.
	 */
	if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL &&
	    tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) {
		second_threshold = tcp_early_abort * SECONDS;

		/* We will ignore the never timeout promise in this case... */
		dont_timeout = B_FALSE;
	}

	ASSERT(second_threshold != 0);

	if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
		/*
		 * Should not hold the zero-copy messages for too long.
		 */
		if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
			tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
			    tcp->tcp_xmit_head, B_TRUE);

		if (dont_timeout) {
			/*
			 * Reset tcp_ms_we_have_waited to avoid overflow since
			 * we are going to retransmit forever.
			 */
			tcp->tcp_ms_we_have_waited = second_threshold;
			goto timer_rexmit;
		}

		/*
		 * For zero window probe, we need to send indefinitely,
		 * unless we have not heard from the other side for some
		 * time...
		 */
		if ((tcp->tcp_zero_win_probe == 0) ||
		    (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) >
		    second_threshold)) {
			TCPS_BUMP_MIB(tcps, tcpTimRetransDrop);
			/*
			 * If TCP is in SYN_RCVD state, send back a
			 * RST|ACK as BSD does.  Note that tcp_zero_win_probe
			 * should be zero in TCPS_SYN_RCVD state.
			 */
			if (tcp->tcp_state == TCPS_SYN_RCVD) {
				tcp_xmit_ctl("tcp_timer: RST sent on timeout "
				    "in SYN_RCVD",
				    tcp, tcp->tcp_snxt,
				    tcp->tcp_rnxt, TH_RST | TH_ACK);
			}
			(void) tcp_clean_death(tcp,
			    tcp->tcp_client_errno ?
			    tcp->tcp_client_errno : ETIMEDOUT);
			return;
		} else {
			/*
			 * If the system is under memory pressure, we also
			 * abort connection in zero window probing.
			 */
			if (tcps->tcps_reclaim) {
				(void) tcp_clean_death(tcp,
				    tcp->tcp_client_errno ?
				    tcp->tcp_client_errno : ETIMEDOUT);
				TCP_STAT(tcps, tcp_zwin_mem_drop);
				return;
			}
			/*
			 * Set tcp_ms_we_have_waited to second_threshold
			 * so that in next timeout, we will do the above
			 * check (ddi_get_lbolt() - tcp_last_recv_time).
			 * This is also to avoid overflow.
			 *
			 * We don't need to decrement tcp_timer_backoff
			 * to avoid overflow because it will be decremented
			 * later if new timeout value is greater than
			 * tcp_rto_max.  In the case when tcp_rto_max is
			 * greater than second_threshold, it means that we
			 * will wait longer than second_threshold to send
			 * the next
			 * window probe.
			 */
			tcp->tcp_ms_we_have_waited = second_threshold;
		}
	} else if (ms > first_threshold) {
		/*
		 * Should not hold the zero-copy messages for too long.
		 */
		if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
			tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
			    tcp->tcp_xmit_head, B_TRUE);

		/*
		 * We have been retransmitting for too long...  The RTT
		 * we calculated is probably incorrect.  Reinitialize it.
		 * Need to compensate for 0 tcp_rtt_sa.  Reset
		 * tcp_rtt_update so that we won't accidentally cache a
		 * bad value.  But only do this if this is not a zero
		 * window probe.
		 */
		if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
			tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
			    (tcp->tcp_rtt_sa >> 5);
			tcp->tcp_rtt_sa = 0;
			tcp_ip_notify(tcp);
			tcp->tcp_rtt_update = 0;
		}
	}

timer_rexmit:
	/* Back off: each pass doubles the interval, capped at tcp_rto_max. */
	tcp->tcp_timer_backoff++;
	if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
	    tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
	    tcp->tcp_rto_min) {
		/*
		 * This means the original RTO is tcp_rexmit_interval_min.
		 * So we will use tcp_rexmit_interval_min as the RTO value
		 * and do the backoff.
		 */
		ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
	} else {
		ms <<= tcp->tcp_timer_backoff;
	}
	if (ms > tcp->tcp_rto_max) {
		ms = tcp->tcp_rto_max;
		/*
		 * ms is at max, decrement tcp_timer_backoff to avoid
		 * overflow.
		 */
		tcp->tcp_timer_backoff--;
	}
	tcp->tcp_ms_we_have_waited += ms;
	if (tcp->tcp_zero_win_probe == 0) {
		tcp->tcp_rto = ms;
	}
	TCP_TIMER_RESTART(tcp, ms);
	/*
	 * This is after a timeout and tcp_rto is backed off.  Set
	 * tcp_set_timer to 1 so that next time RTO is updated, we will
	 * restart the timer with a correct value.
	 */
	tcp->tcp_set_timer = 1;
	/*
	 * Size of the retransmission: the unacknowledged bytes, limited to
	 * one MSS and to a non-zero send window.
	 */
	mss = tcp->tcp_snxt - tcp->tcp_suna;
	if (mss > tcp->tcp_mss)
		mss = tcp->tcp_mss;
	if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
		mss = tcp->tcp_swnd;

	/* Record the (re)send time in b_prev of the transmit list head. */
	if ((mp = tcp->tcp_xmit_head) != NULL)
		mp->b_prev = (mblk_t *)ddi_get_lbolt();
	mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
	    B_TRUE);

	/*
	 * When slow start after retransmission begins, start with
	 * this seq no.  tcp_rexmit_max marks the end of special slow
	 * start phase.  tcp_snd_burst controls how many segments
	 * can be sent because of an ack.
	 */
	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
	tcp->tcp_snd_burst = TCP_CWND_SS;
	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
	    (tcp->tcp_unsent == 0)) {
		tcp->tcp_rexmit_max = tcp->tcp_fss;
	} else {
		tcp->tcp_rexmit_max = tcp->tcp_snxt;
	}
	tcp->tcp_rexmit = B_TRUE;
	tcp->tcp_dupack_cnt = 0;

	/*
	 * Remove all rexmit SACK blk to start from fresh.
	 */
	if (tcp->tcp_snd_sack_ok)
		TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
	/* No segment could be built; the restarted timer will retry. */
	if (mp == NULL) {
		return;
	}

	tcp->tcp_csuna = tcp->tcp_snxt;
	TCPS_BUMP_MIB(tcps, tcpRetransSegs);
	TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
	tcp_send_data(tcp, mp);

}

/*
 * Handle lingering timeouts. This function is called when the SO_LINGER timeout
 * expires.  It records ETIMEDOUT as the client error and stops lingering.
 */
void
tcp_close_linger_timeout(void *arg)
{
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;

	tcp->tcp_client_errno = ETIMEDOUT;
	tcp_stop_lingering(tcp);
}