1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright 2011 Joyent, Inc. All rights reserved. 26 */ 27 28 #include <sys/types.h> 29 #include <sys/strlog.h> 30 #include <sys/strsun.h> 31 #include <sys/squeue_impl.h> 32 #include <sys/squeue.h> 33 #include <sys/callo.h> 34 #include <sys/strsubr.h> 35 36 #include <inet/common.h> 37 #include <inet/ip.h> 38 #include <inet/ip_ire.h> 39 #include <inet/ip_rts.h> 40 #include <inet/tcp.h> 41 #include <inet/tcp_impl.h> 42 43 /* 44 * Implementation of TCP Timers. 45 * ============================= 46 * 47 * INTERFACE: 48 * 49 * There are two basic functions dealing with tcp timers: 50 * 51 * timeout_id_t tcp_timeout(connp, func, time) 52 * clock_t tcp_timeout_cancel(connp, timeout_id) 53 * TCP_TIMER_RESTART(tcp, intvl) 54 * 55 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func' 56 * after 'time' ticks passed. 
The function called by timeout() must adhere to
 * the same restrictions as a driver soft interrupt handler - it must not sleep
 * or call other functions that might sleep. The value returned is the opaque
 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
 * cancel the request. The call to tcp_timeout() may fail, in which case it
 * returns zero. This is different from the timeout(9F) function, which never
 * fails.
 *
 * The call-back function 'func' always receives 'connp' as its single
 * argument. It is always executed in the squeue corresponding to the tcp
 * structure. The tcp structure is guaranteed to be present at the time the
 * call-back is called.
 *
 * NOTE: The call-back function 'func' is never called if tcp is in
 *	the TCPS_CLOSED state.
 *
 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
 * request. Locks acquired by the call-back routine should not be held across
 * the call to tcp_timeout_cancel() or a deadlock may result.
 *
 * tcp_timeout_cancel() returns -1 if the timeout request is invalid.
 * Otherwise, it returns an integer value greater than or equal to 0.
 *
 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
 *	within squeue context corresponding to the tcp instance. Since the
 *	call-back is also called via the same squeue, none of the race
 *	conditions described in the untimeout(9F) manual page can occur: all
 *	calls are strictly serialized.
 *
 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
 * stored in tcp_timer_tid and starts a new one using
 * MSEC_TO_TICK(intvl). It always uses the tcp_timer() function as a call-back
 * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
 * field.
 *
 * IMPLEMENTATION:
 *
 * TCP timers are implemented using a three-stage process.
The call to
 * tcp_timeout() uses the timeout(9F) function to call tcp_timer_callback()
 * when the timer expires. tcp_timer_callback() arranges the call of the
 * tcp_timer_handler() function via the squeue corresponding to the tcp
 * instance. tcp_timer_handler() calls the actual requested timeout call-back
 * and passes connp as an argument to it. Information is passed between
 * stages using the tcp_timer_t structure, which contains the connp pointer,
 * the tcp call-back to call and the timeout id returned by timeout(9F).
 *
 * The tcp_timer_t structure is not used directly; it is embedded in an
 * mblk_t-like structure that is used to enter an squeue. The mp->b_rptr of
 * this pseudo mblk points to the beginning of the tcp_timer_t structure.
 * tcp_timeout() returns the pointer to this mblk.
 *
 * The pseudo mblk is allocated from a special tcp_timercache kmem cache. It
 * looks like a normal mblk without an actual dblk attached to it.
 *
 * To optimize performance each tcp instance holds a small cache of timer
 * mblocks. In the current implementation it caches up to two timer mblocks per
 * tcp instance. The cache is preserved over tcp frees and is only freed when
 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
 * timer processing happens on a corresponding squeue, the cache manipulation
 * does not require any locks. Experiments show that the majority of timer
 * mblock allocations are satisfied from the tcp cache and do not involve kmem
 * calls.
 *
 * The tcp_timeout() places a refhold on the connp instance which guarantees
 * that it will be present at the time the call-back function fires. The
 * tcp_timer_handler() drops the reference after calling the call-back, so the
 * call-back function does not need to manipulate the references explicitly.
122 */ 123 124 kmem_cache_t *tcp_timercache; 125 126 static void tcp_ip_notify(tcp_t *); 127 static void tcp_timer_callback(void *); 128 static void tcp_timer_free(tcp_t *, mblk_t *); 129 static void tcp_timer_handler(void *, mblk_t *, void *, ip_recv_attr_t *); 130 131 /* 132 * tim is in millisec. 133 */ 134 timeout_id_t 135 tcp_timeout(conn_t *connp, void (*f)(void *), hrtime_t tim) 136 { 137 mblk_t *mp; 138 tcp_timer_t *tcpt; 139 tcp_t *tcp = connp->conn_tcp; 140 141 ASSERT(connp->conn_sqp != NULL); 142 143 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls); 144 145 if (tcp->tcp_timercache == NULL) { 146 mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC); 147 } else { 148 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc); 149 mp = tcp->tcp_timercache; 150 tcp->tcp_timercache = mp->b_next; 151 mp->b_next = NULL; 152 ASSERT(mp->b_wptr == NULL); 153 } 154 155 CONN_INC_REF(connp); 156 tcpt = (tcp_timer_t *)mp->b_rptr; 157 tcpt->connp = connp; 158 tcpt->tcpt_proc = f; 159 /* 160 * TCP timers are normal timeouts. Plus, they do not require more than 161 * a 10 millisecond resolution. By choosing a coarser resolution and by 162 * rounding up the expiration to the next resolution boundary, we can 163 * batch timers in the callout subsystem to make TCP timers more 164 * efficient. The roundup also protects short timers from expiring too 165 * early before they have a chance to be cancelled. 
166 */ 167 tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp, 168 tim * MICROSEC, CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); 169 VERIFY(!(tcpt->tcpt_tid & CALLOUT_ID_FREE)); 170 171 return ((timeout_id_t)mp); 172 } 173 174 static void 175 tcp_timer_callback(void *arg) 176 { 177 mblk_t *mp = (mblk_t *)arg; 178 tcp_timer_t *tcpt; 179 conn_t *connp; 180 181 tcpt = (tcp_timer_t *)mp->b_rptr; 182 connp = tcpt->connp; 183 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp, 184 NULL, SQ_FILL, SQTAG_TCP_TIMER); 185 } 186 187 /* ARGSUSED */ 188 static void 189 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 190 { 191 tcp_timer_t *tcpt; 192 conn_t *connp = (conn_t *)arg; 193 tcp_t *tcp = connp->conn_tcp; 194 195 tcpt = (tcp_timer_t *)mp->b_rptr; 196 ASSERT(connp == tcpt->connp); 197 ASSERT((squeue_t *)arg2 == connp->conn_sqp); 198 199 if (tcpt->tcpt_tid & CALLOUT_ID_FREE) { 200 /* 201 * This timeout was cancelled after it was enqueued to the 202 * squeue; free the timer and return. 203 */ 204 tcp_timer_free(connp->conn_tcp, mp); 205 return; 206 } 207 208 /* 209 * If the TCP has reached the closed state, don't proceed any 210 * further. This TCP logically does not exist on the system. 211 * tcpt_proc could for example access queues, that have already 212 * been qprocoff'ed off. 213 */ 214 if (tcp->tcp_state != TCPS_CLOSED) { 215 (*tcpt->tcpt_proc)(connp); 216 } else { 217 tcp->tcp_timer_tid = 0; 218 } 219 220 tcp_timer_free(connp->conn_tcp, mp); 221 } 222 223 /* 224 * There is potential race with untimeout and the handler firing at the same 225 * time. The mblock may be freed by the handler while we are trying to use 226 * it. But since both should execute on the same squeue, this race should not 227 * occur. 
228 */ 229 clock_t 230 tcp_timeout_cancel(conn_t *connp, timeout_id_t id) 231 { 232 mblk_t *mp = (mblk_t *)id; 233 tcp_timer_t *tcpt; 234 clock_t delta; 235 236 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs); 237 238 if (mp == NULL) 239 return (-1); 240 241 tcpt = (tcp_timer_t *)mp->b_rptr; 242 ASSERT(tcpt->connp == connp); 243 244 delta = untimeout_default(tcpt->tcpt_tid, 0); 245 246 if (delta >= 0) { 247 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled); 248 tcp_timer_free(connp->conn_tcp, mp); 249 CONN_DEC_REF(connp); 250 } else { 251 /* 252 * If we were unable to untimeout successfully, it has already 253 * been enqueued on the squeue; mark the ID with the free 254 * bit. This bit can never be set in a valid identifier, and 255 * we'll use it to prevent the timeout from being executed. 256 * And note that we're within the squeue perimeter here, so 257 * we don't need to worry about racing with timer handling 258 * (which also executes within the perimeter). 259 */ 260 tcpt->tcpt_tid |= CALLOUT_ID_FREE; 261 delta = 0; 262 } 263 264 return (TICK_TO_MSEC(delta)); 265 } 266 267 /* 268 * Allocate space for the timer event. The allocation looks like mblk, but it is 269 * not a proper mblk. To avoid confusion we set b_wptr to NULL. 270 * 271 * Dealing with failures: If we can't allocate from the timer cache we try 272 * allocating from dblock caches using allocb_tryhard(). In this case b_wptr 273 * points to b_rptr. 274 * If we can't allocate anything using allocb_tryhard(), we perform a last 275 * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and 276 * save the actual allocation size in b_datap. 
277 */ 278 mblk_t * 279 tcp_timermp_alloc(int kmflags) 280 { 281 mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache, 282 kmflags & ~KM_PANIC); 283 284 if (mp != NULL) { 285 mp->b_next = mp->b_prev = NULL; 286 mp->b_rptr = (uchar_t *)(&mp[1]); 287 mp->b_wptr = NULL; 288 mp->b_datap = NULL; 289 mp->b_queue = NULL; 290 mp->b_cont = NULL; 291 } else if (kmflags & KM_PANIC) { 292 /* 293 * Failed to allocate memory for the timer. Try allocating from 294 * dblock caches. 295 */ 296 /* ipclassifier calls this from a constructor - hence no tcps */ 297 TCP_G_STAT(tcp_timermp_allocfail); 298 mp = allocb_tryhard(sizeof (tcp_timer_t)); 299 if (mp == NULL) { 300 size_t size = 0; 301 /* 302 * Memory is really low. Try tryhard allocation. 303 * 304 * ipclassifier calls this from a constructor - 305 * hence no tcps 306 */ 307 TCP_G_STAT(tcp_timermp_allocdblfail); 308 mp = kmem_alloc_tryhard(sizeof (mblk_t) + 309 sizeof (tcp_timer_t), &size, kmflags); 310 mp->b_rptr = (uchar_t *)(&mp[1]); 311 mp->b_next = mp->b_prev = NULL; 312 mp->b_wptr = (uchar_t *)-1; 313 mp->b_datap = (dblk_t *)size; 314 mp->b_queue = NULL; 315 mp->b_cont = NULL; 316 } 317 ASSERT(mp->b_wptr != NULL); 318 } 319 /* ipclassifier calls this from a constructor - hence no tcps */ 320 TCP_G_DBGSTAT(tcp_timermp_alloced); 321 322 return (mp); 323 } 324 325 /* 326 * Free per-tcp timer cache. 327 * It can only contain entries from tcp_timercache. 328 */ 329 void 330 tcp_timermp_free(tcp_t *tcp) 331 { 332 mblk_t *mp; 333 334 while ((mp = tcp->tcp_timercache) != NULL) { 335 ASSERT(mp->b_wptr == NULL); 336 tcp->tcp_timercache = tcp->tcp_timercache->b_next; 337 kmem_cache_free(tcp_timercache, mp); 338 } 339 } 340 341 /* 342 * Free timer event. Put it on the per-tcp timer cache if there is not too many 343 * events there already (currently at most two events are cached). 344 * If the event is not allocated from the timer cache, free it right away. 
345 */ 346 static void 347 tcp_timer_free(tcp_t *tcp, mblk_t *mp) 348 { 349 mblk_t *mp1 = tcp->tcp_timercache; 350 351 if (mp->b_wptr != NULL) { 352 /* 353 * This allocation is not from a timer cache, free it right 354 * away. 355 */ 356 if (mp->b_wptr != (uchar_t *)-1) 357 freeb(mp); 358 else 359 kmem_free(mp, (size_t)mp->b_datap); 360 } else if (mp1 == NULL || mp1->b_next == NULL) { 361 /* Cache this timer block for future allocations */ 362 mp->b_rptr = (uchar_t *)(&mp[1]); 363 mp->b_next = mp1; 364 tcp->tcp_timercache = mp; 365 } else { 366 kmem_cache_free(tcp_timercache, mp); 367 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed); 368 } 369 } 370 371 /* 372 * Stop all TCP timers. 373 */ 374 void 375 tcp_timers_stop(tcp_t *tcp) 376 { 377 if (tcp->tcp_timer_tid != 0) { 378 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 379 tcp->tcp_timer_tid = 0; 380 } 381 if (tcp->tcp_ka_tid != 0) { 382 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); 383 tcp->tcp_ka_tid = 0; 384 } 385 if (tcp->tcp_ack_tid != 0) { 386 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 387 tcp->tcp_ack_tid = 0; 388 } 389 if (tcp->tcp_push_tid != 0) { 390 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 391 tcp->tcp_push_tid = 0; 392 } 393 if (tcp->tcp_reass_tid != 0) { 394 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid); 395 tcp->tcp_reass_tid = 0; 396 } 397 } 398 399 /* 400 * Timer callback routine for keepalive probe. We do a fake resend of 401 * last ACKed byte. Then set a timer using RTO. When the timer expires, 402 * check to see if we have heard anything from the other end for the last 403 * RTO period. If we have, set the timer to expire for another 404 * tcp_keepalive_intrvl and check again. If we have not, set a timer using 405 * RTO << 1 and check again when it expires. Keep exponentially increasing 406 * the timeout if we have not heard from the other side. 
If for more than 407 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything, 408 * kill the connection unless the keepalive abort threshold is 0. In 409 * that case, we will probe "forever." 410 * If tcp_ka_cnt and tcp_ka_rinterval are non-zero, then we do not follow 411 * the exponential backoff, but send probes tcp_ka_cnt times in regular 412 * intervals of tcp_ka_rinterval milliseconds until we hear back from peer. 413 * Kill the connection if we don't hear back from peer after tcp_ka_cnt 414 * probes are sent. 415 */ 416 void 417 tcp_keepalive_timer(void *arg) 418 { 419 mblk_t *mp; 420 conn_t *connp = (conn_t *)arg; 421 tcp_t *tcp = connp->conn_tcp; 422 int32_t firetime; 423 int32_t idletime; 424 int32_t ka_intrvl; 425 tcp_stack_t *tcps = tcp->tcp_tcps; 426 427 tcp->tcp_ka_tid = 0; 428 429 if (tcp->tcp_fused) 430 return; 431 432 TCPS_BUMP_MIB(tcps, tcpTimKeepalive); 433 ka_intrvl = tcp->tcp_ka_interval; 434 435 /* 436 * Keepalive probe should only be sent if the application has not 437 * done a close on the connection. 438 */ 439 if (tcp->tcp_state > TCPS_CLOSE_WAIT) { 440 return; 441 } 442 /* Timer fired too early, restart it. */ 443 if (tcp->tcp_state < TCPS_ESTABLISHED) { 444 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 445 ka_intrvl); 446 return; 447 } 448 449 idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time); 450 /* 451 * If we have not heard from the other side for a long 452 * time, kill the connection unless the keepalive abort 453 * threshold is 0. In that case, we will probe "forever." 454 */ 455 if (tcp->tcp_ka_abort_thres != 0 && 456 idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) { 457 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveDrop); 458 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ? 459 tcp->tcp_client_errno : ETIMEDOUT); 460 return; 461 } 462 463 if (tcp->tcp_snxt == tcp->tcp_suna && 464 idletime >= ka_intrvl) { 465 /* Fake resend of last ACKed byte. 
*/ 466 mblk_t *mp1 = allocb(1, BPRI_LO); 467 468 if (mp1 != NULL) { 469 *mp1->b_wptr++ = '\0'; 470 mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL, 471 tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE); 472 freeb(mp1); 473 /* 474 * if allocation failed, fall through to start the 475 * timer back. 476 */ 477 if (mp != NULL) { 478 tcp_send_data(tcp, mp); 479 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe); 480 if (tcp->tcp_ka_rinterval) { 481 firetime = tcp->tcp_ka_rinterval; 482 } else if (tcp->tcp_ka_last_intrvl != 0) { 483 int max; 484 /* 485 * We should probe again at least 486 * in ka_intrvl, but not more than 487 * tcp_rto_max. 488 */ 489 max = tcp->tcp_rto_max; 490 firetime = MIN(ka_intrvl - 1, 491 tcp->tcp_ka_last_intrvl << 1); 492 if (firetime > max) 493 firetime = max; 494 } else { 495 firetime = tcp->tcp_rto; 496 } 497 tcp->tcp_ka_tid = TCP_TIMER(tcp, 498 tcp_keepalive_timer, firetime); 499 tcp->tcp_ka_last_intrvl = firetime; 500 return; 501 } 502 } 503 } else { 504 tcp->tcp_ka_last_intrvl = 0; 505 } 506 507 /* firetime can be negative if (mp1 == NULL || mp == NULL) */ 508 if ((firetime = ka_intrvl - idletime) < 0) { 509 firetime = ka_intrvl; 510 } 511 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, firetime); 512 } 513 514 void 515 tcp_reass_timer(void *arg) 516 { 517 conn_t *connp = (conn_t *)arg; 518 tcp_t *tcp = connp->conn_tcp; 519 520 tcp->tcp_reass_tid = 0; 521 if (tcp->tcp_reass_head == NULL) 522 return; 523 ASSERT(tcp->tcp_reass_tail != NULL); 524 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 525 tcp_sack_remove(tcp->tcp_sack_list, 526 TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk); 527 } 528 tcp_close_mpp(&tcp->tcp_reass_head); 529 tcp->tcp_reass_tail = NULL; 530 TCP_STAT(tcp->tcp_tcps, tcp_reass_timeout); 531 } 532 533 /* This function handles the push timeout. 
*/ 534 void 535 tcp_push_timer(void *arg) 536 { 537 conn_t *connp = (conn_t *)arg; 538 tcp_t *tcp = connp->conn_tcp; 539 540 TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt); 541 542 ASSERT(tcp->tcp_listener == NULL); 543 544 ASSERT(!IPCL_IS_NONSTR(connp)); 545 546 tcp->tcp_push_tid = 0; 547 548 if (tcp->tcp_rcv_list != NULL && 549 tcp_rcv_drain(tcp) == TH_ACK_NEEDED) 550 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 551 } 552 553 /* 554 * This function handles delayed ACK timeout. 555 */ 556 void 557 tcp_ack_timer(void *arg) 558 { 559 conn_t *connp = (conn_t *)arg; 560 tcp_t *tcp = connp->conn_tcp; 561 mblk_t *mp; 562 tcp_stack_t *tcps = tcp->tcp_tcps; 563 564 TCP_DBGSTAT(tcps, tcp_ack_timer_cnt); 565 566 tcp->tcp_ack_tid = 0; 567 568 if (tcp->tcp_fused) 569 return; 570 571 /* 572 * Do not send ACK if there is no outstanding unack'ed data. 573 */ 574 if (tcp->tcp_rnxt == tcp->tcp_rack) { 575 return; 576 } 577 578 if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) { 579 /* 580 * Make sure we don't allow deferred ACKs to result in 581 * timer-based ACKing. If we have held off an ACK 582 * when there was more than an mss here, and the timer 583 * goes off, we have to worry about the possibility 584 * that the sender isn't doing slow-start, or is out 585 * of step with us for some other reason. We fall 586 * permanently back in the direction of 587 * ACK-every-other-packet as suggested in RFC 1122. 588 */ 589 if (tcp->tcp_rack_abs_max > 2) 590 tcp->tcp_rack_abs_max--; 591 tcp->tcp_rack_cur_max = 2; 592 } 593 mp = tcp_ack_mp(tcp); 594 595 if (mp != NULL) { 596 BUMP_LOCAL(tcp->tcp_obsegs); 597 TCPS_BUMP_MIB(tcps, tcpOutAck); 598 TCPS_BUMP_MIB(tcps, tcpOutAckDelayed); 599 tcp_send_data(tcp, mp); 600 } 601 } 602 603 /* 604 * Notify IP that we are having trouble with this connection. IP should 605 * make note so it can potentially use a different IRE. 
606 */ 607 static void 608 tcp_ip_notify(tcp_t *tcp) 609 { 610 conn_t *connp = tcp->tcp_connp; 611 ire_t *ire; 612 613 /* 614 * Note: in the case of source routing we want to blow away the 615 * route to the first source route hop. 616 */ 617 ire = connp->conn_ixa->ixa_ire; 618 if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 619 if (ire->ire_ipversion == IPV4_VERSION) { 620 /* 621 * As per RFC 1122, we send an RTM_LOSING to inform 622 * routing protocols. 623 */ 624 ip_rts_change(RTM_LOSING, ire->ire_addr, 625 ire->ire_gateway_addr, ire->ire_mask, 626 connp->conn_laddr_v4, 0, 0, 0, 627 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), 628 ire->ire_ipst); 629 } 630 (void) ire_no_good(ire); 631 } 632 } 633 634 /* 635 * tcp_timer is the timer service routine. It handles the retransmission, 636 * FIN_WAIT_2 flush, and zero window probe timeout events. It figures out 637 * from the state of the tcp instance what kind of action needs to be done 638 * at the time it is called. 
*/
void
tcp_timer(void *arg)
{
	mblk_t		*mp;
	/* Waited-time limits (ms) compared against tcp_ms_we_have_waited. */
	clock_t		first_threshold;
	clock_t		second_threshold;
	/* Scratch: time waited so far, later the new backed-off interval. */
	clock_t		ms;
	/* Despite the name: number of bytes to retransmit, capped below. */
	uint32_t	mss;
	conn_t		*connp = (conn_t *)arg;
	tcp_t		*tcp = connp->conn_tcp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	/* Set when exceeding second_threshold must not abort the connection. */
	boolean_t	dont_timeout = B_FALSE;

	/* The timer that brought us here has fired; forget its id. */
	tcp->tcp_timer_tid = 0;

	if (tcp->tcp_fused)
		return;

	first_threshold =  tcp->tcp_first_timer_threshold;
	second_threshold = tcp->tcp_second_timer_threshold;

	/*
	 * Per-state dispatch. A case that 'break's out of the switch goes on
	 * to the threshold checks and the retransmission code below; a case
	 * that 'return's has either handled the event or has nothing to do.
	 */
	switch (tcp->tcp_state) {
	case TCPS_IDLE:
	case TCPS_BOUND:
	case TCPS_LISTEN:
		return;
	case TCPS_SYN_RCVD: {
		tcp_t	*listener = tcp->tcp_listener;

		if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
			/* it's our first timeout */
			tcp->tcp_syn_rcvd_timeout = 1;
			mutex_enter(&listener->tcp_eager_lock);
			listener->tcp_syn_rcvd_timeout++;
			if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) {
				/*
				 * Make this eager available for drop if we
				 * need to drop one to accommodate a new
				 * incoming SYN request.
				 */
				MAKE_DROPPABLE(listener, tcp);
			}
			if (!listener->tcp_syn_defense &&
			    (listener->tcp_syn_rcvd_timeout >
			    (tcps->tcps_conn_req_max_q0 >> 2)) &&
			    (tcps->tcps_conn_req_max_q0 > 200)) {
				/* We may be under attack. Put on a defense. */
				listener->tcp_syn_defense = B_TRUE;
				cmn_err(CE_WARN, "High TCP connect timeout "
				    "rate! System (port %d) may be under a "
				    "SYN flood attack!",
				    ntohs(listener->tcp_connp->conn_lport));

				/*
				 * KM_NOSLEEP: the result may be NULL and is
				 * not checked here.
				 */
				listener->tcp_ip_addr_cache = kmem_zalloc(
				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
				    KM_NOSLEEP);
			}
			mutex_exit(&listener->tcp_eager_lock);
		} else if (listener != NULL) {
			mutex_enter(&listener->tcp_eager_lock);
			tcp->tcp_syn_rcvd_timeout++;
			if (tcp->tcp_syn_rcvd_timeout > 1 &&
			    !tcp->tcp_closemp_used) {
				/*
				 * This is our second timeout. Put the tcp in
				 * the list of droppable eagers to allow it to
				 * be dropped, if needed. We don't check
				 * whether tcp_dontdrop is set or not to
				 * protect ourselves from a SYN attack where a
				 * remote host can spoof itself as one of the
				 * good IP source and continue to hold
				 * resources too long.
				 */
				MAKE_DROPPABLE(listener, tcp);
			}
			mutex_exit(&listener->tcp_eager_lock);
		}
	}
		/* FALLTHRU */
	case TCPS_SYN_SENT:
		/* Connection setup uses its own pair of thresholds. */
		first_threshold =  tcp->tcp_first_ctimer_threshold;
		second_threshold = tcp->tcp_second_ctimer_threshold;

		/*
		 * If an app has set the second_threshold to 0, it means that
		 * we need to retransmit forever, unless this is a passive
		 * open.  We need to set second_threshold back to a normal
		 * value such that later comparison with it still makes
		 * sense.  But we set dont_timeout to B_TRUE so that we will
		 * never time out.
		 */
		if (second_threshold == 0) {
			second_threshold = tcps->tcps_ip_abort_linterval;
			if (tcp->tcp_active_open)
				dont_timeout = B_TRUE;
		}
		break;
	case TCPS_ESTABLISHED:
	case TCPS_CLOSE_WAIT:
		/*
		 * If the end point has not been closed, TCP can retransmit
		 * forever.  But if the end point is closed, the normal
		 * timeout applies.
		 */
		if (second_threshold == 0) {
			second_threshold = tcps->tcps_ip_abort_linterval;
			dont_timeout = B_TRUE;
		}
		/* FALLTHRU */
	case TCPS_FIN_WAIT_1:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
		/* If we have data to rexmit */
		if (tcp->tcp_suna != tcp->tcp_snxt) {
			clock_t	time_to_wait;

			TCPS_BUMP_MIB(tcps, tcpTimRetrans);
			if (!tcp->tcp_xmit_head)
				break;
			/*
			 * b_prev of the head of the transmit list holds an
			 * lbolt timestamp (it is stamped further down, just
			 * before retransmitting). Compute how much of the
			 * RTO is still left, in milliseconds.
			 */
			time_to_wait = ddi_get_lbolt() -
			    (clock_t)tcp->tcp_xmit_head->b_prev;
			time_to_wait = tcp->tcp_rto -
			    TICK_TO_MSEC(time_to_wait);
			/*
			 * If the timer fires too early, 1 clock tick earlier,
			 * restart the timer.
			 */
			if (time_to_wait > msec_per_tick) {
				TCP_STAT(tcps, tcp_timer_fire_early);
				TCP_TIMER_RESTART(tcp, time_to_wait);
				return;
			}
			/*
			 * When we probe zero windows, we force the swnd open.
			 * If our peer acks with a closed window swnd will be
			 * set to zero by tcp_rput(). As long as we are
			 * receiving acks tcp_rput will
			 * reset 'tcp_ms_we_have_waited' so as not to trip the
			 * first and second interval actions.  NOTE: the timer
			 * interval is allowed to continue its exponential
			 * backoff.
			 */
			if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_TRACE, "tcp_timer: zero win");
				}
			} else {
				/*
				 * After retransmission, we need to do
				 * slow start.  Set the ssthresh to one
				 * half of current effective window and
				 * cwnd to one MSS.  Also reset
				 * tcp_cwnd_cnt.
				 *
				 * Note that if tcp_ssthresh is reduced because
				 * of ECN, do not reduce it again unless it is
				 * already one window of data away (tcp_cwr
				 * should then be cleared) or this is a
				 * timeout for a retransmitted segment.
				 */
				uint32_t npkt;

				if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
					npkt = ((tcp->tcp_timer_backoff ?
					    tcp->tcp_cwnd_ssthresh :
					    tcp->tcp_snxt -
					    tcp->tcp_suna) >> 1) / tcp->tcp_mss;
					tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
					    tcp->tcp_mss;
				}
				tcp->tcp_cwnd = tcp->tcp_mss;
				tcp->tcp_cwnd_cnt = 0;
				if (tcp->tcp_ecn_ok) {
					tcp->tcp_cwr = B_TRUE;
					tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
					tcp->tcp_ecn_cwr_sent = B_FALSE;
				}
			}
			break;
		}
		/*
		 * We have something to send yet we cannot send.  The
		 * reason can be:
		 *
		 * 1. Zero send window: we need to do zero window probe.
		 * 2. Zero cwnd: because of ECN, we need to "clock out"
		 * segments.
		 * 3. SWS avoidance: receiver may have shrunk window,
		 * reset our knowledge.
		 *
		 * Note that condition 2 can happen with either 1 or
		 * 3.  But 1 and 3 are exclusive.
		 */
		if (tcp->tcp_unsent != 0) {
			/*
			 * Should not hold the zero-copy messages for too long.
			 */
			if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
				tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
				    tcp->tcp_xmit_head, B_TRUE);

			if (tcp->tcp_cwnd == 0) {
				/*
				 * Set tcp_cwnd to 1 MSS so that a
				 * new segment can be sent out.  We
				 * are "clocking out" new data when
				 * the network is really congested.
				 */
				ASSERT(tcp->tcp_ecn_ok);
				tcp->tcp_cwnd = tcp->tcp_mss;
			}
			if (tcp->tcp_swnd == 0) {
				/* Extend window for zero window probe */
				tcp->tcp_swnd++;
				tcp->tcp_zero_win_probe = B_TRUE;
				TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
			} else {
				/*
				 * Handle timeout from sender SWS avoidance.
				 * Reset our knowledge of the max send window
				 * since the receiver might have reduced its
				 * receive buffer.  Avoid setting tcp_max_swnd
				 * to one since that will essentially disable
				 * the SWS checks.
				 *
				 * Note that since we don't have a SWS
				 * state variable, if the timeout is set
				 * for ECN but not for SWS, this
				 * code will also be executed.  This is
				 * fine as tcp_max_swnd is updated
				 * constantly and it will not affect
				 * anything.
				 */
				tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
			}
			tcp_wput_data(tcp, NULL, B_FALSE);
			return;
		}
		/* Is there a FIN that needs to be retransmitted? */
		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
		    !tcp->tcp_fin_acked)
			break;
		/* Nothing to do, return without restarting timer. */
		TCP_STAT(tcps, tcp_timer_fire_miss);
		return;
	case TCPS_FIN_WAIT_2:
		/*
		 * User closed the TCP endpoint and peer ACK'ed our FIN.
		 * We waited some time for peer's FIN, but it hasn't
		 * arrived.  We flush the connection now to avoid
		 * case where the peer has rebooted.
		 */
		if (TCP_IS_DETACHED(tcp)) {
			(void) tcp_clean_death(tcp, 0);
		} else {
			TCP_TIMER_RESTART(tcp,
			    tcp->tcp_fin_wait_2_flush_interval);
		}
		return;
	case TCPS_TIME_WAIT:
		(void) tcp_clean_death(tcp, 0);
		return;
	default:
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
			    "tcp_timer: strange state (%d) %s",
			    tcp->tcp_state, tcp_display(tcp, NULL,
			    DISP_PORT_ONLY));
		}
		return;
	}

	/*
	 * If the system is under memory pressure or the max number of
	 * connections have been established for the listener, be more
	 * aggressive in aborting connections.
	 */
	if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL &&
	    tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) {
		second_threshold = tcp_early_abort * SECONDS;

		/* We will ignore the never timeout promise in this case... */
		dont_timeout = B_FALSE;
	}

	ASSERT(second_threshold != 0);

	/*
	 * Compare the time already spent retransmitting against the two
	 * thresholds: past the second one the connection is normally
	 * aborted, past the first one the RTT estimate is reset and IP is
	 * notified.
	 */
	if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
		/*
		 * Should not hold the zero-copy messages for too long.
		 */
		if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
			tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
			    tcp->tcp_xmit_head, B_TRUE);

		if (dont_timeout) {
			/*
			 * Reset tcp_ms_we_have_waited to avoid overflow since
			 * we are going to retransmit forever.
			 */
			tcp->tcp_ms_we_have_waited = second_threshold;
			goto timer_rexmit;
		}

		/*
		 * For zero window probe, we need to send indefinitely,
		 * unless we have not heard from the other side for some
		 * time...
		 */
		if ((tcp->tcp_zero_win_probe == 0) ||
		    (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) >
		    second_threshold)) {
			TCPS_BUMP_MIB(tcps, tcpTimRetransDrop);
			/*
			 * If TCP is in SYN_RCVD state, send back a
			 * RST|ACK as BSD does.  Note that tcp_zero_win_probe
			 * should be zero in TCPS_SYN_RCVD state.
			 */
			if (tcp->tcp_state == TCPS_SYN_RCVD) {
				tcp_xmit_ctl("tcp_timer: RST sent on timeout "
				    "in SYN_RCVD",
				    tcp, tcp->tcp_snxt,
				    tcp->tcp_rnxt, TH_RST | TH_ACK);
			}
			/* Report the client's errno if set, else ETIMEDOUT. */
			(void) tcp_clean_death(tcp,
			    tcp->tcp_client_errno ?
			    tcp->tcp_client_errno : ETIMEDOUT);
			return;
		} else {
			/*
			 * If the system is under memory pressure, we also
			 * abort connection in zero window probing.
			 */
			if (tcps->tcps_reclaim) {
				(void) tcp_clean_death(tcp,
				    tcp->tcp_client_errno ?
				    tcp->tcp_client_errno : ETIMEDOUT);
				TCP_STAT(tcps, tcp_zwin_mem_drop);
				return;
			}
			/*
			 * Set tcp_ms_we_have_waited to second_threshold
			 * so that in next timeout, we will do the above
			 * check (ddi_get_lbolt() - tcp_last_recv_time).
			 * This is also to avoid overflow.
			 *
			 * We don't need to decrement tcp_timer_backoff
			 * to avoid overflow because it will be decremented
			 * later if new timeout value is greater than
			 * tcp_rto_max.  In the case when tcp_rto_max is
			 * greater than second_threshold, it means that we
			 * will wait longer than second_threshold to send
			 * the next
			 * window probe.
			 */
			tcp->tcp_ms_we_have_waited = second_threshold;
		}
	} else if (ms > first_threshold) {
		/*
		 * Should not hold the zero-copy messages for too long.
		 */
		if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
			tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
			    tcp->tcp_xmit_head, B_TRUE);

		/*
		 * We have been retransmitting for too long...  The RTT
		 * we calculated is probably incorrect.  Reinitialize it.
		 * Need to compensate for 0 tcp_rtt_sa.  Reset
		 * tcp_rtt_update so that we won't accidentally cache a
		 * bad value.  But only do this if this is not a zero
		 * window probe.
		 */
		if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
			tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
			    (tcp->tcp_rtt_sa >> 5);
			tcp->tcp_rtt_sa = 0;
			tcp_ip_notify(tcp);
			tcp->tcp_rtt_update = 0;
		}
	}

timer_rexmit:
	/*
	 * Compute the next interval: the base value derived from the RTT
	 * estimators (or tcp_rto_min if that is larger), shifted left by the
	 * backoff count and capped at tcp_rto_max.
	 */
	tcp->tcp_timer_backoff++;
	if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
	    tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
	    tcp->tcp_rto_min) {
		/*
		 * This means the original RTO is tcp_rexmit_interval_min.
		 * So we will use tcp_rexmit_interval_min as the RTO value
		 * and do the backoff.
		 */
		ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
	} else {
		ms <<= tcp->tcp_timer_backoff;
	}
	if (ms > tcp->tcp_rto_max) {
		ms = tcp->tcp_rto_max;
		/*
		 * ms is at max, decrement tcp_timer_backoff to avoid
		 * overflow.
		 */
		tcp->tcp_timer_backoff--;
	}
	tcp->tcp_ms_we_have_waited += ms;
	/* While probing a zero window, tcp_rto itself is left untouched. */
	if (tcp->tcp_zero_win_probe == 0) {
		tcp->tcp_rto = ms;
	}
	TCP_TIMER_RESTART(tcp, ms);
	/*
	 * This is after a timeout and tcp_rto is backed off.  Set
	 * tcp_set_timer to 1 so that next time RTO is updated, we will
	 * restart the timer with a correct value.
	 */
	tcp->tcp_set_timer = 1;
	/*
	 * Retransmit the unacknowledged bytes, but no more than one MSS
	 * and, when the send window is non-zero, no more than that window.
	 */
	mss = tcp->tcp_snxt - tcp->tcp_suna;
	if (mss > tcp->tcp_mss)
		mss = tcp->tcp_mss;
	if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
		mss = tcp->tcp_swnd;

	/* Stamp the head of the transmit list with the current lbolt. */
	if ((mp = tcp->tcp_xmit_head) != NULL)
		mp->b_prev = (mblk_t *)ddi_get_lbolt();
	mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
	    B_TRUE);

	/*
	 * When slow start after retransmission begins, start with
	 * this seq no.  tcp_rexmit_max marks the end of special slow
	 * start phase.  tcp_snd_burst controls how many segments
	 * can be sent because of an ack.
	 */
	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
	tcp->tcp_snd_burst = TCP_CWND_SS;
	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
	    (tcp->tcp_unsent == 0)) {
		tcp->tcp_rexmit_max = tcp->tcp_fss;
	} else {
		tcp->tcp_rexmit_max = tcp->tcp_snxt;
	}
	tcp->tcp_rexmit = B_TRUE;
	tcp->tcp_dupack_cnt = 0;

	/*
	 * Remove all rexmit SACK blk to start from fresh.
	 */
	if (tcp->tcp_snd_sack_ok)
		TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
	/*
	 * No segment could be built; the state above has been updated and
	 * the timer is already re-armed, so just return.
	 */
	if (mp == NULL) {
		return;
	}

	tcp->tcp_csuna = tcp->tcp_snxt;
	TCPS_BUMP_MIB(tcps, tcpRetransSegs);
	TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
	tcp_send_data(tcp, mp);

}

/*
 * Handle lingering timeouts. This function is called when the SO_LINGER timeout
 * expires. It records ETIMEDOUT as the client errno of the tcp and then calls
 * tcp_stop_lingering() on it. 'arg' is the conn_t of the connection.
 */
void
tcp_close_linger_timeout(void *arg)
{
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;

	tcp->tcp_client_errno = ETIMEDOUT;
	tcp_stop_lingering(tcp);
}