1 /* 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 30 * $FreeBSD$ 31 */ 32 33 #include "opt_inet6.h" 34 #include "opt_tcpdebug.h" 35 36 #include <sys/param.h> 37 #include <sys/kernel.h> 38 #include <sys/lock.h> 39 #include <sys/mbuf.h> 40 #include <sys/mutex.h> 41 #include <sys/protosw.h> 42 #include <sys/socket.h> 43 #include <sys/socketvar.h> 44 #include <sys/sysctl.h> 45 #include <sys/systm.h> 46 47 #include <net/route.h> 48 49 #include <netinet/in.h> 50 #include <netinet/in_pcb.h> 51 #include <netinet/in_systm.h> 52 #ifdef INET6 53 #include <netinet6/in6_pcb.h> 54 #endif 55 #include <netinet/ip_var.h> 56 #include <netinet/tcp.h> 57 #include <netinet/tcp_fsm.h> 58 #include <netinet/tcp_timer.h> 59 #include <netinet/tcp_var.h> 60 #include <netinet/tcpip.h> 61 #ifdef TCPDEBUG 62 #include <netinet/tcp_debug.h> 63 #endif 64 65 static int 66 sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) 67 { 68 int error, s, tt; 69 70 tt = *(int *)oidp->oid_arg1; 71 s = (int)((int64_t)tt * 1000 / hz); 72 73 error = sysctl_handle_int(oidp, &s, 0, req); 74 if (error || !req->newptr) 75 return (error); 76 77 tt = (int)((int64_t)s * hz / 1000); 78 if (tt < 1) 79 return (EINVAL); 80 81 *(int *)oidp->oid_arg1 = tt; 82 return (0); 83 } 84 85 int tcp_keepinit; 86 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 87 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", ""); 88 89 int tcp_keepidle; 90 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 91 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", ""); 92 93 int tcp_keepintvl; 94 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 95 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", ""); 96 97 int tcp_delacktime; 98 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, 99 CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 100 "Time before a delayed ACK is sent"); 101 102 int tcp_msl; 103 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 104 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 105 106 int tcp_rexmit_min; 107 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 108 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); 109 110 int tcp_rexmit_slop; 111 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 112 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); 113 114 static int always_keepalive = 1; 115 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 116 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 117 118 static int tcp_keepcnt = TCPTV_KEEPCNT; 119 /* max idle probes */ 120 int tcp_maxpersistidle; 121 /* max idle time in persist */ 122 int tcp_maxidle; 123 124 /* 125 * Tcp protocol timeout routine called every 500 ms. 126 * Updates timestamps used for TCP 127 * causes finite state machine actions if timers expire. 128 */ 129 void 130 tcp_slowtimo() 131 { 132 int s; 133 134 s = splnet(); 135 tcp_maxidle = tcp_keepcnt * tcp_keepintvl; 136 splx(s); 137 INP_INFO_WLOCK(&tcbinfo); 138 (void) tcp_timer_2msl_tw(0); 139 INP_INFO_WUNLOCK(&tcbinfo); 140 } 141 142 /* 143 * Cancel all timers for TCP tp. 144 */ 145 void 146 tcp_canceltimers(tp) 147 struct tcpcb *tp; 148 { 149 callout_stop(tp->tt_2msl); 150 callout_stop(tp->tt_persist); 151 callout_stop(tp->tt_keep); 152 callout_stop(tp->tt_rexmt); 153 } 154 155 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 156 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 157 158 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 159 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 160 161 static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 162 163 /* 164 * TCP timer processing. 165 */ 166 167 void 168 tcp_timer_delack(xtp) 169 void *xtp; 170 { 171 struct tcpcb *tp = xtp; 172 int s; 173 struct inpcb *inp; 174 175 s = splnet(); 176 INP_INFO_RLOCK(&tcbinfo); 177 inp = tp->t_inpcb; 178 if (!inp) { 179 INP_INFO_RUNLOCK(&tcbinfo); 180 splx(s); 181 return; 182 } 183 INP_LOCK(inp); 184 INP_INFO_RUNLOCK(&tcbinfo); 185 if (callout_pending(tp->tt_delack) || !callout_active(tp->tt_delack)) { 186 INP_UNLOCK(inp); 187 splx(s); 188 return; 189 } 190 callout_deactivate(tp->tt_delack); 191 192 tp->t_flags |= TF_ACKNOW; 193 tcpstat.tcps_delack++; 194 (void) tcp_output(tp); 195 INP_UNLOCK(inp); 196 splx(s); 197 } 198 199 void 200 tcp_timer_2msl(xtp) 201 void *xtp; 202 { 203 struct tcpcb *tp = xtp; 204 int s; 205 struct inpcb *inp; 206 #ifdef TCPDEBUG 207 int ostate; 208 209 ostate = tp->t_state; 210 #endif 211 s = splnet(); 212 INP_INFO_WLOCK(&tcbinfo); 213 inp = tp->t_inpcb; 214 if (!inp) { 215 INP_INFO_WUNLOCK(&tcbinfo); 216 splx(s); 217 return; 218 } 219 INP_LOCK(inp); 220 if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) { 221 INP_UNLOCK(tp->t_inpcb); 222 INP_INFO_WUNLOCK(&tcbinfo); 223 splx(s); 224 return; 225 } 226 callout_deactivate(tp->tt_2msl); 227 /* 228 * 2 MSL timeout in shutdown went off. If we're closed but 229 * still waiting for peer to close and connection has been idle 230 * too long, or if 2MSL time is up from TIME_WAIT, delete connection 231 * control block. Otherwise, check again in a bit. 232 */ 233 if (tp->t_state != TCPS_TIME_WAIT && 234 (ticks - tp->t_rcvtime) <= tcp_maxidle) 235 callout_reset(tp->tt_2msl, tcp_keepintvl, 236 tcp_timer_2msl, tp); 237 else 238 tp = tcp_close(tp); 239 240 #ifdef TCPDEBUG 241 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 242 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 243 PRU_SLOWTIMO); 244 #endif 245 if (tp) 246 INP_UNLOCK(inp); 247 INP_INFO_WUNLOCK(&tcbinfo); 248 splx(s); 249 } 250 251 struct twlist { 252 LIST_HEAD(, tcptw) tw_list; 253 struct tcptw tw_tail; 254 }; 255 #define TWLIST_NLISTS 2 256 static struct twlist twl_2msl[TWLIST_NLISTS]; 257 static struct twlist *tw_2msl_list[] = { &twl_2msl[0], &twl_2msl[1], NULL }; 258 259 void 260 tcp_timer_init(void) 261 { 262 int i; 263 struct twlist *twl; 264 265 for (i = 0; i < TWLIST_NLISTS; i++) { 266 twl = &twl_2msl[i]; 267 LIST_INIT(&twl->tw_list); 268 LIST_INSERT_HEAD(&twl->tw_list, &twl->tw_tail, tw_2msl); 269 } 270 } 271 272 void 273 tcp_timer_2msl_reset(struct tcptw *tw, int timeo) 274 { 275 int i; 276 struct tcptw *tw_tail; 277 278 if (tw->tw_time != 0) 279 LIST_REMOVE(tw, tw_2msl); 280 tw->tw_time = timeo + ticks; 281 i = timeo > tcp_msl ? 1 : 0; 282 tw_tail = &twl_2msl[i].tw_tail; 283 LIST_INSERT_BEFORE(tw_tail, tw, tw_2msl); 284 } 285 286 void 287 tcp_timer_2msl_stop(struct tcptw *tw) 288 { 289 290 if (tw->tw_time != 0) 291 LIST_REMOVE(tw, tw_2msl); 292 } 293 294 struct tcptw * 295 tcp_timer_2msl_tw(int reuse) 296 { 297 struct tcptw *tw, *tw_tail; 298 struct twlist *twl; 299 int i; 300 301 for (i = 0; i < 2; i++) { 302 twl = tw_2msl_list[i]; 303 tw_tail = &twl->tw_tail; 304 for (;;) { 305 tw = LIST_FIRST(&twl->tw_list); 306 if (tw == tw_tail || (!reuse && tw->tw_time > ticks)) 307 break; 308 INP_LOCK(tw->tw_inpcb); 309 if (tcp_twclose(tw, reuse) != NULL) 310 return (tw); 311 } 312 } 313 return (NULL); 314 } 315 316 void 317 tcp_timer_keep(xtp) 318 void *xtp; 319 { 320 struct tcpcb *tp = xtp; 321 struct tcptemp *t_template; 322 int s; 323 struct inpcb *inp; 324 #ifdef TCPDEBUG 325 int ostate; 326 327 ostate = tp->t_state; 328 #endif 329 s = splnet(); 330 INP_INFO_WLOCK(&tcbinfo); 331 inp = tp->t_inpcb; 332 if (!inp) { 333 INP_INFO_WUNLOCK(&tcbinfo); 334 splx(s); 335 return; 336 } 337 INP_LOCK(inp); 338 if (callout_pending(tp->tt_keep) || !callout_active(tp->tt_keep)) { 339 INP_UNLOCK(inp); 340 INP_INFO_WUNLOCK(&tcbinfo); 341 splx(s); 342 return; 343 } 344 callout_deactivate(tp->tt_keep); 345 /* 346 * Keep-alive timer went off; send something 347 * or drop connection if idle for too long. 348 */ 349 tcpstat.tcps_keeptimeo++; 350 if (tp->t_state < TCPS_ESTABLISHED) 351 goto dropit; 352 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 353 tp->t_state <= TCPS_CLOSING) { 354 if ((ticks - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle) 355 goto dropit; 356 /* 357 * Send a packet designed to force a response 358 * if the peer is up and reachable: 359 * either an ACK if the connection is still alive, 360 * or an RST if the peer has closed the connection 361 * due to timeout or reboot. 362 * Using sequence number tp->snd_una-1 363 * causes the transmitted zero-length segment 364 * to lie outside the receive window; 365 * by the protocol spec, this requires the 366 * correspondent TCP to respond. 367 */ 368 tcpstat.tcps_keepprobe++; 369 t_template = tcpip_maketemplate(inp); 370 if (t_template) { 371 tcp_respond(tp, t_template->tt_ipgen, 372 &t_template->tt_t, (struct mbuf *)NULL, 373 tp->rcv_nxt, tp->snd_una - 1, 0); 374 (void) m_free(dtom(t_template)); 375 } 376 callout_reset(tp->tt_keep, tcp_keepintvl, tcp_timer_keep, tp); 377 } else 378 callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); 379 380 #ifdef TCPDEBUG 381 if (inp->inp_socket->so_options & SO_DEBUG) 382 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 383 PRU_SLOWTIMO); 384 #endif 385 INP_UNLOCK(inp); 386 INP_INFO_WUNLOCK(&tcbinfo); 387 splx(s); 388 return; 389 390 dropit: 391 tcpstat.tcps_keepdrops++; 392 tp = tcp_drop(tp, ETIMEDOUT); 393 394 #ifdef TCPDEBUG 395 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 396 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 397 PRU_SLOWTIMO); 398 #endif 399 if (tp) 400 INP_UNLOCK(tp->t_inpcb); 401 INP_INFO_WUNLOCK(&tcbinfo); 402 splx(s); 403 } 404 405 void 406 tcp_timer_persist(xtp) 407 void *xtp; 408 { 409 struct tcpcb *tp = xtp; 410 int s; 411 struct inpcb *inp; 412 #ifdef TCPDEBUG 413 int ostate; 414 415 ostate = tp->t_state; 416 #endif 417 s = splnet(); 418 INP_INFO_WLOCK(&tcbinfo); 419 inp = tp->t_inpcb; 420 if (!inp) { 421 INP_INFO_WUNLOCK(&tcbinfo); 422 splx(s); 423 return; 424 } 425 INP_LOCK(inp); 426 if (callout_pending(tp->tt_persist) || !callout_active(tp->tt_persist)){ 427 INP_UNLOCK(inp); 428 INP_INFO_WUNLOCK(&tcbinfo); 429 splx(s); 430 return; 431 } 432 callout_deactivate(tp->tt_persist); 433 /* 434 * Persistance timer into zero window. 435 * Force a byte to be output, if possible. 436 */ 437 tcpstat.tcps_persisttimeo++; 438 /* 439 * Hack: if the peer is dead/unreachable, we do not 440 * time out if the window is closed. After a full 441 * backoff, drop the connection if the idle time 442 * (no responses to probes) reaches the maximum 443 * backoff that we would use if retransmitting. 444 */ 445 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 446 ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle || 447 (ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 448 tcpstat.tcps_persistdrop++; 449 tp = tcp_drop(tp, ETIMEDOUT); 450 goto out; 451 } 452 tcp_setpersist(tp); 453 tp->t_force = 1; 454 (void) tcp_output(tp); 455 tp->t_force = 0; 456 457 out: 458 #ifdef TCPDEBUG 459 if (tp && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 460 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 461 PRU_SLOWTIMO); 462 #endif 463 if (tp) 464 INP_UNLOCK(inp); 465 INP_INFO_WUNLOCK(&tcbinfo); 466 splx(s); 467 } 468 469 void 470 tcp_timer_rexmt(xtp) 471 void *xtp; 472 { 473 struct tcpcb *tp = xtp; 474 int s; 475 int rexmt; 476 int headlocked; 477 struct inpcb *inp; 478 #ifdef TCPDEBUG 479 int ostate; 480 481 ostate = tp->t_state; 482 #endif 483 s = splnet(); 484 INP_INFO_WLOCK(&tcbinfo); 485 headlocked = 1; 486 inp = tp->t_inpcb; 487 if (!inp) { 488 INP_INFO_WUNLOCK(&tcbinfo); 489 splx(s); 490 return; 491 } 492 INP_LOCK(inp); 493 if (callout_pending(tp->tt_rexmt) || !callout_active(tp->tt_rexmt)) { 494 INP_UNLOCK(inp); 495 INP_INFO_WUNLOCK(&tcbinfo); 496 splx(s); 497 return; 498 } 499 callout_deactivate(tp->tt_rexmt); 500 /* 501 * Retransmission timer went off. Message has not 502 * been acked within retransmit interval. Back off 503 * to a longer retransmit interval and retransmit one segment. 504 */ 505 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 506 tp->t_rxtshift = TCP_MAXRXTSHIFT; 507 tcpstat.tcps_timeoutdrop++; 508 tp = tcp_drop(tp, tp->t_softerror ? 509 tp->t_softerror : ETIMEDOUT); 510 goto out; 511 } 512 INP_INFO_WUNLOCK(&tcbinfo); 513 headlocked = 0; 514 if (tp->t_rxtshift == 1) { 515 /* 516 * first retransmit; record ssthresh and cwnd so they can 517 * be recovered if this turns out to be a "bad" retransmit. 518 * A retransmit is considered "bad" if an ACK for this 519 * segment is received within RTT/2 interval; the assumption 520 * here is that the ACK was already in flight. See 521 * "On Estimating End-to-End Network Path Properties" by 522 * Allman and Paxson for more details. 523 */ 524 tp->snd_cwnd_prev = tp->snd_cwnd; 525 tp->snd_ssthresh_prev = tp->snd_ssthresh; 526 tp->snd_recover_prev = tp->snd_recover; 527 if (IN_FASTRECOVERY(tp)) 528 tp->t_flags |= TF_WASFRECOVERY; 529 else 530 tp->t_flags &= ~TF_WASFRECOVERY; 531 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 532 } 533 tcpstat.tcps_rexmttimeo++; 534 if (tp->t_state == TCPS_SYN_SENT) 535 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; 536 else 537 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 538 TCPT_RANGESET(tp->t_rxtcur, rexmt, 539 tp->t_rttmin, TCPTV_REXMTMAX); 540 /* 541 * Disable rfc1323 and rfc1644 if we havn't got any response to 542 * our third SYN to work-around some broken terminal servers 543 * (most of which have hopefully been retired) that have bad VJ 544 * header compression code which trashes TCP segments containing 545 * unknown-to-them TCP options. 546 */ 547 if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) 548 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC); 549 /* 550 * If we backed off this far, our srtt estimate is probably bogus. 551 * Clobber it so we'll take the next rtt measurement as our srtt; 552 * move the current srtt into rttvar to keep the current 553 * retransmit times until then. 554 */ 555 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 556 #ifdef INET6 557 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 558 in6_losing(tp->t_inpcb); 559 else 560 #endif 561 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 562 tp->t_srtt = 0; 563 } 564 tp->snd_nxt = tp->snd_una; 565 tp->snd_recover = tp->snd_max; 566 /* 567 * Force a segment to be sent. 568 */ 569 tp->t_flags |= TF_ACKNOW; 570 /* 571 * If timing a segment in this window, stop the timer. 572 */ 573 tp->t_rtttime = 0; 574 /* 575 * Close the congestion window down to one segment 576 * (we'll open it by one segment for each ack we get). 577 * Since we probably have a window's worth of unacked 578 * data accumulated, this "slow start" keeps us from 579 * dumping all that data as back-to-back packets (which 580 * might overwhelm an intermediate gateway). 581 * 582 * There are two phases to the opening: Initially we 583 * open by one mss on each ack. This makes the window 584 * size increase exponentially with time. If the 585 * window is larger than the path can handle, this 586 * exponential growth results in dropped packet(s) 587 * almost immediately. To get more time between 588 * drops but still "push" the network to take advantage 589 * of improving conditions, we switch from exponential 590 * to linear window opening at some threshhold size. 591 * For a threshhold, we use half the current window 592 * size, truncated to a multiple of the mss. 593 * 594 * (the minimum cwnd that will give us exponential 595 * growth is 2 mss. We don't allow the threshhold 596 * to go below this.) 597 */ 598 { 599 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; 600 if (win < 2) 601 win = 2; 602 tp->snd_cwnd = tp->t_maxseg; 603 tp->snd_ssthresh = win * tp->t_maxseg; 604 tp->t_dupacks = 0; 605 } 606 EXIT_FASTRECOVERY(tp); 607 (void) tcp_output(tp); 608 609 out: 610 #ifdef TCPDEBUG 611 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 612 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 613 PRU_SLOWTIMO); 614 #endif 615 if (tp) 616 INP_UNLOCK(inp); 617 if (headlocked) 618 INP_INFO_WUNLOCK(&tcbinfo); 619 splx(s); 620 } 621