1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 * $FreeBSD$
 */

#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_tcp_sack.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif

/*
 * Sysctl handler shared by the timer tunables below.  The int that
 * oid_arg1 points at is kept in ticks; the value exchanged with the
 * requester is that value scaled by 1000 / hz (i.e. milliseconds,
 * assuming hz is the number of ticks per second).
 *
 * On a read-only request (req->newptr is NULL) or when
 * sysctl_handle_int() fails, its result is returned unchanged.
 * On a write, the new value is scaled back by hz / 1000; a result
 * smaller than one tick is rejected with EINVAL, otherwise it is
 * stored through oid_arg1 and 0 is returned.
 */
static int
sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
{
	int error, s, tt;

	/* 64-bit intermediate so the multiplication cannot overflow an int. */
	tt = *(int *)oidp->oid_arg1;
	s = (int)((int64_t)tt * 1000 / hz);

	error = sysctl_handle_int(oidp, &s, 0, req);
	if (error || !req->newptr)
		return (error);

	tt = (int)((int64_t)s * hz / 1000);
	if (tt < 1)
		return (EINVAL);

	*(int *)oidp->oid_arg1 = tt;
	return (0);
}

/*
 * Timer tunables.  Each variable registered with sysctl_msec_to_ticks
 * is stored in ticks and converted by that handler on access.
 */
int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
    CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout");

int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop");

static int	always_keepalive = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");

static int	tcp_keepcnt = TCPTV_KEEPCNT;
	/* max idle probes */
int	tcp_maxpersistidle;
	/* max idle time in persist */
int	tcp_maxidle;

/*
 * Tcp protocol timeout routine called every 500 ms.
 * Recomputes tcp_maxidle as tcp_keepcnt * tcp_keepintvl and then,
 * with the tcbinfo write lock held, calls tcp_timer_2msl_tw(0) to
 * process the timed wait lists.
 */
void
tcp_slowtimo()
{

	tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
	INP_INFO_WLOCK(&tcbinfo);
	(void) tcp_timer_2msl_tw(0);
	INP_INFO_WUNLOCK(&tcbinfo);
}

/*
 * Retransmit backoff multipliers, indexed by t_rxtshift.
 * tcp_timer_rexmt() uses tcp_syn_backoff[] while in SYN_SENT and
 * tcp_backoff[] in every other state.
 */
int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */

/*
 * TCP timer processing.
150 */ 151 152 void 153 tcp_timer_delack(xtp) 154 void *xtp; 155 { 156 struct tcpcb *tp = xtp; 157 struct inpcb *inp; 158 159 INP_INFO_RLOCK(&tcbinfo); 160 inp = tp->t_inpcb; 161 if (inp == NULL) { 162 INP_INFO_RUNLOCK(&tcbinfo); 163 return; 164 } 165 INP_LOCK(inp); 166 INP_INFO_RUNLOCK(&tcbinfo); 167 if (callout_pending(tp->tt_delack) || !callout_active(tp->tt_delack)) { 168 INP_UNLOCK(inp); 169 return; 170 } 171 callout_deactivate(tp->tt_delack); 172 173 tp->t_flags |= TF_ACKNOW; 174 tcpstat.tcps_delack++; 175 (void) tcp_output(tp); 176 INP_UNLOCK(inp); 177 } 178 179 void 180 tcp_timer_2msl(xtp) 181 void *xtp; 182 { 183 struct tcpcb *tp = xtp; 184 struct inpcb *inp; 185 #ifdef TCPDEBUG 186 int ostate; 187 188 ostate = tp->t_state; 189 #endif 190 INP_INFO_WLOCK(&tcbinfo); 191 inp = tp->t_inpcb; 192 if (inp == NULL) { 193 INP_INFO_WUNLOCK(&tcbinfo); 194 return; 195 } 196 INP_LOCK(inp); 197 tcp_free_sackholes(tp); 198 if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) { 199 INP_UNLOCK(tp->t_inpcb); 200 INP_INFO_WUNLOCK(&tcbinfo); 201 return; 202 } 203 callout_deactivate(tp->tt_2msl); 204 /* 205 * 2 MSL timeout in shutdown went off. If we're closed but 206 * still waiting for peer to close and connection has been idle 207 * too long, or if 2MSL time is up from TIME_WAIT, delete connection 208 * control block. Otherwise, check again in a bit. 209 */ 210 if (tp->t_state != TCPS_TIME_WAIT && 211 (ticks - tp->t_rcvtime) <= tcp_maxidle) 212 callout_reset(tp->tt_2msl, tcp_keepintvl, 213 tcp_timer_2msl, tp); 214 else 215 tp = tcp_close(tp); 216 217 #ifdef TCPDEBUG 218 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 219 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 220 PRU_SLOWTIMO); 221 #endif 222 if (tp) 223 INP_UNLOCK(inp); 224 INP_INFO_WUNLOCK(&tcbinfo); 225 } 226 227 /* 228 * The timed wait lists contain references to each of the TCP sessions 229 * currently TIME_WAIT state. 
The list pointers, including the list pointers 230 * in each tcptw structure, are protected using the global tcbinfo lock, 231 * which must be held over list iteration and modification. 232 */ 233 struct twlist { 234 LIST_HEAD(, tcptw) tw_list; 235 struct tcptw tw_tail; 236 }; 237 #define TWLIST_NLISTS 2 238 static struct twlist twl_2msl[TWLIST_NLISTS]; 239 static struct twlist *tw_2msl_list[] = { &twl_2msl[0], &twl_2msl[1], NULL }; 240 241 void 242 tcp_timer_init(void) 243 { 244 int i; 245 struct twlist *twl; 246 247 for (i = 0; i < TWLIST_NLISTS; i++) { 248 twl = &twl_2msl[i]; 249 LIST_INIT(&twl->tw_list); 250 LIST_INSERT_HEAD(&twl->tw_list, &twl->tw_tail, tw_2msl); 251 } 252 } 253 254 void 255 tcp_timer_2msl_reset(struct tcptw *tw, int timeo) 256 { 257 int i; 258 struct tcptw *tw_tail; 259 260 INP_INFO_WLOCK_ASSERT(&tcbinfo); 261 INP_LOCK_ASSERT(tw->tw_inpcb); 262 if (tw->tw_time != 0) 263 LIST_REMOVE(tw, tw_2msl); 264 tw->tw_time = timeo + ticks; 265 i = timeo > tcp_msl ? 1 : 0; 266 tw_tail = &twl_2msl[i].tw_tail; 267 LIST_INSERT_BEFORE(tw_tail, tw, tw_2msl); 268 } 269 270 void 271 tcp_timer_2msl_stop(struct tcptw *tw) 272 { 273 274 INP_INFO_WLOCK_ASSERT(&tcbinfo); 275 if (tw->tw_time != 0) 276 LIST_REMOVE(tw, tw_2msl); 277 } 278 279 struct tcptw * 280 tcp_timer_2msl_tw(int reuse) 281 { 282 struct tcptw *tw, *tw_tail; 283 struct twlist *twl; 284 int i; 285 286 INP_INFO_WLOCK_ASSERT(&tcbinfo); 287 for (i = 0; i < 2; i++) { 288 twl = tw_2msl_list[i]; 289 tw_tail = &twl->tw_tail; 290 for (;;) { 291 tw = LIST_FIRST(&twl->tw_list); 292 if (tw == tw_tail || (!reuse && tw->tw_time > ticks)) 293 break; 294 INP_LOCK(tw->tw_inpcb); 295 if (tcp_twclose(tw, reuse) != NULL) 296 return (tw); 297 } 298 } 299 return (NULL); 300 } 301 302 void 303 tcp_timer_keep(xtp) 304 void *xtp; 305 { 306 struct tcpcb *tp = xtp; 307 struct tcptemp *t_template; 308 struct inpcb *inp; 309 #ifdef TCPDEBUG 310 int ostate; 311 312 ostate = tp->t_state; 313 #endif 314 INP_INFO_WLOCK(&tcbinfo); 315 
inp = tp->t_inpcb; 316 if (!inp) { 317 INP_INFO_WUNLOCK(&tcbinfo); 318 return; 319 } 320 INP_LOCK(inp); 321 if (callout_pending(tp->tt_keep) || !callout_active(tp->tt_keep)) { 322 INP_UNLOCK(inp); 323 INP_INFO_WUNLOCK(&tcbinfo); 324 return; 325 } 326 callout_deactivate(tp->tt_keep); 327 /* 328 * Keep-alive timer went off; send something 329 * or drop connection if idle for too long. 330 */ 331 tcpstat.tcps_keeptimeo++; 332 if (tp->t_state < TCPS_ESTABLISHED) 333 goto dropit; 334 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 335 tp->t_state <= TCPS_CLOSING) { 336 if ((ticks - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle) 337 goto dropit; 338 /* 339 * Send a packet designed to force a response 340 * if the peer is up and reachable: 341 * either an ACK if the connection is still alive, 342 * or an RST if the peer has closed the connection 343 * due to timeout or reboot. 344 * Using sequence number tp->snd_una-1 345 * causes the transmitted zero-length segment 346 * to lie outside the receive window; 347 * by the protocol spec, this requires the 348 * correspondent TCP to respond. 
349 */ 350 tcpstat.tcps_keepprobe++; 351 t_template = tcpip_maketemplate(inp); 352 if (t_template) { 353 tcp_respond(tp, t_template->tt_ipgen, 354 &t_template->tt_t, (struct mbuf *)NULL, 355 tp->rcv_nxt, tp->snd_una - 1, 0); 356 (void) m_free(dtom(t_template)); 357 } 358 callout_reset(tp->tt_keep, tcp_keepintvl, tcp_timer_keep, tp); 359 } else 360 callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); 361 362 #ifdef TCPDEBUG 363 if (inp->inp_socket->so_options & SO_DEBUG) 364 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 365 PRU_SLOWTIMO); 366 #endif 367 INP_UNLOCK(inp); 368 INP_INFO_WUNLOCK(&tcbinfo); 369 return; 370 371 dropit: 372 tcpstat.tcps_keepdrops++; 373 tp = tcp_drop(tp, ETIMEDOUT); 374 375 #ifdef TCPDEBUG 376 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 377 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 378 PRU_SLOWTIMO); 379 #endif 380 if (tp) 381 INP_UNLOCK(tp->t_inpcb); 382 INP_INFO_WUNLOCK(&tcbinfo); 383 } 384 385 void 386 tcp_timer_persist(xtp) 387 void *xtp; 388 { 389 struct tcpcb *tp = xtp; 390 struct inpcb *inp; 391 #ifdef TCPDEBUG 392 int ostate; 393 394 ostate = tp->t_state; 395 #endif 396 INP_INFO_WLOCK(&tcbinfo); 397 inp = tp->t_inpcb; 398 if (!inp) { 399 INP_INFO_WUNLOCK(&tcbinfo); 400 return; 401 } 402 INP_LOCK(inp); 403 if (callout_pending(tp->tt_persist) || !callout_active(tp->tt_persist)){ 404 INP_UNLOCK(inp); 405 INP_INFO_WUNLOCK(&tcbinfo); 406 return; 407 } 408 callout_deactivate(tp->tt_persist); 409 /* 410 * Persistance timer into zero window. 411 * Force a byte to be output, if possible. 412 */ 413 tcpstat.tcps_persisttimeo++; 414 /* 415 * Hack: if the peer is dead/unreachable, we do not 416 * time out if the window is closed. After a full 417 * backoff, drop the connection if the idle time 418 * (no responses to probes) reaches the maximum 419 * backoff that we would use if retransmitting. 
420 */ 421 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 422 ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle || 423 (ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 424 tcpstat.tcps_persistdrop++; 425 tp = tcp_drop(tp, ETIMEDOUT); 426 goto out; 427 } 428 tcp_setpersist(tp); 429 tp->t_flags |= TF_FORCEDATA; 430 (void) tcp_output(tp); 431 tp->t_flags &= ~TF_FORCEDATA; 432 433 out: 434 #ifdef TCPDEBUG 435 if (tp && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 436 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 437 PRU_SLOWTIMO); 438 #endif 439 if (tp) 440 INP_UNLOCK(inp); 441 INP_INFO_WUNLOCK(&tcbinfo); 442 } 443 444 void 445 tcp_timer_rexmt(xtp) 446 void *xtp; 447 { 448 struct tcpcb *tp = xtp; 449 int rexmt; 450 int headlocked; 451 struct inpcb *inp; 452 #ifdef TCPDEBUG 453 int ostate; 454 455 ostate = tp->t_state; 456 #endif 457 INP_INFO_WLOCK(&tcbinfo); 458 headlocked = 1; 459 inp = tp->t_inpcb; 460 if (!inp) { 461 INP_INFO_WUNLOCK(&tcbinfo); 462 return; 463 } 464 INP_LOCK(inp); 465 if (callout_pending(tp->tt_rexmt) || !callout_active(tp->tt_rexmt)) { 466 INP_UNLOCK(inp); 467 INP_INFO_WUNLOCK(&tcbinfo); 468 return; 469 } 470 callout_deactivate(tp->tt_rexmt); 471 tcp_free_sackholes(tp); 472 /* 473 * Retransmission timer went off. Message has not 474 * been acked within retransmit interval. Back off 475 * to a longer retransmit interval and retransmit one segment. 476 */ 477 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 478 tp->t_rxtshift = TCP_MAXRXTSHIFT; 479 tcpstat.tcps_timeoutdrop++; 480 tp = tcp_drop(tp, tp->t_softerror ? 481 tp->t_softerror : ETIMEDOUT); 482 goto out; 483 } 484 INP_INFO_WUNLOCK(&tcbinfo); 485 headlocked = 0; 486 if (tp->t_rxtshift == 1) { 487 /* 488 * first retransmit; record ssthresh and cwnd so they can 489 * be recovered if this turns out to be a "bad" retransmit. 
490 * A retransmit is considered "bad" if an ACK for this 491 * segment is received within RTT/2 interval; the assumption 492 * here is that the ACK was already in flight. See 493 * "On Estimating End-to-End Network Path Properties" by 494 * Allman and Paxson for more details. 495 */ 496 tp->snd_cwnd_prev = tp->snd_cwnd; 497 tp->snd_ssthresh_prev = tp->snd_ssthresh; 498 tp->snd_recover_prev = tp->snd_recover; 499 if (IN_FASTRECOVERY(tp)) 500 tp->t_flags |= TF_WASFRECOVERY; 501 else 502 tp->t_flags &= ~TF_WASFRECOVERY; 503 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 504 } 505 tcpstat.tcps_rexmttimeo++; 506 if (tp->t_state == TCPS_SYN_SENT) 507 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; 508 else 509 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 510 TCPT_RANGESET(tp->t_rxtcur, rexmt, 511 tp->t_rttmin, TCPTV_REXMTMAX); 512 /* 513 * Disable rfc1323 if we havn't got any response to 514 * our third SYN to work-around some broken terminal servers 515 * (most of which have hopefully been retired) that have bad VJ 516 * header compression code which trashes TCP segments containing 517 * unknown-to-them TCP options. 518 */ 519 if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) 520 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP); 521 /* 522 * If we backed off this far, our srtt estimate is probably bogus. 523 * Clobber it so we'll take the next rtt measurement as our srtt; 524 * move the current srtt into rttvar to keep the current 525 * retransmit times until then. 526 */ 527 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 528 #ifdef INET6 529 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 530 in6_losing(tp->t_inpcb); 531 else 532 #endif 533 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 534 tp->t_srtt = 0; 535 } 536 tp->snd_nxt = tp->snd_una; 537 tp->snd_recover = tp->snd_max; 538 /* 539 * Force a segment to be sent. 540 */ 541 tp->t_flags |= TF_ACKNOW; 542 /* 543 * If timing a segment in this window, stop the timer. 
544 */ 545 tp->t_rtttime = 0; 546 /* 547 * Close the congestion window down to one segment 548 * (we'll open it by one segment for each ack we get). 549 * Since we probably have a window's worth of unacked 550 * data accumulated, this "slow start" keeps us from 551 * dumping all that data as back-to-back packets (which 552 * might overwhelm an intermediate gateway). 553 * 554 * There are two phases to the opening: Initially we 555 * open by one mss on each ack. This makes the window 556 * size increase exponentially with time. If the 557 * window is larger than the path can handle, this 558 * exponential growth results in dropped packet(s) 559 * almost immediately. To get more time between 560 * drops but still "push" the network to take advantage 561 * of improving conditions, we switch from exponential 562 * to linear window opening at some threshhold size. 563 * For a threshhold, we use half the current window 564 * size, truncated to a multiple of the mss. 565 * 566 * (the minimum cwnd that will give us exponential 567 * growth is 2 mss. We don't allow the threshhold 568 * to go below this.) 569 */ 570 { 571 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; 572 if (win < 2) 573 win = 2; 574 tp->snd_cwnd = tp->t_maxseg; 575 tp->snd_ssthresh = win * tp->t_maxseg; 576 tp->t_dupacks = 0; 577 } 578 EXIT_FASTRECOVERY(tp); 579 (void) tcp_output(tp); 580 581 out: 582 #ifdef TCPDEBUG 583 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 584 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 585 PRU_SLOWTIMO); 586 #endif 587 if (tp) 588 INP_UNLOCK(inp); 589 if (headlocked) 590 INP_INFO_WUNLOCK(&tcbinfo); 591 } 592