1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 30 * $FreeBSD$ 31 */ 32 33 #include "opt_inet6.h" 34 #include "opt_tcpdebug.h" 35 #include "opt_tcp_sack.h" 36 37 #include <sys/param.h> 38 #include <sys/kernel.h> 39 #include <sys/lock.h> 40 #include <sys/mbuf.h> 41 #include <sys/mutex.h> 42 #include <sys/protosw.h> 43 #include <sys/socket.h> 44 #include <sys/socketvar.h> 45 #include <sys/sysctl.h> 46 #include <sys/systm.h> 47 48 #include <net/route.h> 49 50 #include <netinet/in.h> 51 #include <netinet/in_pcb.h> 52 #include <netinet/in_systm.h> 53 #ifdef INET6 54 #include <netinet6/in6_pcb.h> 55 #endif 56 #include <netinet/ip_var.h> 57 #include <netinet/tcp.h> 58 #include <netinet/tcp_fsm.h> 59 #include <netinet/tcp_timer.h> 60 #include <netinet/tcp_var.h> 61 #include <netinet/tcpip.h> 62 #ifdef TCPDEBUG 63 #include <netinet/tcp_debug.h> 64 #endif 65 66 static int 67 sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) 68 { 69 int error, s, tt; 70 71 tt = *(int *)oidp->oid_arg1; 72 s = (int)((int64_t)tt * 1000 / hz); 73 74 error = sysctl_handle_int(oidp, &s, 0, req); 75 if (error || !req->newptr) 76 return (error); 77 78 tt = (int)((int64_t)s * hz / 1000); 79 if (tt < 1) 80 return (EINVAL); 81 82 *(int *)oidp->oid_arg1 = tt; 83 return (0); 84 } 85 86 int tcp_keepinit; 87 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 88 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", ""); 89 90 int tcp_keepidle; 91 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 92 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", ""); 93 94 int tcp_keepintvl; 95 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 96 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", ""); 97 98 int tcp_delacktime; 99 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, 100 CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 101 "Time before a delayed ACK is sent"); 102 103 int tcp_msl; 104 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 105 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 106 107 int tcp_rexmit_min; 108 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 109 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); 110 111 int tcp_rexmit_slop; 112 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 113 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); 114 115 static int always_keepalive = 1; 116 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 117 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 118 119 static int tcp_keepcnt = TCPTV_KEEPCNT; 120 /* max idle probes */ 121 int tcp_maxpersistidle; 122 /* max idle time in persist */ 123 int tcp_maxidle; 124 125 /* 126 * Tcp protocol timeout routine called every 500 ms. 127 * Updates timestamps used for TCP 128 * causes finite state machine actions if timers expire. 129 */ 130 void 131 tcp_slowtimo() 132 { 133 134 tcp_maxidle = tcp_keepcnt * tcp_keepintvl; 135 INP_INFO_WLOCK(&tcbinfo); 136 (void) tcp_timer_2msl_tw(0); 137 INP_INFO_WUNLOCK(&tcbinfo); 138 } 139 140 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 141 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 142 143 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 144 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 145 146 static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 147 148 /* 149 * TCP timer processing. 150 */ 151 152 void 153 tcp_timer_delack(xtp) 154 void *xtp; 155 { 156 struct tcpcb *tp = xtp; 157 struct inpcb *inp; 158 159 INP_INFO_RLOCK(&tcbinfo); 160 inp = tp->t_inpcb; 161 if (inp == NULL) { 162 INP_INFO_RUNLOCK(&tcbinfo); 163 return; 164 } 165 INP_LOCK(inp); 166 INP_INFO_RUNLOCK(&tcbinfo); 167 if (callout_pending(tp->tt_delack) || !callout_active(tp->tt_delack)) { 168 INP_UNLOCK(inp); 169 return; 170 } 171 callout_deactivate(tp->tt_delack); 172 173 tp->t_flags |= TF_ACKNOW; 174 tcpstat.tcps_delack++; 175 (void) tcp_output(tp); 176 INP_UNLOCK(inp); 177 } 178 179 void 180 tcp_timer_2msl(xtp) 181 void *xtp; 182 { 183 struct tcpcb *tp = xtp; 184 struct inpcb *inp; 185 #ifdef TCPDEBUG 186 int ostate; 187 188 ostate = tp->t_state; 189 #endif 190 INP_INFO_WLOCK(&tcbinfo); 191 inp = tp->t_inpcb; 192 if (inp == NULL) { 193 INP_INFO_WUNLOCK(&tcbinfo); 194 return; 195 } 196 INP_LOCK(inp); 197 tcp_free_sackholes(tp); 198 if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) { 199 INP_UNLOCK(tp->t_inpcb); 200 INP_INFO_WUNLOCK(&tcbinfo); 201 return; 202 } 203 callout_deactivate(tp->tt_2msl); 204 /* 205 * 2 MSL timeout in shutdown went off. If we're closed but 206 * still waiting for peer to close and connection has been idle 207 * too long, or if 2MSL time is up from TIME_WAIT, delete connection 208 * control block. Otherwise, check again in a bit. 209 */ 210 if (tp->t_state != TCPS_TIME_WAIT && 211 (ticks - tp->t_rcvtime) <= tcp_maxidle) 212 callout_reset(tp->tt_2msl, tcp_keepintvl, 213 tcp_timer_2msl, tp); 214 else 215 tp = tcp_close(tp); 216 217 #ifdef TCPDEBUG 218 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 219 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 220 PRU_SLOWTIMO); 221 #endif 222 if (tp) 223 INP_UNLOCK(inp); 224 INP_INFO_WUNLOCK(&tcbinfo); 225 } 226 227 /* 228 * The timed wait lists contain references to each of the TCP sessions 229 * currently TIME_WAIT state. The list pointers, including the list pointers 230 * in each tcptw structure, are protected using the global tcbinfo lock, 231 * which must be held over list iteration and modification. 232 */ 233 struct twlist { 234 LIST_HEAD(, tcptw) tw_list; 235 struct tcptw tw_tail; 236 }; 237 #define TWLIST_NLISTS 2 238 static struct twlist twl_2msl[TWLIST_NLISTS]; 239 static struct twlist *tw_2msl_list[] = { &twl_2msl[0], &twl_2msl[1], NULL }; 240 241 void 242 tcp_timer_init(void) 243 { 244 int i; 245 struct twlist *twl; 246 247 for (i = 0; i < TWLIST_NLISTS; i++) { 248 twl = &twl_2msl[i]; 249 LIST_INIT(&twl->tw_list); 250 LIST_INSERT_HEAD(&twl->tw_list, &twl->tw_tail, tw_2msl); 251 } 252 } 253 254 void 255 tcp_timer_2msl_reset(struct tcptw *tw, int timeo) 256 { 257 int i; 258 struct tcptw *tw_tail; 259 260 INP_INFO_WLOCK_ASSERT(&tcbinfo); 261 INP_LOCK_ASSERT(tw->tw_inpcb); 262 if (tw->tw_time != 0) 263 LIST_REMOVE(tw, tw_2msl); 264 tw->tw_time = timeo + ticks; 265 i = timeo > tcp_msl ? 1 : 0; 266 tw_tail = &twl_2msl[i].tw_tail; 267 LIST_INSERT_BEFORE(tw_tail, tw, tw_2msl); 268 } 269 270 void 271 tcp_timer_2msl_stop(struct tcptw *tw) 272 { 273 274 INP_INFO_WLOCK_ASSERT(&tcbinfo); 275 if (tw->tw_time != 0) 276 LIST_REMOVE(tw, tw_2msl); 277 } 278 279 struct tcptw * 280 tcp_timer_2msl_tw(int reuse) 281 { 282 struct tcptw *tw, *tw_tail; 283 struct twlist *twl; 284 int i; 285 286 INP_INFO_WLOCK_ASSERT(&tcbinfo); 287 for (i = 0; i < 2; i++) { 288 twl = tw_2msl_list[i]; 289 tw_tail = &twl->tw_tail; 290 for (;;) { 291 tw = LIST_FIRST(&twl->tw_list); 292 if (tw == tw_tail || (!reuse && tw->tw_time > ticks)) 293 break; 294 INP_LOCK(tw->tw_inpcb); 295 if (tcp_twclose(tw, reuse) != NULL) 296 return (tw); 297 } 298 } 299 return (NULL); 300 } 301 302 void 303 tcp_timer_keep(xtp) 304 void *xtp; 305 { 306 struct tcpcb *tp = xtp; 307 struct tcptemp *t_template; 308 int s; 309 struct inpcb *inp; 310 #ifdef TCPDEBUG 311 int ostate; 312 313 ostate = tp->t_state; 314 #endif 315 s = splnet(); 316 INP_INFO_WLOCK(&tcbinfo); 317 inp = tp->t_inpcb; 318 if (!inp) { 319 INP_INFO_WUNLOCK(&tcbinfo); 320 splx(s); 321 return; 322 } 323 INP_LOCK(inp); 324 if (callout_pending(tp->tt_keep) || !callout_active(tp->tt_keep)) { 325 INP_UNLOCK(inp); 326 INP_INFO_WUNLOCK(&tcbinfo); 327 splx(s); 328 return; 329 } 330 callout_deactivate(tp->tt_keep); 331 /* 332 * Keep-alive timer went off; send something 333 * or drop connection if idle for too long. 334 */ 335 tcpstat.tcps_keeptimeo++; 336 if (tp->t_state < TCPS_ESTABLISHED) 337 goto dropit; 338 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 339 tp->t_state <= TCPS_CLOSING) { 340 if ((ticks - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle) 341 goto dropit; 342 /* 343 * Send a packet designed to force a response 344 * if the peer is up and reachable: 345 * either an ACK if the connection is still alive, 346 * or an RST if the peer has closed the connection 347 * due to timeout or reboot. 348 * Using sequence number tp->snd_una-1 349 * causes the transmitted zero-length segment 350 * to lie outside the receive window; 351 * by the protocol spec, this requires the 352 * correspondent TCP to respond. 353 */ 354 tcpstat.tcps_keepprobe++; 355 t_template = tcpip_maketemplate(inp); 356 if (t_template) { 357 tcp_respond(tp, t_template->tt_ipgen, 358 &t_template->tt_t, (struct mbuf *)NULL, 359 tp->rcv_nxt, tp->snd_una - 1, 0); 360 (void) m_free(dtom(t_template)); 361 } 362 callout_reset(tp->tt_keep, tcp_keepintvl, tcp_timer_keep, tp); 363 } else 364 callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); 365 366 #ifdef TCPDEBUG 367 if (inp->inp_socket->so_options & SO_DEBUG) 368 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 369 PRU_SLOWTIMO); 370 #endif 371 INP_UNLOCK(inp); 372 INP_INFO_WUNLOCK(&tcbinfo); 373 splx(s); 374 return; 375 376 dropit: 377 tcpstat.tcps_keepdrops++; 378 tp = tcp_drop(tp, ETIMEDOUT); 379 380 #ifdef TCPDEBUG 381 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 382 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 383 PRU_SLOWTIMO); 384 #endif 385 if (tp) 386 INP_UNLOCK(tp->t_inpcb); 387 INP_INFO_WUNLOCK(&tcbinfo); 388 splx(s); 389 } 390 391 void 392 tcp_timer_persist(xtp) 393 void *xtp; 394 { 395 struct tcpcb *tp = xtp; 396 int s; 397 struct inpcb *inp; 398 #ifdef TCPDEBUG 399 int ostate; 400 401 ostate = tp->t_state; 402 #endif 403 s = splnet(); 404 INP_INFO_WLOCK(&tcbinfo); 405 inp = tp->t_inpcb; 406 if (!inp) { 407 INP_INFO_WUNLOCK(&tcbinfo); 408 splx(s); 409 return; 410 } 411 INP_LOCK(inp); 412 if (callout_pending(tp->tt_persist) || !callout_active(tp->tt_persist)){ 413 INP_UNLOCK(inp); 414 INP_INFO_WUNLOCK(&tcbinfo); 415 splx(s); 416 return; 417 } 418 callout_deactivate(tp->tt_persist); 419 /* 420 * Persistance timer into zero window. 421 * Force a byte to be output, if possible. 422 */ 423 tcpstat.tcps_persisttimeo++; 424 /* 425 * Hack: if the peer is dead/unreachable, we do not 426 * time out if the window is closed. After a full 427 * backoff, drop the connection if the idle time 428 * (no responses to probes) reaches the maximum 429 * backoff that we would use if retransmitting. 430 */ 431 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 432 ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle || 433 (ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 434 tcpstat.tcps_persistdrop++; 435 tp = tcp_drop(tp, ETIMEDOUT); 436 goto out; 437 } 438 tcp_setpersist(tp); 439 tp->t_force = 1; 440 (void) tcp_output(tp); 441 tp->t_force = 0; 442 443 out: 444 #ifdef TCPDEBUG 445 if (tp && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 446 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 447 PRU_SLOWTIMO); 448 #endif 449 if (tp) 450 INP_UNLOCK(inp); 451 INP_INFO_WUNLOCK(&tcbinfo); 452 splx(s); 453 } 454 455 void 456 tcp_timer_rexmt(xtp) 457 void *xtp; 458 { 459 struct tcpcb *tp = xtp; 460 int s; 461 int rexmt; 462 int headlocked; 463 struct inpcb *inp; 464 #ifdef TCPDEBUG 465 int ostate; 466 467 ostate = tp->t_state; 468 #endif 469 s = splnet(); 470 INP_INFO_WLOCK(&tcbinfo); 471 headlocked = 1; 472 inp = tp->t_inpcb; 473 if (!inp) { 474 INP_INFO_WUNLOCK(&tcbinfo); 475 splx(s); 476 return; 477 } 478 INP_LOCK(inp); 479 if (callout_pending(tp->tt_rexmt) || !callout_active(tp->tt_rexmt)) { 480 INP_UNLOCK(inp); 481 INP_INFO_WUNLOCK(&tcbinfo); 482 splx(s); 483 return; 484 } 485 callout_deactivate(tp->tt_rexmt); 486 tcp_free_sackholes(tp); 487 /* 488 * Retransmission timer went off. Message has not 489 * been acked within retransmit interval. Back off 490 * to a longer retransmit interval and retransmit one segment. 491 */ 492 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 493 tp->t_rxtshift = TCP_MAXRXTSHIFT; 494 tcpstat.tcps_timeoutdrop++; 495 tp = tcp_drop(tp, tp->t_softerror ? 496 tp->t_softerror : ETIMEDOUT); 497 goto out; 498 } 499 INP_INFO_WUNLOCK(&tcbinfo); 500 headlocked = 0; 501 if (tp->t_rxtshift == 1) { 502 /* 503 * first retransmit; record ssthresh and cwnd so they can 504 * be recovered if this turns out to be a "bad" retransmit. 505 * A retransmit is considered "bad" if an ACK for this 506 * segment is received within RTT/2 interval; the assumption 507 * here is that the ACK was already in flight. See 508 * "On Estimating End-to-End Network Path Properties" by 509 * Allman and Paxson for more details. 510 */ 511 tp->snd_cwnd_prev = tp->snd_cwnd; 512 tp->snd_ssthresh_prev = tp->snd_ssthresh; 513 tp->snd_recover_prev = tp->snd_recover; 514 if (IN_FASTRECOVERY(tp)) 515 tp->t_flags |= TF_WASFRECOVERY; 516 else 517 tp->t_flags &= ~TF_WASFRECOVERY; 518 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 519 } 520 tcpstat.tcps_rexmttimeo++; 521 if (tp->t_state == TCPS_SYN_SENT) 522 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; 523 else 524 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 525 TCPT_RANGESET(tp->t_rxtcur, rexmt, 526 tp->t_rttmin, TCPTV_REXMTMAX); 527 /* 528 * Disable rfc1323 if we havn't got any response to 529 * our third SYN to work-around some broken terminal servers 530 * (most of which have hopefully been retired) that have bad VJ 531 * header compression code which trashes TCP segments containing 532 * unknown-to-them TCP options. 533 */ 534 if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) 535 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP); 536 /* 537 * If we backed off this far, our srtt estimate is probably bogus. 538 * Clobber it so we'll take the next rtt measurement as our srtt; 539 * move the current srtt into rttvar to keep the current 540 * retransmit times until then. 541 */ 542 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 543 #ifdef INET6 544 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 545 in6_losing(tp->t_inpcb); 546 else 547 #endif 548 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 549 tp->t_srtt = 0; 550 } 551 tp->snd_nxt = tp->snd_una; 552 tp->snd_recover = tp->snd_max; 553 /* 554 * Force a segment to be sent. 555 */ 556 tp->t_flags |= TF_ACKNOW; 557 /* 558 * If timing a segment in this window, stop the timer. 559 */ 560 tp->t_rtttime = 0; 561 /* 562 * Close the congestion window down to one segment 563 * (we'll open it by one segment for each ack we get). 564 * Since we probably have a window's worth of unacked 565 * data accumulated, this "slow start" keeps us from 566 * dumping all that data as back-to-back packets (which 567 * might overwhelm an intermediate gateway). 568 * 569 * There are two phases to the opening: Initially we 570 * open by one mss on each ack. This makes the window 571 * size increase exponentially with time. If the 572 * window is larger than the path can handle, this 573 * exponential growth results in dropped packet(s) 574 * almost immediately. To get more time between 575 * drops but still "push" the network to take advantage 576 * of improving conditions, we switch from exponential 577 * to linear window opening at some threshhold size. 578 * For a threshhold, we use half the current window 579 * size, truncated to a multiple of the mss. 580 * 581 * (the minimum cwnd that will give us exponential 582 * growth is 2 mss. We don't allow the threshhold 583 * to go below this.) 584 */ 585 { 586 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; 587 if (win < 2) 588 win = 2; 589 tp->snd_cwnd = tp->t_maxseg; 590 tp->snd_ssthresh = win * tp->t_maxseg; 591 tp->t_dupacks = 0; 592 } 593 EXIT_FASTRECOVERY(tp); 594 (void) tcp_output(tp); 595 596 out: 597 #ifdef TCPDEBUG 598 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 599 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 600 PRU_SLOWTIMO); 601 #endif 602 if (tp) 603 INP_UNLOCK(inp); 604 if (headlocked) 605 INP_INFO_WUNLOCK(&tcbinfo); 606 splx(s); 607 } 608