/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

int	tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I",
    "minimum persistence interval");

int	tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I",
    "maximum persistence interval");

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I",
    "time to establish connection");
connection"); 99 100 int tcp_keepidle; 101 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, 102 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 103 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", 104 "time before keepalive probes begin"); 105 106 int tcp_keepintvl; 107 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, 108 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 109 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", 110 "time between keepalive probes"); 111 112 int tcp_delacktime; 113 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, 114 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 115 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 116 "Time before a delayed ACK is sent"); 117 118 VNET_DEFINE(int, tcp_msl); 119 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, 120 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET, 121 &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I", 122 "Maximum segment lifetime"); 123 124 int tcp_rexmit_initial; 125 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, 126 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 127 &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I", 128 "Initial Retransmission Timeout"); 129 130 int tcp_rexmit_min; 131 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, 132 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 133 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 134 "Minimum Retransmission Timeout"); 135 136 int tcp_rexmit_slop; 137 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, 138 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 139 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 140 "Retransmission Timer Slop"); 141 142 VNET_DEFINE(int, tcp_always_keepalive) = 1; 143 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW, 144 &VNET_NAME(tcp_always_keepalive) , 0, 145 "Assume SO_KEEPALIVE on all TCP connections"); 146 147 int tcp_fast_finwait2_recycle = 0; 148 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 149 &tcp_fast_finwait2_recycle, 0, 150 "Recycle closed FIN_WAIT_2 connections faster"); 151 152 int tcp_finwait2_timeout; 153 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, 154 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 155 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", 156 "FIN-WAIT2 timeout"); 157 158 int tcp_keepcnt = TCPTV_KEEPCNT; 159 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 160 "Number of keepalive probes to send"); 161 162 /* max idle probes */ 163 int tcp_maxpersistidle; 164 165 int tcp_rexmit_drop_options = 0; 166 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 167 &tcp_rexmit_drop_options, 0, 168 "Drop TCP options from 3rd and later retransmitted SYN"); 169 170 int tcp_maxunacktime = TCPTV_MAXUNACKTIME; 171 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime, 172 CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_NEEDGIANT, 173 &tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I", 174 "Maximum time (in ms) that a session can linger without making progress"); 175 176 VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 177 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 178 CTLFLAG_RW|CTLFLAG_VNET, 179 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 180 "Path MTU Discovery Black Hole Detection Enabled"); 181 182 #ifdef INET 183 VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 184 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 185 CTLFLAG_RW|CTLFLAG_VNET, 186 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 187 "Path MTU Discovery Black Hole Detection lowered MSS"); 188 #endif 189 190 #ifdef INET6 191 VNET_DEFINE(int, 

#ifdef INET6
VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

#ifdef RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

	if (per_cpu_timers) {
#ifdef RSS
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
#endif
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (!CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	} else {
		return (0);
	}
}

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int	tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */

/*
 * TCP timer processing.
 */

void
tcp_timer_delack(void *xtp)
{
	struct epoch_tracker et;
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	NET_EPOCH_ENTER(et);
	(void) tcp_output_unlock(tp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

/*
 * Call tcp_close() from a callout context.
 */
static void
tcp_timer_close(struct tcpcb *tp)
{
	struct epoch_tracker et;
	struct inpcb *inp = tp->t_inpcb;

	INP_WLOCK_ASSERT(inp);

	NET_EPOCH_ENTER(et);
	tp = tcp_close(tp);
	NET_EPOCH_EXIT(et);
	if (tp != NULL)
		INP_WUNLOCK(inp);
}
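
/*
 * Note on the convention used by the wrapper above and the one below:
 * tcp_close() and tcp_drop() return NULL once they have disposed of the
 * tcpcb and released the inpcb lock themselves; a non-NULL return means
 * the caller still holds INP_WLOCK and must drop it, hence the
 * conditional unlock.
 */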

/*
 * Call tcp_drop() from a callout context.
 */
static void
tcp_timer_drop(struct tcpcb *tp)
{
	struct epoch_tracker et;
	struct inpcb *inp = tp->t_inpcb;

	INP_WLOCK_ASSERT(inp);

	NET_EPOCH_ENTER(et);
	tp = tcp_drop(tp, ETIMEDOUT);
	NET_EPOCH_EXIT(et);
	if (tp != NULL)
		INP_WUNLOCK(inp);
}

void
tcp_timer_2msl(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	tcp_log_end_status(tp, TCP_EI_STATUS_2MSL);
	tcp_free_sackholes(tp);
	if (callout_pending(&tp->t_timers->tt_2msl) ||
	    !callout_active(&tp->t_timers->tt_2msl)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_2msl);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * The 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for the peer to close and the connection has been
	 * idle too long, delete the connection control block.  Otherwise,
	 * check again in a bit.
	 *
	 * If fast recycling of FIN_WAIT_2 connections is enabled and the
	 * receiver has closed, there's no point in hanging onto a
	 * FIN_WAIT_2 socket.  Just close it, ignoring the fact that there
	 * were recent incoming segments.
	 */
	if (tp->t_state == TCPS_TIME_WAIT) {
		tcp_timer_close(tp);
		CURVNET_RESTORE();
		return;
	} else if (tp->t_state == TCPS_FIN_WAIT_2 &&
	    tcp_fast_finwait2_recycle && tp->t_inpcb->inp_socket &&
	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		tcp_timer_close(tp);
		CURVNET_RESTORE();
		return;
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_2msl,
			    TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
		} else {
			tcp_timer_close(tp);
			CURVNET_RESTORE();
			return;
		}
	}

#ifdef TCPDEBUG
	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);

	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}
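
/*
 * A worked instance of the scheduling above (a sketch, assuming the stock
 * defaults): an idle FIN_WAIT_2 connection is revisited every
 * TP_KEEPINTVL(tp) ticks until it has been quiet for TP_MAXIDLE(tp),
 * whereas with net.inet.tcp.fast_finwait2_recycle=1 a half-closed socket
 * is reaped as soon as the timer fires.
 */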

void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));

	/*
	 * Because we don't regularly reset the keepalive callout in
	 * the ESTABLISHED state, it may be that we don't actually need
	 * to send a keepalive yet.  If that occurs, schedule another
	 * call for the next time the keepalive timer might expire.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
		u_int idletime;

		idletime = ticks - tp->t_rcvtime;
		if (idletime < TP_KEEPIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_keep,
			    TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
			INP_WUNLOCK(inp);
			CURVNET_RESTORE();
			return;
		}
	}

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((V_tcp_always_keepalive ||
	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			NET_EPOCH_ENTER(et);
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			NET_EPOCH_EXIT(et);
			free(t_template, M_TEMP);
		}
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp);
	} else
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp);

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	NET_EPOCH_ENTER(et);
	tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	NET_EPOCH_EXIT(et);
	if (tp != NULL)
		INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}
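
/*
 * Worked example for the drop threshold above (assuming the stock values
 * from tcp_timer.h): with TP_KEEPIDLE() of 2 hours, TP_KEEPINTVL() of 75
 * seconds and tcp_keepcnt of 8, TP_MAXIDLE() is 8 * 75 = 600 seconds, so
 * an unresponsive peer is probed for 10 more minutes past the 2 hour idle
 * threshold before the connection is dropped.
 */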

/*
 * Has this session exceeded the maximum time without seeing a substantive
 * acknowledgement?  If so, return true; otherwise false.
 */
static bool
tcp_maxunacktime_check(struct tcpcb *tp)
{

	/* Are we tracking this timer for this session? */
	if (TP_MAXUNACKTIME(tp) == 0)
		return false;

	/* Do we have a current measurement? */
	if (tp->t_acktime == 0)
		return false;

	/* Are we within the acceptable range? */
	if (TSTMP_GT(TP_MAXUNACKTIME(tp) + tp->t_acktime, (u_int)ticks))
		return false;

	/* We exceeded the timer. */
	TCPSTAT_INC(tcps_progdrops);
	return true;
}

void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	bool progdrop;
	int outrv;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 * Also, drop the connection if we haven't been making
	 * progress.
	 */
	progdrop = tcp_maxunacktime_check(tp);
	if (progdrop || (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) {
		if (!progdrop)
			TCPSTAT_INC(tcps_persistdrop);
		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
		tcp_timer_drop(tp);
		CURVNET_RESTORE();
		return;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
		tcp_timer_drop(tp);
		CURVNET_RESTORE();
		return;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	NET_EPOCH_ENTER(et);
	outrv = tcp_output_nodrop(tp);
	tp->t_flags &= ~TF_FORCEDATA;

#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	(void) tcp_unlock_or_drop(tp, outrv);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

void
tcp_timer_rexmt(void *xtp)
{
	struct tcpcb *tp = xtp;
	CURVNET_SET(tp->t_vnet);
	int rexmt, outrv;
	struct inpcb *inp;
	struct epoch_tracker et;
	bool isipv6;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	tcp_free_sackholes(tp);
	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 *
	 * If we've either exceeded the maximum number of retransmissions,
	 * or we've gone long enough without making progress, then drop
	 * the session.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || tcp_maxunacktime_check(tp)) {
		if (tp->t_rxtshift > TCP_MAXRXTSHIFT)
			TCPSTAT_INC(tcps_timeoutdrop);
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
		tcp_timer_drop(tp);
		CURVNET_RESTORE();
		return;
	}
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * First retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
			tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		/*
		 * In the event that we've negotiated timestamps,
		 * badrxtwin will be set to the value that we set
		 * the retransmitted packet's to_tsval to by tcp_output.
		 */
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
	    tp->t_rttmin, TCPTV_REXMTMAX);

	/*
	 * We enter the path for PLMTUD if the connection is established or
	 * in the FIN_WAIT_1 state; the reason for the latter is that if the
	 * amount of data we send is very small, we could send it in a
	 * couple of packets and proceed straight to FIN.  In that case we
	 * won't catch the ESTABLISHED state.
	 */
#ifdef INET6
	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
#else
	isipv6 = false;
#endif
	if (((V_tcp_pmtud_blackhole_detect == 1) ||
	    (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
	    (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
	    ((tp->t_state == TCPS_ESTABLISHED) ||
	    (tp->t_state == TCPS_FIN_WAIT_1))) {
		if (tp->t_rxtshift == 1) {
			/*
			 * We enter blackhole detection after the first
			 * unsuccessful timer based retransmission.  Then
			 * we reduce the MSS up to two times, each candidate
			 * value getting two retransmission tries.  A
			 * candidate gets those two tries only if it
			 * actually lowers the MSS.
			 */
			tp->t_blackhole_enter = 2;
			tp->t_blackhole_exit = tp->t_blackhole_enter;
			if (isipv6) {
#ifdef INET6
				if (tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_v6mssdflt &&
				    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			} else {
#ifdef INET
				if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_mssdflt &&
				    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			}
		}
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= tp->t_blackhole_enter &&
		    tp->t_rxtshift < tp->t_blackhole_exit &&
		    (tp->t_rxtshift - tp->t_blackhole_enter) % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to a lower value than what we
			 *   negotiated with the peer.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
				/* Record that we may have found a black hole. */
				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
				/* Keep track of previous MSS. */
				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
			}
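
			/*
			 * Worked example (hypothetical numbers): for an
			 * IPv4 connection with t_maxseg 1460 and the
			 * defaults above, t_blackhole_exit is 2 + 2 + 2 = 6,
			 * so retransmits 2 and 3 probe with the blackhole
			 * MSS of 1200, retransmits 4 and 5 with the default
			 * minimum MSS, and from shift 6 onward the saved
			 * MSS is restored in the else branch below.
			 */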

			/*
			 * Reduce the MSS to the blackhole value or to the
			 * default in an attempt to retransmit.
			 */
#ifdef INET6
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss &&
			    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss &&
			    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole,
			 * so we restore the previous MSS and blackhole
			 * detection flags.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift >= tp->t_blackhole_exit)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, notify the L3 protocol that we're having
	 * connection problems.
	 */
	if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
		else
#endif
			in_losing(tp->t_inpcb);
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);
	NET_EPOCH_ENTER(et);
	outrv = tcp_output_nodrop(tp);
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	(void) tcp_unlock_or_drop(tp, outrv);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	callout_func_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	int cpu = inp_to_cpuid(inp);

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		f_callout = tcp_timer_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		f_callout = tcp_timer_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		f_callout = tcp_timer_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		f_callout = tcp_timer_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		f_callout = tcp_timer_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_activate) {
			tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	if (delta == 0) {
		callout_stop(t_callout);
	} else {
		callout_reset_on(t_callout, delta, f_callout, tp, cpu);
	}
}

int
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_active) {
			return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	return (callout_active(t_callout));
}
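
/*
 * Example usage (a minimal sketch mirroring callers elsewhere in this
 * file): arm the retransmit timer only if it is not already running, and
 * cancel it by passing a delta of 0:
 *
 *	if (!tcp_timer_active(tp, TT_REXMT))
 *		tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 *	...
 *	tcp_timer_activate(tp, TT_REXMT, 0);
 */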

/*
 * Stop the timer from running, and apply a flag
 * against the timer_flags that will force the
 * timer never to run.  The flag is needed to assure
 * a race does not leave it running and cause
 * the timer to possibly restart itself (keep and persist
 * especially do this).
 */
int
tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	uint32_t t_flags;

	switch (timer_type) {
	case TT_DELACK:
		t_flags = TT_DELACK_SUS;
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_flags = TT_REXMT_SUS;
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_flags = TT_PERSIST_SUS;
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_flags = TT_KEEP_SUS;
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_flags = TT_2MSL_SUS;
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
	tp->t_timers->tt_flags |= t_flags;
	return (callout_stop(t_callout));
}
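
/*
 * A minimal sketch of the intended pairing (assuming a caller, such as a
 * stack switch, that must keep a timer from running across a critical
 * section):
 *
 *	tcp_timer_suspend(tp, TT_REXMT);
 *	... work that must not race the retransmit timer ...
 *	tcp_timers_unsuspend(tp, TT_REXMT);
 */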

void
tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type)
{
	switch (timer_type) {
	case TT_DELACK:
		if (tp->t_timers->tt_flags & TT_DELACK_SUS) {
			tp->t_timers->tt_flags &= ~TT_DELACK_SUS;
			if (tp->t_flags & TF_DELACK) {
				/* Delayed ack timer should be up; activate it. */
				tp->t_flags &= ~TF_DELACK;
				tcp_timer_activate(tp, TT_DELACK,
				    tcp_delacktime);
			}
		}
		break;
	case TT_REXMT:
		if (tp->t_timers->tt_flags & TT_REXMT_SUS) {
			tp->t_timers->tt_flags &= ~TT_REXMT_SUS;
			if (SEQ_GT(tp->snd_max, tp->snd_una) &&
			    (tcp_timer_active((tp), TT_PERSIST) == 0) &&
			    tp->snd_wnd) {
				/* We have outstanding data; activate a timer. */
				tcp_timer_activate(tp, TT_REXMT,
				    tp->t_rxtcur);
			}
		}
		break;
	case TT_PERSIST:
		if (tp->t_timers->tt_flags & TT_PERSIST_SUS) {
			tp->t_timers->tt_flags &= ~TT_PERSIST_SUS;
			if (tp->snd_wnd == 0) {
				/* Activate the persist timer. */
				tp->t_rxtshift = 0;
				tcp_setpersist(tp);
			}
		}
		break;
	case TT_KEEP:
		if (tp->t_timers->tt_flags & TT_KEEP_SUS) {
			tp->t_timers->tt_flags &= ~TT_KEEP_SUS;
			tcp_timer_activate(tp, TT_KEEP,
			    TCPS_HAVEESTABLISHED(tp->t_state) ?
			    TP_KEEPIDLE(tp) : TP_KEEPINIT(tp));
		}
		break;
	case TT_2MSL:
		if (tp->t_timers->tt_flags & TT_2MSL_SUS) {
			tp->t_timers->tt_flags &= ~TT_2MSL_SUS;
			if ((tp->t_state == TCPS_FIN_WAIT_2) &&
			    ((tp->t_inpcb->inp_socket == NULL) ||
			    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE))) {
				/* Start the 2MSL timer. */
				tcp_timer_activate(tp, TT_2MSL,
				    (tcp_fast_finwait2_recycle) ?
				    tcp_finwait2_timeout : TP_MAXIDLE(tp));
			}
		}
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
}

static void
tcp_timer_discard(void *ptp)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	struct epoch_tracker et;

	tp = (struct tcpcb *)ptp;
	CURVNET_SET(tp->t_vnet);
	NET_EPOCH_ENTER(et);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL",
	    __func__, tp));
	INP_WLOCK(inp);
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0,
	    ("%s: tcpcb has to be stopped here", __func__));
	if (--tp->t_timers->tt_draincnt > 0 ||
	    tcp_freecb(tp) == false)
		INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	tp->t_timers->tt_flags |= TT_STOPPED;
	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_stop) {
			/*
			 * XXXrrs we need to look at this with the
			 * stop case below (flags).
			 */
			tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}

	if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
		/*
		 * Can't stop the callout; defer the actual tcpcb deletion
		 * to the last drain.  We do this using the async drain
		 * function and incrementing the count in tt_draincnt.
		 */
		tp->t_timers->tt_draincnt++;
	}
}